This article is from the NetEase Cloud Community.

Author: Wang Tao

Here are some examples of commonly used code, covering GET requests, POST requests (JSON and form data), and access with certificates.

GET request

import traceback

from tornado import gen
from tornado.curl_httpclient import CurlAsyncHTTPClient
from tornado.httpclient import HTTPRequest
from tornado.ioloop import IOLoop


@gen.coroutine
def fetch_url():
    """Fetch one URL with a GET request and print the status code and body.

    Runs as a Tornado coroutine. Any error raised while building or sending
    the request is printed as a traceback. The current IOLoop is stopped
    once the request has finished, whether it succeeded or failed, so a
    script that starts the loop only for this fetch does not hang on error.
    """
    try:
        c = CurlAsyncHTTPClient()  # define an HTTP client
        myheaders = {
            # NOTE: the host value is blank in the published snippet; set it
            # to the target host (or drop the header) before running.
            "Host": "",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Upgrade-Insecure-Requests": "1",
            # NOTE: the Chrome version number is missing in the published
            # snippet.
            "User-Agent": ("Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) "
                           "AppleWebKit/532.5 (KHTML, like Gecko) "
                           "Chrome/ Safari/532.5"),
            "Accept": ("text/html,application/xhtml+xml,application/xml;"
                       "q=0.9,image/webp,image/apng,*/*;q=0.8"),
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        }
        # NOTE: the URL is truncated in the published snippet; replace it
        # with the address to fetch.
        url = " _ ="

        req = HTTPRequest(url=url, method="GET", headers=myheaders,
                          follow_redirects=True, request_timeout=20,
                          connect_timeout=10)
        response = yield c.fetch(req)  # make the request
        print(response.code)
        print(response.body)
    except Exception:
        print(traceback.format_exc())
    finally:
        IOLoop.current().stop()  # stop the IOLoop

Request headers captured by Fiddler:

POST JSON data request

import json
import traceback

from tornado import gen
from tornado.curl_httpclient import CurlAsyncHTTPClient
from tornado.httpclient import HTTPRequest
from tornado.ioloop import IOLoop


@gen.coroutine
def fetch_url():
    """Grab a URL by POSTing JSON data, then print the status code and body.

    Runs as a Tornado coroutine. Any error raised while building or sending
    the request is printed as a traceback. The current IOLoop is stopped
    once the request has finished, whether it succeeded or failed.
    """
    try:
        c = CurlAsyncHTTPClient()  # define an HTTP client
        myheaders = {
            # NOTE: the host value is blank in the published snippet; set it
            # to the target host (or drop the header) before running.
            "Host": "",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Upgrade-Insecure-Requests": "1",
            # NOTE: the Chrome version number is missing in the published
            # snippet.
            "User-Agent": ("Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) "
                           "AppleWebKit/532.5 (KHTML, like Gecko) "
                           "Chrome/ Safari/532.5"),
            "Accept": ("text/html,application/xhtml+xml,application/xml;"
                       "q=0.9,image/webp,image/apng,*/*;q=0.8"),
            "Accept-Encoding": "gzip, deflate",
            "Content-Type": "application/json",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        }
        # NOTE: the host part of this URL is missing in the published
        # snippet; restore it before running.
        url = "Http:// type=1&s_from=input&query=%E4%BA%BA%E6%B0%91%E6%97%A5%E6%8A%A5&ie=utf8&_sug_=n&_sug_type_="
        body = json.dumps({"key1": "value1", "key2": "value2"})  # JSON data

        # The serialized payload has to be handed to the request explicitly;
        # a POST built without a body is rejected by the client.
        req = HTTPRequest(url=url, method="POST", headers=myheaders,
                          body=body, follow_redirects=True,
                          request_timeout=20, connect_timeout=10)
        response = yield c.fetch(req)  # make the request
        print(response.code)
        print(response.body)
    except Exception:
        print(traceback.format_exc())
    finally:
        IOLoop.current().stop()  # stop the IOLoop

Request headers captured by Fiddler:

POST Form data request

import traceback

from tornado import gen
from tornado.curl_httpclient import CurlAsyncHTTPClient
from tornado.httpclient import HTTPRequest
from tornado.ioloop import IOLoop


@gen.coroutine
def fetch_url():
    """Grab a URL by POSTing form data, then print the status code and body.

    Runs as a Tornado coroutine. Any error raised while building or sending
    the request is printed as a traceback. The current IOLoop is stopped
    once the request has finished, whether it succeeded or failed.
    """
    # urlencode lives in different modules on Python 2 and Python 3.
    try:
        from urllib import urlencode
    except ImportError:
        from urllib.parse import urlencode

    try:
        c = CurlAsyncHTTPClient()  # define an HTTP client
        myheaders = {
            # NOTE: the host value is blank in the published snippet; set it
            # to the target host (or drop the header) before running.
            "Host": "",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Upgrade-Insecure-Requests": "1",
            # NOTE: the Chrome version number is missing in the published
            # snippet.
            "User-Agent": ("Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) "
                           "AppleWebKit/532.5 (KHTML, like Gecko) "
                           "Chrome/ Safari/532.5"),
            "Accept": ("text/html,application/xhtml+xml,application/xml;"
                       "q=0.9,image/webp,image/apng,*/*;q=0.8"),
            "Accept-Encoding": "gzip, deflate",
            # The JSON "Content-Type" header used in the JSON example is
            # deliberately left out here: the body is form-encoded.
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        }
        # NOTE: the host part of this URL is missing in the published
        # snippet; restore it before running.
        url = "Http:// type=1&s_from=input&query=%E4%BA%BA%E6%B0%91%E6%97%A5%E6%8A%A5&ie=utf8&_sug_=n&_sug_type_="
        body = urlencode({"key1": "value1", "key2": "value2"})  # form data

        # The encoded form has to be handed to the request explicitly;
        # a POST built without a body is rejected by the client.
        req = HTTPRequest(url=url, method="POST", headers=myheaders,
                          body=body, follow_redirects=True,
                          request_timeout=20, connect_timeout=10)
        response = yield c.fetch(req)  # make the request
        print(response.code)
        print(response.body)
    except Exception:
        print(traceback.format_exc())
    finally:
        IOLoop.current().stop()  # stop the IOLoop

Request headers captured by Fiddler:

Adding certificate Access

import traceback

from tornado import gen
from tornado.curl_httpclient import CurlAsyncHTTPClient
from tornado.httpclient import HTTPRequest
from tornado.ioloop import IOLoop


@gen.coroutine
def fetch_url():
    """Grab a URL through a proxy, trusting a custom CA certificate.

    The request is configured with ``proxy_host``/``proxy_port`` and with
    ``ca_certs="FiddlerRoot.pem"``. Runs as a Tornado coroutine. Any error
    raised while building or sending the request is printed as a traceback.
    The current IOLoop is stopped once the request has finished, whether it
    succeeded or failed.
    """
    try:
        c = CurlAsyncHTTPClient()  # define an HTTP client
        myheaders = {
            # NOTE: the host value is blank in the published snippet; set it
            # to the target host (or drop the header) before running.
            "Host": "",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                           "AppleWebKit/537.36 (KHTML, like Gecko) "
                           "Chrome/68.0.3440.106 Safari/537.36"),
            "Accept": ("text/html,application/xhtml+xml,"
                       "application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"),
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        }
        # NOTE: the URL is blank in the published snippet; replace it with
        # the HTTPS address to fetch.
        url = ""

        # NOTE: the proxy host is blank in the published snippet; set it to
        # the address the proxy listens on.
        req = HTTPRequest(url=url, method="GET", headers=myheaders,
                          follow_redirects=True, request_timeout=20,
                          connect_timeout=10, proxy_host="",
                          proxy_port=8888,
                          ca_certs="FiddlerRoot.pem")  # bind certificate
        response = yield c.fetch(req)  # make the request
        print(response.code)
        print(response.body)
    except Exception:
        print(traceback.format_exc())
    finally:
        IOLoop.current().stop()  # stop the IOLoop

Packet captured by Fiddler (showing that the request went through normally):


Use Requests when the crawl volume is low: it is simple and easy to use. Tornado is recommended when a large amount of concurrency is needed: it is highly efficient and, being single-threaded, easy to program.

The interfaces and parameters used in Requests and Fiddler are described above, which can solve most of the problems faced by crawlers, including concurrent fetching, everyday anti-crawling, and HTTPS web fetching.

Here’s a snippet of my own common fetching logic:

import random
import traceback
from datetime import timedelta

from tornado import gen
from tornado.curl_httpclient import CurlAsyncHTTPClient
from tornado.httpclient import HTTPRequest
from tornado.ioloop import IOLoop, PeriodicCallback
from tornado.queues import Queue
from tornado.util import TimeoutError as QueueTimeoutError

TASK_QUE = Queue(maxsize=1000)

# How long a worker waits for a task before backing off (5 minutes).
TASK_WAIT_TIMEOUT = timedelta(minutes=5)


def response_handler(res):
    """Process a reply.

    Typically this adds newly parsed URLs to the task queue and parses out
    the target data. Left empty here as a template.
    """


@gen.coroutine
def url_fetcher_without_param():
    """Template for a worker coroutine that takes no arguments."""


@gen.coroutine
def url_fetcher(*args, **kwargs):
    """Worker loop: take tasks from ``TASK_QUE`` and fetch them forever.

    Each iteration waits up to ``TASK_WAIT_TIMEOUT`` for a task. On timeout
    it sleeps a random 10-100 seconds and tries again. Otherwise it sends
    the request, hands a 200 response to ``response_handler``, and then
    sleeps a random 10-100 seconds before the next task. A failed fetch is
    reported as a traceback and followed by an extra 10-second pause.

    ``*args`` and ``**kwargs`` are accepted so the worker can be spawned
    with or without arguments; they are not used.
    """
    c = CurlAsyncHTTPClient()

    while True:
        # console_show_log("Let's spider")
        try:
            # Queue.get returns a future, so it must be yielded.
            param = yield TASK_QUE.get(timeout=TASK_WAIT_TIMEOUT)
        except QueueTimeoutError:
            yield gen.sleep(random.randint(10, 100))
            continue

        try:
            # Configure method, headers, etc. as needed.
            # NOTE(review): assumes each queued task is the URL to fetch;
            # adapt this line to the real task format.
            req = HTTPRequest(url=param)
            response = yield c.fetch(req)
            if response.code == 200:
                response_handler(response)
        except Exception:
            print(traceback.format_exc())
            yield gen.sleep(10)

        print("I am a slow spider")
        yield gen.sleep(random.randint(10, 100))


@gen.coroutine
def period_callback():
    """Template for work that should run periodically."""


def main():
    """Spawn the worker coroutines, schedule the periodic task, run the loop."""
    io_loop = IOLoop.current()

    # Add concurrency: each spawn_callback starts one worker.
    io_loop.spawn_callback(url_fetcher, 1)
    io_loop.spawn_callback(url_fetcher, 2)
    io_loop.spawn_callback(url_fetcher_without_param)  # arguments are optional

    # If periodic calls are required, use PeriodicCallback.
    PERIOD_CALLBACK_MILSEC = 10  # 10, in ms
    PeriodicCallback(period_callback, PERIOD_CALLBACK_MILSEC).start()

    io_loop.start()


if __name__ == "__main__":
    main()

That is all for now; discussion and feedback are welcome.

