
I believe everyone who writes web crawlers has, at some point, spent half a day writing one, finally got it to run, and then found that it is slow; I certainly have. This is the era of big data, and merely being able to write a crawler is no longer competitive. Learning to optimize crawler code (its robustness, its crawl speed, and so on) is what improves your competitiveness.

This article uses Qiushi Encyclopedia as an example and compares the ==running time== of an ordinary crawler with that of a multithreaded crawler. I believe you will appreciate the power of multithreading!

1. Ordinary crawler

import requests
from lxml import etree
import time
import sys

# Request headers sent with every page request; the User-Agent is a
# desktop Chrome browser string.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36",
}

# crawl
def Crawl(response):
    """Extract the matching text from one fetched page and append it to ``duanzi.txt``.

    Args:
        response: Object exposing the page HTML as ``response.text``.
    """
    e = etree.HTML(response.text)
    # Select the first <span> under every <div> whose class attribute is
    # exactly "content".
    span_text = e.xpath("//div[@class='content']/span[1]")
    with open("duanzi.txt", "a", encoding="utf-8") as f:
        for span in span_text:
            # string(.) yields the concatenated text of the node and all
            # of its descendants.
            info = span.xpath("string(.) ")
            # Save each entry on its own line.
            f.write(info.strip() + "\n")

if __name__ == '__main__':
    # Write down the start time
    start = time.time()
    # NOTE(review): the URL template is only a bare "{}" placeholder here;
    # replace it with the real page URL pattern (with "{}" where the page
    # number goes) before running.
    base_url = "{}"
    for i in range(1, 14):
        # Print the number of the page currently being crawled
        print("Climbing page {}".format(i))
        new_url = base_url.format(i)
        # Send a GET request
        response = requests.get(new_url, headers=headers)
        # Parse the fetched page and save its text
        Crawl(response)
    # Write down the end time
    end = time.time()
    # Get the run time by subtracting
    print(end - start)

2. Multi-threaded crawlers

import requests
from lxml import etree
# Queue is a first-in, first-out (FIFO) queue
from queue import Queue
from threading import Thread
import time

# request header
# Request headers sent with every page request; the User-Agent is a
# desktop Chrome browser string.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36",
}

# Data acquisition
# inheritance Thread
class CrawlInfo(Thread):
    """Worker thread that downloads pages.

    Takes URLs from ``url_queue``, requests each one, and puts the body text
    of every HTTP 200 response onto ``html_queue``.
    """

    def __init__(self, url_queue, html_queue):
        """Store the input and output queues.

        Args:
            url_queue: Queue of URLs still to be fetched.
            html_queue: Queue that receives the text of fetched pages.
        """
        # Thread.__init__ must run before start() can be called.
        super().__init__()
        self.url_queue = url_queue
        self.html_queue = html_queue

    def run(self):
        """Fetch URLs until ``url_queue`` is drained."""
        from queue import Empty

        while True:
            try:
                # Non-blocking get: with several workers, another thread can
                # take the last URL between an empty() check and a blocking
                # get(), which would leave this thread waiting forever.
                url = self.url_queue.get_nowait()
            except Empty:
                break
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                # Put the data into the queue
                self.html_queue.put(response.text)

# Data parsing and saving
# inheritance Thread
class ParseInfo(Thread):
    """Worker thread that parses fetched pages and saves the extracted text.

    Takes page text from ``html_queue``, selects the matching ``<span>``
    nodes, and appends their text to ``duanzi.txt``.
    """

    def __init__(self, html_queue):
        """Store the queue of pages to parse.

        Args:
            html_queue: Queue holding the text of fetched pages.
        """
        # Thread.__init__ must run before start() can be called.
        super().__init__()
        self.html_queue = html_queue

    def run(self):
        """Parse pages until ``html_queue`` is drained."""
        from queue import Empty

        while True:
            try:
                # Fetch the next page from the queue without blocking: with
                # several workers, another thread can take the last page
                # between an empty() check and a blocking get().
                html = self.html_queue.get_nowait()
            except Empty:
                break
            e = etree.HTML(html)
            # Select the first <span> under every <div> whose class
            # attribute is exactly "content".
            span_text = e.xpath("//div[@class='content']/span[1]")
            with open("duanzi.txt", "a", encoding="utf-8") as f:
                for span in span_text:
                    # string(.) yields the concatenated text of the node
                    # and all of its descendants.
                    info = span.xpath("string(.) ")
                    # Save each entry on its own line.
                    f.write(info.strip() + "\n")

# start
if __name__ == '__main__':
    # Record the start time so the total run time can be printed at the end.
    start = time.time()
    # instantiation
    url_queue = Queue()
    html_queue = Queue()
    # NOTE(review): the URL template is only a bare "{}" placeholder here;
    # replace it with the real page URL pattern (with "{}" where the page
    # number goes) before running.
    base_url = "{}"
    for i in range(1, 14):
        print("Climbing page {}".format(i))
        new_url = base_url.format(i)
        # Queue the page URL for the download threads.
        url_queue.put(new_url)

    # Start three download threads.
    crawl_list = []
    for _ in range(3):
        crawl = CrawlInfo(url_queue, html_queue)
        crawl_list.append(crawl)
        crawl.start()

    for crawl in crawl_list:
        # join() blocks until that thread's run() has returned, so every
        # page is downloaded before parsing starts.
        crawl.join()

    # Start three parsing threads.
    parse_list = []
    for _ in range(3):
        parse = ParseInfo(html_queue)
        parse_list.append(parse)
        parse.start()

    for parse in parse_list:
        parse.join()

    # Print the elapsed time.
    end = time.time()
    print(end - start)

3. Running-time comparison

==Ordinary crawler==

==Multithreaded crawler==

Some readers may feel that a difference of a few seconds is no big deal. But, once again, this is the ==big data era==, in which data volumes easily reach ==millions or tens of millions== of records. If you can still only write ordinary crawler code, then you cannot compete with others on ==speed==, and you cannot even compete with them on ==technique==. That would be a sad thing indeed!

