Blocking I/O: each socket call (connect, send, recv) blocks the calling thread until it completes.

import socket
from urllib.parse import urlparse


def get_url(url):
    """Fetch ``url`` over plain HTTP with a blocking socket and print the body.

    Every socket call made here (``connect``, ``sendall``, ``recv``) blocks
    the calling thread until it completes.

    Args:
        url: Absolute ``http://`` URL. An empty path is requested as ``/``.
            A port given in the URL is used for the connection; otherwise
            port 80 is used. TLS (``https``) is not supported.

    Returns:
        None. The part of the response after the first blank line (the body)
        is printed to stdout; an empty string is printed when the response
        contains no header/body separator.

    Raises:
        OSError: If connecting, sending or receiving fails.
        UnicodeDecodeError: If the response is not valid UTF-8.
    """
    parsed = urlparse(url)
    netloc = parsed.netloc
    path = parsed.path or "/"

    # Request line is "METHOD SP request-target SP HTTP-version CRLF".
    # "Connection: close" makes the server close the socket after the
    # response, which is what terminates the recv loop below.
    request = "GET {} HTTP/1.1\r\nHost: {}\r\nConnection: close\r\n\r\n".format(
        path, netloc
    ).encode("utf8")

    # The context manager closes the socket even if one of the calls raises.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client:
        # netloc may carry ":port", so connect to the bare hostname instead.
        client.connect((parsed.hostname or netloc, parsed.port or 80))
        # sendall keeps writing until the whole request has been handed off;
        # a single send() is allowed to write only part of the buffer.
        client.sendall(request)

        chunks = []
        while True:
            chunk = client.recv(1024)
            if not chunk:
                # An empty read means the peer closed the connection.
                break
            chunks.append(chunk)

    data = b"".join(chunks).decode("utf8")
    # Drop the status line and headers; keep only what follows the blank line.
    _, _, html_data = data.partition("\r\n\r\n")
    print(html_data)

if __name__ == "__main__":
    # Demo entry point: fetch a single page when run as a script.
    target = "http://www.baidu.com"
    get_url(target)
Copy the code

Non-blocking I/O: because we must keep asking whether the connection has been established, a `while` loop is needed to poll the socket's status, which consumes extra CPU.

import socket
from urllib.parse import urlparse


def get_url(url):
    """Fetch ``url`` over plain HTTP with a non-blocking socket and print the body.

    The socket is switched to non-blocking mode, so ``connect``, ``send`` and
    ``recv`` return immediately. The function busy-waits by retrying each
    call until it succeeds, which keeps the CPU spinning while waiting.

    Args:
        url: Absolute ``http://`` URL. An empty path is requested as ``/``.
            A port given in the URL is used for the connection; otherwise
            port 80 is used. TLS (``https``) is not supported.

    Returns:
        None. The part of the response after the first blank line (the body)
        is printed to stdout; an empty string is printed when the response
        contains no header/body separator.

    Raises:
        OSError: If the connection fails (for example, it is refused) or a
            send/receive error other than "would block" occurs.
        UnicodeDecodeError: If the response is not valid UTF-8.
    """
    # Local import: only needed to recognise the "socket not connected yet"
    # error while the handshake is still in progress.
    import errno

    parsed = urlparse(url)
    netloc = parsed.netloc
    path = parsed.path or '/'
    request = 'GET {path} HTTP/1.1\r\nHost: {host}\r\nConnection: close\r\n\r\n'.format(
        path=path, host=netloc).encode('utf8')

    # The context manager closes the socket even if one of the calls raises.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client:
        client.setblocking(False)  # set to non-blocking

        try:
            # netloc may carry ":port", so connect to the bare hostname.
            client.connect((parsed.hostname or netloc, parsed.port or 80))
        except BlockingIOError:
            # Expected: the handshake continues in the background.
            pass

        # Poll until the connection is usable and the whole request is sent.
        # Only "would block" / "not connected yet" are retried; real failures
        # such as a refused connection propagate instead of spinning forever.
        pending = request
        while pending:
            try:
                sent = client.send(pending)
            except BlockingIOError:
                continue
            except OSError as exc:
                if exc.errno != errno.ENOTCONN:
                    raise
                continue
            pending = pending[sent:]

        chunks = []
        while True:
            try:
                chunk = client.recv(1024)
            except BlockingIOError:
                # No data available yet; ask again.
                continue

            if not chunk:
                # An empty read means the peer closed the connection.
                break
            chunks.append(chunk)

    data = b''.join(chunks).decode('utf8')
    # Drop the status line and headers; keep only what follows the blank line.
    _, _, html_data = data.partition('\r\n\r\n')
    print(html_data)


if __name__ == '__main__':
    # Demo entry point: fetch a single page when run as a script.
    target = 'http://www.baidu.com'
    get_url(target)

Copy the code

select (poll/epoll) + callbacks + an event loop: concurrency within a single thread, with high concurrency, but callback-style code like this is painful to write.

import socket
from urllib.parse import urlparse
# `selectors` is an easier-to-use wrapper around select/epoll; DefaultSelector automatically picks select or epoll depending on the platform (Windows/Linux).
from selectors import DefaultSelector, EVENT_READ, EVENT_WRITE

# Set to True once there is nothing left to fetch; checked by the event loop.
stop = False

# URLs that are still outstanding.
urls = ['http://www.baidu.com']

# Shared I/O multiplexer on which sockets and their callbacks are registered.
selector = DefaultSelector()
class Fetch:
    """Callback-driven HTTP GET built on the module-level ``selector``.

    ``get_url`` starts a non-blocking connect and registers ``connected`` for
    writability. ``connected`` sends the request and registers ``readable``
    for readability. ``readable`` accumulates the response and, once the
    server closes the connection, prints the body and updates the
    module-level ``urls`` / ``stop`` bookkeeping.
    """

    def connected(self, key):
        """Send the request once the socket has become writable.

        Args:
            key: The ``SelectorKey`` for this socket, as returned by
                ``selector.select()``.
        """
        # Stop watching for writability; from here on only reads matter.
        selector.unregister(key.fd)
        request = 'GET {path} HTTP/1.1\r\nHost: {host}\r\nConnection: close\r\n\r\n'.format(
            path=self.path, host=self.host)
        self.client.send(request.encode('utf8'))
        selector.register(self.client.fileno(), EVENT_READ, self.readable)

    def readable(self, key):
        """Read one chunk of the response; finish up when the peer closes.

        Args:
            key: The ``SelectorKey`` for this socket, as returned by
                ``selector.select()``.
        """
        global stop

        d = self.client.recv(1024)
        if d:
            # More data may follow; stay registered and wait for the next event.
            self.data += d
            return

        # Empty read: the server closed the connection, the response is complete.
        # Unregister before closing so the selector never holds a dead fd.
        selector.unregister(key.fd)
        data = self.data.decode('utf8')
        # Drop the status line and headers; keep only what follows the blank line.
        _, _, html_data = data.partition('\r\n\r\n')
        print(html_data)
        self.client.close()

        # Tolerate URLs that were fetched without being listed in ``urls``.
        if self.spider_url in urls:
            urls.remove(self.spider_url)
        if not urls:
            stop = True

    def get_url(self, url):
        """Start fetching ``url`` and return immediately.

        Args:
            url: Absolute ``http://`` URL. An empty path is requested as
                ``/``. A port given in the URL is used for the connection;
                otherwise port 80 is used.
        """
        self.spider_url = url
        parsed = urlparse(url)
        self.host = parsed.netloc
        self.path = parsed.path or '/'
        self.data = b''

        self.client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.client.setblocking(False)

        try:
            # netloc may carry ":port", so connect to the bare hostname.
            self.client.connect((parsed.hostname or self.host, parsed.port or 80))
        except BlockingIOError:
            # Expected: the handshake continues in the background.
            pass

        # The socket becomes writable once the connection is established.
        selector.register(self.client.fileno(), EVENT_WRITE, self.connected)

def loop():
    """Run the event loop until the module-level ``stop`` flag becomes true.

    Repeatedly asks ``selector`` which registered sockets are ready and calls
    the callback stored as each key's ``data``, passing it the key.

    Notes:
        1. ``select`` itself has no register/callback mechanism.
        2. Dispatching the callback after a socket's state changes is
           therefore done by the programmer, here.
    """
    while not stop:
        for key, _mask in selector.select():
            callback = key.data
            callback(key)



if __name__ == '__main__':
    # Demo entry point: start one fetch, then run the event loop.
    spider = Fetch()
    spider.get_url('http://www.baidu.com')
    loop()
Copy the code

In summary: synchronous (blocking) code achieves little concurrency, callback-based code is complex to write, and multithreading requires synchronization between threads, which hurts concurrent performance.