Blocking mode
import socket
from urllib.parse import urlparse
def get_url(url):
    """Fetch *url* over plain HTTP with a blocking socket and print the body.

    Every socket call here (connect, send, recv) blocks the calling thread
    until it completes.

    Args:
        url: An ``http://`` URL. The port defaults to 80 when the URL does
            not specify one.

    Raises:
        ValueError: If the URL contains no host.
        OSError: If connecting, sending or receiving fails.
    """
    parts = urlparse(url)
    # ``netloc`` may carry "user@host:port"; ``hostname``/``port`` separate them.
    host = parts.hostname
    if not host:
        raise ValueError("URL has no host: {!r}".format(url))
    port = parts.port or 80
    path = parts.path or "/"
    if parts.query:
        path = "{}?{}".format(path, parts.query)
    host_header = host if port == 80 else "{}:{}".format(host, port)

    # Hand-written HTTP/1.1 request; "Connection: close" makes the server
    # close the socket after the response, which is how the read loop ends.
    request = "GET {} HTTP/1.1\r\nHost: {}\r\nConnection: close\r\n\r\n".format(
        path, host_header
    ).encode("utf8")

    chunks = []
    # The context manager closes the socket even if an exception is raised.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client:
        client.connect((host, port))
        # sendall() keeps writing until the whole request has been sent.
        client.sendall(request)
        while True:
            chunk = client.recv(1024)
            if not chunk:  # empty bytes: the peer closed the connection
                break
            chunks.append(chunk)

    response = b"".join(chunks).decode("utf8", errors="replace")
    # Drop the response headers: everything up to the first blank line.
    # partition() keeps blank lines inside the body and never raises.
    _, _, html_data = response.partition("\r\n\r\n")
    print(html_data)
# Script entry point: fetch a sample page when run directly.
if __name__ == "__main__":
    get_url("http://www.baidu.com")
Copy the code
Non-blocking mode: because the socket calls return immediately, the code has to poll in a while loop to find out whether the connection has been established, which burns extra CPU.
import socket
from urllib.parse import urlparse
def get_url(url):
    """Fetch *url* over plain HTTP with a non-blocking socket and print the body.

    The socket is switched to non-blocking mode, so connect/send/recv return
    immediately instead of waiting. This function then busy-polls until each
    step succeeds, which keeps the CPU spinning while nothing is ready.

    Args:
        url: An ``http://`` URL. The port defaults to 80 when the URL does
            not specify one.

    Raises:
        ValueError: If the URL contains no host.
        ConnectionError: If the connection is refused, reset or broken.
    """
    parts = urlparse(url)
    # ``netloc`` may carry "user@host:port"; ``hostname``/``port`` separate them.
    host = parts.hostname
    if not host:
        raise ValueError("URL has no host: {!r}".format(url))
    port = parts.port or 80
    path = parts.path or '/'
    if parts.query:
        path = '{}?{}'.format(path, parts.query)
    host_header = host if port == 80 else '{}:{}'.format(host, port)

    request = 'GET {path} HTTP/1.1\r\nHost: {host}\r\nConnection: close\r\n\r\n'.format(
        path=path, host=host_header
    ).encode('utf8')

    chunks = []
    # The context manager closes the socket even if an exception is raised.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client:
        client.setblocking(False)  # switch the socket to non-blocking mode
        try:
            client.connect((host, port))
        except BlockingIOError:
            # Expected: the connection attempt is still in progress.
            pass

        # Keep retrying until the connection is up and the request is sent.
        sent = 0
        while sent < len(request):
            try:
                # send() may accept only part of the buffer; resume from there.
                sent += client.send(request[sent:])
            except ConnectionError:
                # Refused/reset/broken pipe will never succeed: stop retrying.
                raise
            except OSError:
                # Not connected yet (or would block): poll again.
                continue

        while True:
            try:
                chunk = client.recv(1024)
            except BlockingIOError:
                # No data available yet: poll again.
                continue
            if not chunk:  # empty bytes: the peer closed the connection
                break
            chunks.append(chunk)

    response = b''.join(chunks).decode('utf8', errors='replace')
    # Drop the response headers: everything up to the first blank line.
    _, _, html_data = response.partition('\r\n\r\n')
    print(html_data)
# Script entry point: fetch a sample page when run directly.
if __name__ == '__main__':
    get_url('http://www.baidu.com')
Copy the code
select (poll/epoll) + callbacks + event loop: this gives single-threaded concurrency and scales to many connections, but callback-style code is painful to write.
import socket
from urllib.parse import urlparse
# selectors is an easier-to-use wrapper around select/epoll; DefaultSelector automatically picks select or epoll depending on the platform (Windows/Linux)
from selectors import DefaultSelector, EVENT_READ, EVENT_WRITE
# Module-wide selector: Fetch registers its sockets here and loop() polls it.
selector = DefaultSelector()
# URLs still being fetched; Fetch.readable() removes an entry once its
# response has been fully received.
urls = ['http://www.baidu.com']
# Set to True by Fetch.readable() when `urls` becomes empty; loop() exits
# once it is set.
stop = False
class Fetch:
    """Fetch one URL over plain HTTP, driven by selector callbacks.

    ``get_url`` starts a non-blocking connect and registers ``connected`` for
    write-readiness. ``connected`` sends the request and registers
    ``readable`` for read-readiness. ``readable`` accumulates the response
    and prints the body once the server closes the connection.

    Relies on the module-level ``selector``, ``urls`` and ``stop``.
    """

    def connected(self, key):
        """Callback for write-readiness: send the HTTP request.

        Args:
            key: The ``SelectorKey`` passed in by the event loop.
        """
        selector.unregister(key.fd)  # stop watching for write-readiness
        host_header = self.host if self.port == 80 else '{}:{}'.format(
            self.host, self.port)
        request = 'GET {path} HTTP/1.1\r\nHost: {host}\r\nConnection: close\r\n\r\n'.format(
            path=self.path, host=host_header).encode('utf8')
        self.client.send(request)
        # From now on we only care about the response arriving.
        selector.register(self.client.fileno(), EVENT_READ, self.readable)

    def readable(self, key):
        """Callback for read-readiness: read one chunk, or finish at EOF.

        Args:
            key: The ``SelectorKey`` passed in by the event loop.
        """
        global stop
        try:
            chunk = self.client.recv(1024)
        except BlockingIOError:
            # Spurious wake-up: nothing to read yet, wait for the next event.
            return
        if chunk:
            self.data += chunk
            return

        # Empty read: the server closed the connection, the response is complete.
        selector.unregister(key.fd)
        response = self.data.decode('utf8', errors='replace')
        # Drop the response headers: everything up to the first blank line.
        _, _, html_data = response.partition('\r\n\r\n')
        print(html_data)
        self.client.close()
        # Tolerate URLs that were never listed in ``urls``.
        if self.spider_url in urls:
            urls.remove(self.spider_url)
        if not urls:
            stop = True  # nothing left to fetch: tell loop() to exit

    def get_url(self, url):
        """Start fetching *url*; returns immediately without waiting.

        Args:
            url: An ``http://`` URL. The port defaults to 80 when the URL
                does not specify one.

        Raises:
            ValueError: If the URL contains no host.
        """
        self.spider_url = url
        parts = urlparse(url)
        # ``netloc`` may carry "user@host:port"; ``hostname``/``port`` separate them.
        self.host = parts.hostname
        if not self.host:
            raise ValueError('URL has no host: {!r}'.format(url))
        self.port = parts.port or 80
        self.path = parts.path or '/'
        self.data = b''
        self.client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.client.setblocking(False)
        try:
            self.client.connect((self.host, self.port))
        except BlockingIOError:
            # Expected: the connection attempt is still in progress.
            pass
        # The socket becomes writable once the connection attempt finishes.
        selector.register(self.client.fileno(), EVENT_WRITE, self.connected)
def loop():
    """Run the event loop until the module-level ``stop`` flag is set.

    Repeatedly asks the selector which registered sockets are ready and
    calls the callback stored with each one.
    """
    # 1. select itself has no registration API; the selectors wrapper adds
    #    register()/unregister() on top of it.
    # 2. Invoking the callback after a socket's state changes is left to the
    #    programmer, which is what this loop does.
    while not stop:
        ready = selector.select()
        for key, _mask in ready:
            # ``key.data`` is the callback passed to selector.register().
            call_back = key.data
            call_back(key)
# Script entry point: start one fetch, then run the event loop until it finishes.
if __name__ == '__main__':
    fetcher = Fetch()
    fetcher.get_url('http://www.baidu.com')
    loop()
Copy the code
In summary: synchronous (blocking) code offers little concurrency, and callback-based code is complex to write. Multithreading requires synchronization between threads, which limits its concurrency performance.