Using multithreading to accelerate a crawler program

Basic code

# ok.py
import requests
# Listing pages to crawl: page numbers 1 through 50 inclusive.
urls = [
    f"https://www.cnblogs.com/#p{page}"
    for page in range(1, 50 + 1)
]


def craw(url):
    """Download ``url`` and print the URL together with the body length.

    Args:
        url: Address of the page to fetch with ``requests.get``.
    """
    r = requests.get(url)
    print(url, len(r.text))

craw(urls[0])
Copy the code

import ok
import threading
def single_thread():
    """Crawl every URL in ``ok.urls`` sequentially on the calling thread."""
    for address in ok.urls:
        ok.craw(address)


def multi_thread():
    """Crawl every URL in ``ok.urls`` concurrently, one thread per URL.

    All threads are created first, then started, then joined, so the call
    returns only after every ``ok.craw`` invocation has finished.
    """
    workers = [
        threading.Thread(target=ok.craw, args=(address,))
        for address in ok.urls
    ]

    for worker in workers:
        worker.start()

    for worker in workers:
        worker.join()

if __name__ == "__main__":
    # Run the sequential crawl first, then the one-thread-per-URL crawl.
    single_thread()
    multi_thread()
Copy the code

Multi-component pipeline technical architecture

ok.py

import requests
from bs4 import BeautifulSoup

# Listing pages to crawl: page numbers 1 through 50 inclusive.
urls = [
    f"https://www.cnblogs.com/#p{page}"
    for page in range(1, 50 + 1)
]


def craw(url):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch with ``requests.get``.

    Returns:
        The ``text`` attribute of the response.
    """
    r = requests.get(url)
    return r.text

def parse(html):
    """Extract post links from a listing page.

    Args:
        html: HTML source of the page.

    Returns:
        A list of ``(href, text)`` tuples, one for each ``<a>`` element
        whose class is ``post-item-title``.
    """
    document = BeautifulSoup(html, "html.parser")
    found = []
    for anchor in document.find_all("a", class_="post-item-title"):
        found.append((anchor["href"], anchor.get_text()))
    return found


if __name__ == "__main__":
    # Smoke test: download the third listing page and print every
    # (href, title) pair that parse() extracts from it.
    for result in parse(craw(urls[2])):
        print(result)
Copy the code

An updated version of the crawler

import time, random
import queue
import ok
import threading
def do_craw(url_queue: queue.Queue, html_queue: queue.Queue):
    """Download worker: turn queued URLs into queued HTML.

    Loops forever. Each iteration takes one URL from ``url_queue``,
    downloads it with ``ok.craw``, puts the result on ``html_queue``,
    prints a progress line and then sleeps for 1 or 2 seconds.

    Args:
        url_queue: Queue of URLs still to be downloaded.
        html_queue: Queue that receives the downloaded HTML.
    """
    while True:
        url = url_queue.get()
        html = ok.craw(url)
        html_queue.put(html)
        print(
            threading.current_thread().name,
            f"craw{url}",
            "url_queue.size=",
            url_queue.qsize(),
        )
        # randint needs both bounds and includes both of them.
        time.sleep(random.randint(1, 2))


def do_parse(html_queue: queue.Queue, fout):
    """Parse worker: turn queued HTML into lines written to ``fout``.

    Loops forever. Each iteration takes one HTML document from
    ``html_queue``, parses it with ``ok.parse``, writes ``str(result)``
    plus a newline for every result, prints a progress line and then
    sleeps for 1 or 2 seconds.

    Args:
        html_queue: Queue of HTML documents waiting to be parsed.
        fout: Writable text file object that receives the results.
    """
    while True:
        html = html_queue.get()
        results = ok.parse(html)
        for result in results:
            fout.write(str(result) + "\n")
        # Report only values this function owns; no URL or URL queue is
        # passed in, so the log describes the parse results and html_queue.
        print(
            threading.current_thread().name,
            "results.size=",
            len(results),
            "html_queue.size=",
            html_queue.qsize(),
        )
        # randint needs both bounds and includes both of them.
        time.sleep(random.randint(1, 2))

if __name__ == "__main__":
    url_queue = queue.Queue()
    html_queue = queue.Queue()
    # Seed the work queue with every listing page before starting workers.
    for url in ok.urls:
        url_queue.put(url)

    # Three download workers feed html_queue from url_queue.
    for idx in range(3):
        t = threading.Thread(
            target=do_craw,
            args=(url_queue, html_queue),
            name=f"craw{idx}",
        )
        t.start()

    # Not wrapped in `with`: the parse workers loop forever and keep writing
    # to this handle after the main thread reaches the end of the script.
    fout = open("02.txt", "w")
    # Two parse workers drain html_queue into the shared output file.
    for idx in range(2):
        t = threading.Thread(
            target=do_parse,
            args=(html_queue, fout),
            name=f"parse{idx}",
        )
        t.start()
Copy the code

Using multithreading to accelerate a crawler program

Multi-component pipeline technical architecture

Related Posts

How to test compatibility of web pages?

Reference to Offer II 023. First overlap node of two linked lists

[Miscellany – Self-blame paste] linked list reverse wrote for a long time