- Basic code
# File: ok.py
import requests
# Listing-page URLs for pages 1 through 50 (inclusive).
urls = [
    f"https://www.cnblogs.com/#p{page}"
    for page in range(1, 50 + 1)
]


def craw(url: str) -> None:
    """Fetch *url* with an HTTP GET and print the URL and body length.

    Args:
        url: Address of the page to download.
    """
    r = requests.get(url)
    print(url, len(r.text))
# Smoke run: fetch the first URL in the list; craw prints the URL and the
# length of the response text. NOTE(review): this runs at import time too.
craw(urls[0])
Copy the code
import ok
import threading
def single_thread():
    """Crawl every URL in ``ok.urls`` sequentially on the calling thread."""
    for page_url in ok.urls:
        ok.craw(page_url)
def multi_thread():
    """Crawl every URL in ``ok.urls`` using one thread per URL.

    All thread objects are created first, then all are started, and the
    caller blocks until each of them has finished.
    """
    workers = [
        threading.Thread(target=ok.craw, args=(page_url,))
        for page_url in ok.urls
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
if __name__ == "__main__":
    # Run the sequential crawl first, then the threaded one.
    for run in (single_thread, multi_thread):
        run()
Copy the code
Multi-component pipeline technical architecture
- ok.py
import requests
from bs4 import BeautifulSoup
# Listing-page URLs for pages 1 through 50 (inclusive).
urls = [
    f"https://www.cnblogs.com/#p{page}"
    for page in range(1, 50 + 1)
]


def craw(url: str) -> str:
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response text.
    """
    r = requests.get(url)
    return r.text
def parse(html):
    """Extract post links from a page of HTML.

    Args:
        html: Markup to parse.

    Returns:
        A list of ``(href, text)`` tuples, one for each ``<a>`` element
        carrying the ``post-item-title`` class.
    """
    document = BeautifulSoup(html, "html.parser")
    extracted = []
    for anchor in document.find_all("a", class_="post-item-title"):
        extracted.append((anchor["href"], anchor.get_text()))
    return extracted
if __name__ == "__main__":
    # Demo: download the third URL in the list and print each
    # (href, text) pair that parse() extracts from it.
    for result in parse(craw(urls[2])):
        print(result)
Copy the code
- An updated version of the crawler
import time, random
import queue
import ok
import threading
def do_craw(url_queue: queue.Queue, html_queue: queue.Queue):
    """Worker loop: take URLs from *url_queue*, download them, queue the HTML.

    Args:
        url_queue: Queue of URLs still to be fetched.
        html_queue: Queue that receives each downloaded page body.

    The loop never returns; ``url_queue.get()`` blocks when the queue is
    empty.
    """
    while True:
        url = url_queue.get()
        html = ok.craw(url)
        html_queue.put(html)
        print(
            threading.current_thread().name,
            f"craw{url}",
            "url_queue.size=",
            url_queue.qsize(),
        )
        # Pause 1 or 2 seconds (inclusive bounds) between requests.
        time.sleep(random.randint(1, 2))
def do_parse(html_queue: queue.Queue, fout):
    """Worker loop: take HTML from *html_queue*, parse it, write the results.

    Args:
        html_queue: Queue of downloaded page bodies waiting to be parsed.
        fout: Writable text stream; each parsed result is written as one line.

    The loop never returns; ``html_queue.get()`` blocks when the queue is
    empty.
    """
    while True:
        html = html_queue.get()
        results = ok.parse(html)
        for result in results:
            fout.write(str(result) + "\n")
        # Report on this worker's own inputs: it has no URL or url_queue in
        # scope, only the parsed results and the HTML queue.
        print(
            threading.current_thread().name,
            "results.size=",
            len(results),
            "html_queue.size=",
            html_queue.qsize(),
        )
        # Pause 1 or 2 seconds (inclusive bounds) between batches.
        time.sleep(random.randint(1, 2))
if __name__ == "__main__":
    url_queue = queue.Queue()
    html_queue = queue.Queue()

    # Seed the work queue with every URL to crawl.
    for url in ok.urls:
        url_queue.put(url)

    # Three crawler threads move pages from url_queue into html_queue.
    for idx in range(3):
        t = threading.Thread(
            target=do_craw,
            args=(url_queue, html_queue),
            name=f"craw{idx}",
        )
        t.start()

    # Two parser threads drain html_queue into the shared output file.
    # The file is left open here because the parser threads keep writing
    # to it after this block finishes.
    fout = open("02.txt", "w")
    for idx in range(2):
        t = threading.Thread(
            target=do_parse,
            args=(html_queue, fout),
            name=f"parse{idx}",
        )
        t.start()
Copy the code