One approach is to crawl the smallest parent tag that contains all of the information above, and then extract the title, rating, introduction, and link address from each parent tag in turn; the other is to crawl all the titles, all the ratings, all the introductions, and all the links separately, and then match them up by position. This time we take the first approach. The complete code:
```python
import random
import time

import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook


def getBook(page):
    # Page 0 is the plain Top 250 URL; later pages use the ?start= offset.
    if page == 0:
        url = 'https://book.douban.com/top250'
    else:
        url = 'https://book.douban.com/top250' + '?start=' + str(page * 25)
    try:
        kv = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
                          'AppleWebKit/605.1.15 (KHTML, like Gecko) '
                          'Version/13.1.3 Safari/605.1.15',
            # Paste your own cookie string copied from the browser here.
            'Cookie': '...',
        }
        # verify=False skips certificate verification and triggers the
        # InsecureRequestWarning discussed below.
        r = requests.get(url, headers=kv, verify=False)
        time.sleep(random.randint(3, 5))  # pause between pages to be polite
        r.raise_for_status()
        r.encoding = r.apparent_encoding
    except Exception as e:
        print('crawl error:', e)
    html = r.text
    bs = BeautifulSoup(html, "html.parser")
    return bs


def getMessage(soup):
    book_list = []
    list_books = soup.find_all('tr', class_="item")
    for book in list_books:
        list_item = []
        # The title and the link live in the same <a> tag inside div.pl2.
        tag_list = book.find('div', class_="pl2").find('a')
        title = tag_list.get_text().replace('\n', '').strip(' ').replace(' ', '')
        list_item.append(title)
        link = tag_list['href']
        list_item.append(link)
        # The p.pl line reads "author / translator / publisher / ...".
        tag_author = book.find('p', class_="pl")
        author = tag_author.string.split('/')[0]
        list_item.append(author)
        tag_rate = book.find('span', class_='rating_nums')
        rating_nums = tag_rate.string
        list_item.append(rating_nums)
        # The span.pl text arrives wrapped in parentheses and whitespace;
        # strip it down to the bare ratings count.
        tag_judge = book.find('span', class_='pl')
        judge = (tag_judge.string.replace('\n', '').replace('(', '')
                 .replace(')', '').strip(' ').split(' ')[0])
        list_item.append(judge)
        # Some books have no recommendation quote; see problem 1 below.
        try:
            tag_quote = book.find('span', class_='inq').string
        except AttributeError:
            tag_quote = str(None)
        list_item.append(tag_quote)
        book_list.append(list_item)
    return book_list


if __name__ == '__main__':
    wb = Workbook()
    ws = wb.active
    ws.append(['title', 'link', 'author', 'rating', 'ratings count', 'quote'])
    for n in range(0, 10):
        print("crawling page %d" % (n + 1))
        bs = getBook(n)
        books = getMessage(bs)  # parse each page once, then write its rows
        for i in range(25):
            ws.append(books[i])
    wb.save("bookTop250下载.xlsx")
```
The result of the crawl is as follows
Problems encountered while crawling:
1. Some books have no recommendation quote, so the crawl kept raising 'NoneType' object has no attribute 'string'. Filtering with if/else did not work well; in the end a try/except block solved it (excerpt below).
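For clarity, here is the relevant excerpt from getMessage above: find() returns None when the quote span is missing, and calling .string on None is what raised the error, so the except branch substitutes a placeholder.

```python
# Excerpt from getMessage(): books without a recommendation have no
# <span class="inq">, so .find() returns None and .string would raise
# AttributeError; the except branch falls back to the string 'None'.
try:
    tag_quote = book.find('span', class_='inq').string
except AttributeError:
    tag_quote = str(None)
list_item.append(tag_quote)
```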
2. Book title and link: as the screenshot shows, the title and the website address live in the same <a> tag, so they are obtained in two different ways; see the excerpt after point 3.
Reference: BeautifulSoup extracts the contents of a tag
Reference: Basic crawler library
Reference: Python crawler p tag NavigableString get problem
3. After obtaining the title and the other fields, the raw strings are cleaned and formatted (newlines, stray spaces, and parentheses are stripped), as in the excerpt below.
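Both points are visible in this excerpt from getMessage above: get_text() yields the visible title while the href attribute yields the link, and chains of replace()/strip() purify the raw strings (the ratings count, for example, arrives wrapped in parentheses and whitespace).

```python
# Excerpt from getMessage(): title and link come from the same <a> tag
# but through different accessors; the raw strings are then cleaned of
# newlines, spaces, and parentheses before being stored.
tag_list = book.find('div', class_="pl2").find('a')
title = tag_list.get_text().replace('\n', '').strip(' ').replace(' ', '')
link = tag_list['href']

tag_judge = book.find('span', class_='pl')
judge = (tag_judge.string.replace('\n', '').replace('(', '')
         .replace(')', '').strip(' ').split(' ')[0])
```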
4. The following warning appears at runtime because the request is made with verify=False; it does not affect the result. One way to silence it is sketched below.
Reference: Requests warning: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised.
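The warning comes from urllib3, which requests uses under the hood, whenever a request is made with verify=False. If you prefer to silence it rather than just ignore it, a minimal sketch using urllib3's own helper:

```python
# Optional: suppress the InsecureRequestWarning emitted because the
# request is made with verify=False (certificate checking disabled).
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
```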
5. Other methods for collecting information from a web page: regular expressions, XPath, and so on. A small XPath taste is sketched after the reference below.
Reference: Python crawler selector (1): xpath
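As a quick taste of the XPath route, here is a minimal sketch assuming the lxml package is installed; it targets the same div.pl2 anchors the BeautifulSoup code above relies on.

```python
# Minimal XPath sketch with lxml: fetch one page and select the text of
# each book-title <a> inside <div class="pl2">, dropping the
# whitespace-only text nodes Douban's markup produces.
import requests
from lxml import etree

headers = {'user-agent': 'Mozilla/5.0'}  # same idea as the kv dict above
r = requests.get('https://book.douban.com/top250', headers=headers)
tree = etree.HTML(r.text)
titles = [t.strip() for t in tree.xpath('//div[@class="pl2"]/a/text()')
          if t.strip()]
print(titles[:5])
```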
These will be covered in detail in a future post.