Write the crawled contents to Excel using xlwt
```python
import re
import requests
import time
import random
import xlwt
from bs4 import BeautifulSoup


def getBook(page):
    # The list shows 25 books per page; paging is done via the ?start= parameter
    if page == 0:
        url = 'https://book.douban.com/top250'
    else:
        url = 'https://book.douban.com/top250' + '?start=' + str(page * 25)
    try:
        kv = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
                          'AppleWebKit/605.1.15 (KHTML, like Gecko) '
                          'Version/13.1.3 Safari/605.1.15',
            # The original hard-coded a session cookie here, but it was garbled
            # in the source; paste your own logged-in Douban cookie instead
            'Cookie': 'paste-your-own-douban-cookie-here',
        }
        # verify=False skips TLS verification and triggers an InsecureRequestWarning
        r = requests.get(url, headers=kv, verify=False)
        time.sleep(random.randint(3, 5))  # pause between requests to be polite
        r.raise_for_status()
        r.encoding = r.apparent_encoding
    except Exception:
        print('crawl error')
        raise  # re-raise so a failed page doesn't crash later with a confusing NameError
    html = r.text
    bs = BeautifulSoup(html, "html.parser")
    return bs


def getNames(bs):
    # Book titles live in the title attribute of the <a> inside div.pl2
    titles = []
    lists = bs.find_all('div', class_="pl2")
    for item in lists:
        name = item.find('a').get('title')
        titles.append(name)
    return titles


def getUrls(bs):
    # Detail-page links are the href of the same <a> (the original had 'herf')
    urls = []
    lists = bs.find_all('div', class_="pl2")
    for item in lists:
        url = item.find('a').get('href')
        urls.append(url)
    return urls


def getBasicMessage(bs):
    # Author / publisher / date / price line: p.pl
    lists = bs.find_all('p', class_="pl")
    messages = []
    for item in lists:
        message = item.string
        messages.append(message)
    return messages


def getStars(bs):
    # Average score: span.rating_nums
    lists = bs.find_all('span', class_="rating_nums")
    stars = []
    for item in lists:
        star = float(item.string)
        stars.append(star)
    return stars


def getPeopleNumbers(bs):
    # Rating counts: span.pl holds text like "(300000人评价)"
    peoples = []
    lists = bs.find_all('span', class_="pl")
    for item in lists:
        people = item.text
        peoples.append(people)
    num = []
    for i in peoples:
        r = re.findall(r"\d+\.?\d*", i)
        num.append(int(r[0]))
    return num


def getInq(bs):
    # One-line recommendation: span.inq (missing for some books, which is why
    # the column is commented out in the main loop below)
    Inqs = []
    lists = bs.find_all('span', class_="inq")
    for item in lists:
        inq = item.text
        Inqs.append(inq)
    return Inqs


if __name__ == '__main__':
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('TOP250', cell_overwrite_ok=True)
    sheet.write(0, 0, 'title')
    sheet.write(0, 1, 'url')
    sheet.write(0, 2, 'basic information')
    sheet.write(0, 3, 'score')
    sheet.write(0, 4, 'number of ratings')
    # sheet.write(0, 5, 'recommendation')
    for n in range(0, 10):
        print("crawling page %d" % (n + 1))
        bs = getBook(n)
        # Parse each column once per page instead of once per cell
        names, urls = getNames(bs), getUrls(bs)
        messages, stars, numbers = getBasicMessage(bs), getStars(bs), getPeopleNumbers(bs)
        for i in range(0, 25):
            row = n * 25 + i + 1
            sheet.write(row, 0, names[i])
            sheet.write(row, 1, urls[i])
            sheet.write(row, 2, messages[i])
            sheet.write(row, 3, stars[i])
            sheet.write(row, 4, numbers[i])
            # sheet.write(row, 5, getInq(bs)[i])
    # The output filename was garbled in the source; any .xls name works
    book.save('booktop250.xls')
    print("crawl done")
```
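One small detail worth isolating: getPeopleNumbers pulls the rating count out of text such as `(300000人评价)` with a regular expression. A minimal sketch of just that extraction (the sample string is made up for illustration):

```python
import re

# span.pl on the list page holds text like "(300000人评价)";
# the pattern grabs the first run of digits, with an optional decimal part
text = "(300000人评价)"
matches = re.findall(r"\d+\.?\d*", text)
print(int(matches[0]))  # 300000
```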
Write the crawled contents to Excel using openpyxl
```python
import re
import requests
import time
import random
from openpyxl import Workbook
from bs4 import BeautifulSoup


# The scraping helpers are the same as in the xlwt version above, except for
# getInq, which now substitutes a placeholder when the quote is empty.

def getBook(page):
    if page == 0:
        url = 'https://book.douban.com/top250'
    else:
        url = 'https://book.douban.com/top250' + '?start=' + str(page * 25)
    try:
        kv = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
                          'AppleWebKit/605.1.15 (KHTML, like Gecko) '
                          'Version/13.1.3 Safari/605.1.15',
            # Paste your own Douban cookie here (the original was garbled)
            'Cookie': 'paste-your-own-douban-cookie-here',
        }
        r = requests.get(url, headers=kv, verify=False)
        time.sleep(random.randint(3, 5))
        r.raise_for_status()
        r.encoding = r.apparent_encoding
    except Exception:
        print('crawl error')
        raise
    html = r.text
    bs = BeautifulSoup(html, "html.parser")
    return bs


def getNames(bs):
    titles = []
    lists = bs.find_all('div', class_="pl2")
    for item in lists:
        name = item.find('a').get('title')
        titles.append(name)
    return titles


def getUrls(bs):
    urls = []
    lists = bs.find_all('div', class_="pl2")
    for item in lists:
        url = item.find('a').get('href')
        urls.append(url)
    return urls


def getBasicMessage(bs):
    lists = bs.find_all('p', class_="pl")
    messages = []
    for item in lists:
        message = item.string
        messages.append(message)
    return messages


def getStars(bs):
    lists = bs.find_all('span', class_="rating_nums")
    stars = []
    for item in lists:
        star = float(item.string)
        stars.append(star)
    return stars


def getPeopleNumbers(bs):
    peoples = []
    lists = bs.find_all('span', class_="pl")
    for item in lists:
        people = item.text
        peoples.append(people)
    num = []
    for i in peoples:
        r = re.findall(r"\d+\.?\d*", i)
        num.append(int(r[0]))
    return num


def getInq(bs):
    # Fall back to a placeholder when the recommendation is empty
    Inqs = []
    lists = bs.find_all('span', class_="inq")
    for item in lists:
        if len(item.text) != 0:
            Inqs.append(item.text)
        else:
            Inqs.append('no comment')
    return Inqs


if __name__ == '__main__':
    wb = Workbook()
    ws = wb.active
    # With openpyxl a whole row can be appended as a list; the header goes first
    ws.append(['title', 'url', 'basic information', 'score', 'number of ratings'])
    for n in range(0, 10):
        print("crawling page %d" % (n + 1))
        bs = getBook(n)
        names, urls = getNames(bs), getUrls(bs)
        messages, stars, numbers = getBasicMessage(bs), getStars(bs), getPeopleNumbers(bs)
        for i in range(25):
            ws.append([names[i], urls[i], messages[i], stars[i], numbers[i]])
    wb.save("booktop250.xlsx")
    print("crawl done")
```
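A design note on the two libraries: xlwt addresses every cell by explicit (row, column) coordinates, which is why the first script has to compute `n * 25 + i + 1`, while openpyxl's `ws.append` always writes to the next free row, so no index bookkeeping is needed. A minimal side-by-side sketch:

```python
import xlwt
from openpyxl import Workbook

# xlwt: the caller tracks the row index explicitly
book = xlwt.Workbook(encoding='utf-8')
sheet = book.add_sheet('demo')
for row, value in enumerate(['a', 'b', 'c']):
    sheet.write(row, 0, value)

# openpyxl: append() writes each list to the next empty row automatically
wb = Workbook()
ws = wb.active
for value in ['a', 'b', 'c']:
    ws.append([value])
```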
For the detailed analysis of the page structure, refer to the related write-ups; the selectors used above assume markup like the sketch below.
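The following is a hand-written reconstruction of roughly what one entry on the Top 250 list page looks like (illustrative only, not copied from the live page), showing how each helper maps to the markup:

```python
from bs4 import BeautifulSoup

# Illustrative fragment of one entry on the Top 250 list page
html = '''
<div class="pl2"><a href="https://book.douban.com/subject/1007305/" title="红楼梦">红楼梦</a></div>
<p class="pl">[清] 曹雪芹 著 / 人民文学出版社 / 1996-12 / 59.70元</p>
<span class="rating_nums">9.6</span>
<span class="pl">(300000人评价)</span>
<span class="inq">都云作者痴，谁解其中味？</span>
'''
bs = BeautifulSoup(html, "html.parser")
print(bs.find('div', class_="pl2").find('a').get('title'))   # title
print(bs.find('div', class_="pl2").find('a').get('href'))    # detail url
print(bs.find('p', class_="pl").string)                      # basic information
print(float(bs.find('span', class_="rating_nums").string))   # score
```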
Note: if the same cell is written twice without `cell_overwrite_ok=True`, xlwt raises `Exception: Attempt to overwrite cell: sheetname='Sheet1' rowx=1 colx=0`, which is why the sheet above is created with that flag.
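A minimal sketch reproducing that error and the `cell_overwrite_ok=True` fix:

```python
import xlwt

book = xlwt.Workbook(encoding='utf-8')
sheet = book.add_sheet('Sheet1')          # cell_overwrite_ok defaults to False
sheet.write(1, 0, 'first')
try:
    sheet.write(1, 0, 'second')           # writing the same cell again
except Exception as e:
    print(e)  # Attempt to overwrite cell: sheetname='Sheet1' rowx=1 colx=0

# With cell_overwrite_ok=True the second write silently replaces the first
book2 = xlwt.Workbook(encoding='utf-8')
sheet2 = book2.add_sheet('Sheet1', cell_overwrite_ok=True)
sheet2.write(1, 0, 'first')
sheet2.write(1, 0, 'second')              # no exception
```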
openpyxl is simple to use; see the tutorial at openpyxl.readthedocs.io/en/stable/t…
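A minimal sketch of the tutorial basics, writing a workbook and reading it back (filenames are illustrative):

```python
from openpyxl import Workbook, load_workbook

# Create a workbook, add a header row and one data row, then save it
wb = Workbook()
ws = wb.active
ws.append(['title', 'score'])
ws.append(['Example Book', 9.0])
wb.save('demo.xlsx')

# Read it back; values_only=True yields plain cell values instead of Cell objects
wb2 = load_workbook('demo.xlsx')
for row in wb2.active.iter_rows(values_only=True):
    print(row)
```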