1. Open the cnblogs (博客园) homepage:
https://www.cnblogs.com/
Scrolling down, we find that more blogs are loaded one page at a time. As we click through the pages, the cnblogs URL changes in a regular pattern:
https://www.cnblogs.com/#p3
Clicking the third page gives the URL above, so the number after #p is the page number. With this pattern we can build the links for all 200 pages. The code is:
url="https://www.cnblogs.com/"
def get_html(url) : Get the 200 main page links of the blog garden and crawl them into the list
html_list=[]
for i in range(1.201) :# for I in range (1, 2) :
r=requests.get(url+"/#p"+str(i))
r.encoding=r.apparent_encoding
html_list.append(BeautifulSoup(r.text,"html.parser"))
return html_list
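A caveat worth flagging: everything after # in a URL is a fragment, which the browser handles locally and requests never sends to the server, so the loop above may actually receive the same first page 200 times. If that happens, try a server-side pagination path instead. A minimal sketch, assuming cnblogs serves its list pages at /sitehome/p/<n> (an assumption worth verifying in the browser's Network tab):

import requests
from bs4 import BeautifulSoup

def get_html_paged():  # hypothetical variant using a server-side pagination path
    html_list = []
    for i in range(1, 201):
        r = requests.get("https://www.cnblogs.com/sitehome/p/" + str(i))
        r.encoding = r.apparent_encoding
        html_list.append(BeautifulSoup(r.text, "html.parser"))
    return html_list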
2. Extract the blog information
The next step is to pull the information out of each page: inspect the page source to see where each field sits, parse the source with BeautifulSoup, and extract the fields we want (a small self-contained demo of this navigation follows the function below):
def get_text(html):  # store each blog's info in a dictionary, then collect all dictionaries in a list
    info = {"Name": "", "Title": "", "Reading volume": "", "Number of comments": ""}
    text_list = html.find_all("div", class_=re.compile('post_item'))
    for i in range(len(text_list)):
        try:
            text1 = text_list[i]
            info["Title"] = text1.h3.a.string
            info["Name"] = text1.div.a.string
            info["Reading volume"] = text1.div.contents[4].a.string[3:-1]
            info["Number of comments"] = text1.div.span.a.string[13:-1]
            need_list.append(info.copy())  # for why copy() is needed, see my previous post
        except AttributeError:
            continue
    return need_list
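To make the attribute navigation above concrete, here is a self-contained demo on schematic HTML reconstructed from those selectors (not the real cnblogs markup, which may differ):

from bs4 import BeautifulSoup
import re

sample = """
<div class="post_item">
  <h3><a>Some title</a></h3>
  <div><a>Some author</a><span><a>comments(12)</a></span></div>
</div>
"""
soup = BeautifulSoup(sample, "html.parser")
item = soup.find("div", class_=re.compile("post_item"))
print(item.h3.a.string)        # Some title   (first <a> under <h3>)
print(item.div.a.string)       # Some author  (first <a> under the first inner <div>)
print(item.div.span.a.string)  # comments(12) (the string slicing then trims it to the number)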
All of the blog dictionaries end up collected in need_list.
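As the comment in the code says, copy() matters here: list.append() stores a reference, so re-appending the same dictionary would leave every list entry pointing at one object. A quick demonstration:

d = {"Title": ""}

aliased = []
d["Title"] = "first"
aliased.append(d)        # stores a reference to d, not a snapshot
d["Title"] = "second"
aliased.append(d)
print(aliased)           # [{'Title': 'second'}, {'Title': 'second'}]

copied = []
d["Title"] = "first"
copied.append(d.copy())  # stores an independent shallow copy
d["Title"] = "second"
copied.append(d.copy())
print(copied)            # [{'Title': 'first'}, {'Title': 'second'}]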
3. Write the crawled information to Excel
I use the XlsxWriter library for this:
def write_xlsx(need_list):  # write the crawled info into an Excel sheet
    workbook = xlsxwriter.Workbook('excel.xlsx')
    worksheet = workbook.add_worksheet('Sheet1')
    for i in range(len(need_list)):  # A1-style rows start at 1, so use i + 1
        worksheet.write('A' + str(i + 1), need_list[i]["Title"])
        worksheet.write('B' + str(i + 1), need_list[i]["Name"])
        worksheet.write('C' + str(i + 1), need_list[i]["Reading volume"])
        worksheet.write('D' + str(i + 1), need_list[i]["Number of comments"])
        print("yes")  # simple progress indicator
    workbook.close()
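As a side note, worksheet.write() also accepts zero-based (row, col) indices, which avoids assembling A1-style strings in the loop; an equivalent sketch:

import xlsxwriter

def write_xlsx_rc(need_list):  # same output, using (row, col) indexing
    workbook = xlsxwriter.Workbook('excel.xlsx')
    worksheet = workbook.add_worksheet('Sheet1')
    for row, item in enumerate(need_list):
        worksheet.write(row, 0, item["Title"])
        worksheet.write(row, 1, item["Name"])
        worksheet.write(row, 2, item["Reading volume"])
        worksheet.write(row, 3, item["Number of comments"])
    workbook.close()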
That’s it!
Finally, here is the complete source code:
import requests
from bs4 import BeautifulSoup
import re
import xlsxwriter

need_list = []
url = "https://www.cnblogs.com/"

def get_html(url):  # fetch the 200 cnblogs list pages and collect the parsed pages in a list
    html_list = []
    for i in range(1, 201):  # for i in range(1, 2):
        r = requests.get(url + "#p" + str(i))
        r.encoding = r.apparent_encoding
        html_list.append(BeautifulSoup(r.text, "html.parser"))
    return html_list

def get_text(html):  # store each blog's info in a dictionary, then collect all dictionaries in a list
    info = {"Name": "", "Title": "", "Reading volume": "", "Number of comments": ""}
    text_list = html.find_all("div", class_=re.compile('post_item'))
    for i in range(len(text_list)):
        try:
            text1 = text_list[i]
            info["Title"] = text1.h3.a.string
            info["Name"] = text1.div.a.string
            info["Reading volume"] = text1.div.contents[4].a.string[3:-1]
            info["Number of comments"] = text1.div.span.a.string[13:-1]
            need_list.append(info.copy())  # for why copy() is needed, see my previous post
        except AttributeError:
            continue
    return need_list

def get(html_list):  # extract the blog info from all 200 pages
    for i in range(len(html_list)):  # for i in range(1):
        html = html_list[i]
        get_text(html)

def write_xlsx(need_list):  # write the crawled info into an Excel sheet
    workbook = xlsxwriter.Workbook('excel.xlsx')
    worksheet = workbook.add_worksheet('Sheet1')
    for i in range(len(need_list)):  # A1-style rows start at 1, so use i + 1
        worksheet.write('A' + str(i + 1), need_list[i]["Title"])
        worksheet.write('B' + str(i + 1), need_list[i]["Name"])
        worksheet.write('C' + str(i + 1), need_list[i]["Reading volume"])
        worksheet.write('D' + str(i + 1), need_list[i]["Number of comments"])
        print("yes")  # simple progress indicator
    workbook.close()

def main():
    html_list = get_html(url)
    get(html_list)
    write_xlsx(need_list)

main()