1. Open the Blog Garden (cnblogs) home page:

       https://www.cnblogs.com/

Scrolling down, we find that the way to load more blogs is to load the next page.



As we click through the pages, the Blog Garden URL changes in a regular pattern:

      https://www.cnblogs.com/#p3

Clicking the third page gives the URL above: the number after #p is the page number, so we can use it to build the links for the first 200 pages. The code:

url="https://www.cnblogs.com/"

def get_html(url) :         Get the 200 main page links of the blog garden and crawl them into the list
    html_list=[]
    for i in range(1.201) :# for I in range (1, 2) :

        r=requests.get(url+"/#p"+str(i))

        r.encoding=r.apparent_encoding
        html_list.append(BeautifulSoup(r.text,"html.parser"))
    return html_list
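Before fetching all 200 pages in one go, it is worth sanity-checking a single request; a minimal sketch (these printed checks are an addition, not part of the script above):

import requests

url = "https://www.cnblogs.com/"
r = requests.get(url + "#p1")
print(r.status_code)        # expect 200 on success
print(r.apparent_encoding)  # the encoding guessed from the response body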

2. Extract blog information

The next step is to extract the information for each blog. Look at the page source to find where each field lives, then use BeautifulSoup to parse it and pull out exactly what we want:

def get_text(html):
    # Store one blog's information in a dictionary, then collect all the
    # dictionaries in need_list.
    info = {"Name": "", "Title": "", "Reading volume": "", "Number of comments": ""}
    # the regex also matches classes like "post_item_body", so the try/except
    # below skips any matched div that lacks the expected children
    text_list = html.find_all("div", class_=re.compile('post_item'))
    for i in range(len(text_list)):
        try:
            text1 = text_list[i]
            info["Title"] = text1.h3.a.string
            info["Name"] = text1.div.a.string
            info["Reading volume"] = text1.div.contents[4].a.string[3:-1]  # slice off the label around the count
            info["Number of comments"] = text1.div.span.a.string[13:-1]    # slice off the label around the count
            need_list.append(info.copy())  # why copy()? see my previous blog post
        except AttributeError:
            continue
    return need_list
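To make the selector chain above concrete, here is a toy example on a hypothetical HTML fragment shaped like one list entry (the real cnblogs markup is richer, so treat this purely as an illustration):

import re
from bs4 import BeautifulSoup

# hypothetical fragment; not the real cnblogs markup
html = """
<div class="post_item">
  <h3><a href="#">Sample title</a></h3>
  <div class="post_item_foot">
    <a href="#">Sample author</a>
  </div>
</div>
"""
soup = BeautifulSoup(html, "html.parser")
for item in soup.find_all("div", class_=re.compile("post_item")):
    if item.h3:                   # the inner "post_item_foot" div matches the regex too
        print(item.h3.a.string)   # -> Sample title
        print(item.div.a.string)  # -> Sample author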

Each blog's dictionary is appended to need_list, so one list ends up holding the information for every blog.
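As for the copy() call in get_text: append(info) would store a reference to the same dictionary, so every entry in need_list would end up holding the values of the last blog processed. A minimal sketch of the difference:

records = []
d = {"Title": ""}

d["Title"] = "first post"
records.append(d)          # appends a reference, not a snapshot
d["Title"] = "second post"
print(records)             # [{'Title': 'second post'}] - the first value is gone

records.clear()
d["Title"] = "first post"
records.append(d.copy())   # a shallow copy freezes the current values
d["Title"] = "second post"
records.append(d.copy())
print(records)             # [{'Title': 'first post'}, {'Title': 'second post'}]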

3. Write the crawled information into Excel

I use the XlsxWriter library for this:

def write_xlsx(need_list):
    # Write the crawled information into an Excel sheet.
    workbook = xlsxwriter.Workbook('excel.xlsx')
    worksheet = workbook.add_worksheet('Sheet1')
    for i in range(len(need_list)):
        row = str(i + 1)  # spreadsheet rows are 1-based
        worksheet.write('A' + row, need_list[i]["Title"])
        worksheet.write('B' + row, need_list[i]["Name"])
        worksheet.write('C' + row, need_list[i]["Reading volume"])
        worksheet.write('D' + row, need_list[i]["Number of comments"])
        print("yes")      # simple progress indicator
    workbook.close()
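If you also want column headers, a bold first row can be written before the data (shift the data rows down by one if you adopt this); a minimal sketch using XlsxWriter's format API:

import xlsxwriter

workbook = xlsxwriter.Workbook('excel.xlsx')
worksheet = workbook.add_worksheet('Sheet1')
bold = workbook.add_format({'bold': True})  # reusable cell format
for col, name in enumerate(["Title", "Name", "Reading volume", "Number of comments"]):
    worksheet.write(0, col, name, bold)     # row 0 -> cells A1 through D1
workbook.close()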

That’s it!

Finally, here is the complete source code:

import requests
from bs4 import BeautifulSoup
import re
import xlsxwriter



need_list = []
url = "https://www.cnblogs.com/"

def get_html(url):
    # Fetch the first 200 list pages of Blog Garden into a list of parsed pages.
    html_list = []
    # for i in range(1, 2):  # use a small range when testing
    for i in range(1, 201):
        # note: everything after '#' is a URL fragment, which is not sent to the server
        r = requests.get(url + "#p" + str(i))
        r.encoding = r.apparent_encoding
        html_list.append(BeautifulSoup(r.text, "html.parser"))
    return html_list

def get_text(html):
    # Store one blog's information in a dictionary, then collect all the
    # dictionaries in need_list.
    info = {"Name": "", "Title": "", "Reading volume": "", "Number of comments": ""}
    # the regex also matches classes like "post_item_body", so the try/except
    # below skips any matched div that lacks the expected children
    text_list = html.find_all("div", class_=re.compile('post_item'))
    for i in range(len(text_list)):
        try:
            text1 = text_list[i]
            info["Title"] = text1.h3.a.string
            info["Name"] = text1.div.a.string
            info["Reading volume"] = text1.div.contents[4].a.string[3:-1]  # slice off the label around the count
            info["Number of comments"] = text1.div.span.a.string[13:-1]    # slice off the label around the count
            need_list.append(info.copy())  # why copy()? see my previous blog post
        except AttributeError:
            continue
    return need_list


def get(html_list):
    # Collect blog info from every fetched page.
    # for i in range(1):  # use a single page when testing
    for i in range(len(html_list)):
        html = html_list[i]
        get_text(html)

def write_xlsx(need_list):
    # Write the crawled information into an Excel sheet.
    workbook = xlsxwriter.Workbook('excel.xlsx')
    worksheet = workbook.add_worksheet('Sheet1')
    for i in range(len(need_list)):
        row = str(i + 1)  # spreadsheet rows are 1-based
        worksheet.write('A' + row, need_list[i]["Title"])
        worksheet.write('B' + row, need_list[i]["Name"])
        worksheet.write('C' + row, need_list[i]["Reading volume"])
        worksheet.write('D' + row, need_list[i]["Number of comments"])
        print("yes")      # simple progress indicator
    workbook.close()
def main():
    html_list = get_html(url)
    get(html_list)
    write_xlsx(need_list)


if __name__ == "__main__":
    main()