In this article, we will use the urllib and re modules to crawl Baidu Tieba and store the scraped data in three file formats.

1. Web page analysis

(1) Preparation

First, we open Baidu Tieba in the Chrome browser and enter a keyword in the search field. The example used here is the “Computer” bar.

(2) Analysis of URL patterns

Next, we analyze the URL pattern of the site so that we can construct the URLs needed to fetch the content of every page.

  • Page 1: tieba.baidu.com/f?kw=Computer&ie…
  • Page 2: tieba.baidu.com/f?kw=Computer&ie…
  • Page 3: tieba.baidu.com/f?kw=Computer&ie…
  • …

Looking at these URLs, it is not hard to see that they follow a clear pattern; the main request parameters are as follows:

  • kw: the search keyword, URL-encoded; this can be done with the urllib.parse.quote() method
  • ie: the character encoding, here utf-8
  • pn: the offset of the current page, which increases in steps of 50

So the full URL can be generalized as follows: tieba.baidu.com/f?kw={keywo…
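For example, here is a short sketch of how these URLs can be built in code (the keyword below is only an illustration; any Tieba name works):

import urllib.parse

keyword = 'Computer'                  # example keyword
kw = urllib.parse.quote(keyword)      # URL-encode the keyword
for page in range(3):                 # first three pages
    pn = page * 50                    # pn increases in steps of 50
    url = 'http://tieba.baidu.com/f?kw=' + kw + '&ie=utf-8&pn=' + str(pn)
    print(url)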

The core code is as follows:

import urllib.request
import urllib.parse
# Get the source code of the web page
def get_page(url):
    # Construct the request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    # Construct the request object
    req = urllib.request.Request(url=url, headers=headers)
    # Send the request and get the response
    response = urllib.request.urlopen(req)
    # Read and decode the page source code
    html = response.read().decode('utf-8')
    # Return the page source code
    return html
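In practice it can also help to give urlopen a timeout and to catch network errors, so that one failed request does not abort the whole crawl. A minimal variation (the name get_page_safe and the timeout value are illustrative only, not part of the original code):

import urllib.error
import urllib.request

def get_page_safe(url, timeout=10):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    req = urllib.request.Request(url=url, headers=headers)
    try:
        # timeout guards against a connection that hangs
        response = urllib.request.urlopen(req, timeout=timeout)
        return response.read().decode('utf-8')
    except urllib.error.URLError as e:
        # return an empty page on failure so the caller can simply skip it
        print('request failed:', e)
        return ''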

(3) Analysis of the content pattern

Next, we use the shortcut Ctrl+U to open the page source and carefully analyze the data we need to capture on each page.

It is easy to see that each post is contained in a <li> tag, so we can match the fields we need with regular expressions:

  • Topic name: r'href="/p/\d+" title="(.+?)"'
  • Topic author: r'title="(.+?)"'
  • Link address: r'href="/p/(\d+)"'
  • Reply count: r'title="reply">(\d+)<'
  • Creation time: r'title="create time">(.+?)<'
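To see how these patterns behave, here is a tiny sketch run against a made-up fragment of the list markup (the HTML below is illustrative only, not copied from the real page):

import re

# Illustrative HTML fragment, not the real Tieba markup
sample = '<li><span title="reply">12</span><a href="/p/123456" title="Example post"></a></li>'

print(re.findall(r'href="/p/\d+" title="(.+?)"', sample))  # ['Example post']
print(re.findall(r'href="/p/(\d+)"', sample))              # ['123456']
print(re.findall(r'title="reply">(\d+)<', sample))         # ['12']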

The core code is as follows:

import re
# Parse web source code, extract data
def parse_page(html):
    # Topic names
    titles = re.findall(r'href="/p/\d+" title="(.+?)"', html)
    # Topic authors
    authors = re.findall(r'title="(.+?)"', html)
    # Link addresses
    nums = re.findall(r'href="/p/(\d+)"', html)
    links = ['http://tieba.baidu.com/p/' + str(num) for num in nums]
    # Reply counts
    focus = re.findall(r'title="reply">(\d+)<', html)
    # Creation times
    ctimes = re.findall(r'title="create time">(.+?)<', html)
    # Zip the fields together
    data = zip(titles, authors, links, focus, ctimes)
    # Return the result
    return data
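Note that in Python 3 zip() returns an iterator, so the result of parse_page() can only be traversed once; if you want to inspect it and also save it, materialize it first. A quick illustration (the html variable is assumed to hold a page fetched with get_page):

data = list(parse_page(html))   # materialize so the result can be reused
for title, author, link, focus, ctime in data:
    print(title, link)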

(4) Save data

Finally, we save the data as TXT, JSON, and CSV files.

import json
import csv
# Open file
def openfile(fm, fileName):
    fd = None
    if fm == 'txt':
        fd = open(fileName + '.txt', 'w', encoding='utf-8')
    elif fm == 'json':
        fd = open(fileName + '.json', 'w', encoding='utf-8')
    elif fm == 'csv':
        fd = open(fileName + '.csv', 'w', encoding='utf-8', newline='')
    return fd

# Save the data to the file
def save2file(fm, fd, data):
    if fm == 'txt':
        for item in data:
            fd.write('----------------------------------------\n')
            fd.write('title:' + str(item[0]) + '\n')
            fd.write('author:' + str(item[1]) + '\n')
            fd.write('link:' + str(item[2]) + '\n')
            fd.write('focus:' + str(item[3]) + '\n')
            fd.write('ctime:' + str(item[4]) + '\n')
    if fm == 'json':
        temp = ('title', 'author', 'link', 'focus', 'ctime')
        for item in data:
            json.dump(dict(zip(temp, item)), fd, ensure_ascii=False)
    if fm == 'csv':
        writer = csv.writer(fd)
        for item in data:
            writer.writerow(item)
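Putting the pieces together, here is a minimal sketch of fetching and saving a single page of results (the keyword is only an example):

import urllib.parse

keyword = 'Computer'   # example keyword
url = 'http://tieba.baidu.com/f?kw=' + urllib.parse.quote(keyword) + '&ie=utf-8&pn=0'
data = parse_page(get_page(url))

fd = openfile('csv', keyword)   # or 'txt' / 'json'
save2file('csv', fd, data)
fd.close()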

2. Coding implementation

The complete code is below; it is quite simple, at less than 100 lines.

import urllib.request
import urllib.parse
import re
import json
import csv
import time
import random

# Get the source code of the web page
def get_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    req = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(req)
    html = response.read().decode('utf-8')
    return html

# Parse web source code, extract data
def parse_page(html):
    titles = re.findall(r'href="/p/\d+" title="(.+?)"', html)
    authors = re.findall(r'title="(.+?)"', html)
    nums = re.findall(r'href="/p/(\d+)"', html)
    links = ['http://tieba.baidu.com/p/' + str(num) for num in nums]
    focus = re.findall(r'title="reply">(\d+)<', html)
    ctimes = re.findall(r'title="create time">(.+?)<', html)
    data = zip(titles, authors, links, focus, ctimes)
    return data

# Open file
def openfile(fm, fileName):
    if fm == 'txt':
        return open(fileName + '.txt', 'w', encoding='utf-8')
    elif fm == 'json':
        return open(fileName + '.json', 'w', encoding='utf-8')
    elif fm == 'csv':
        return open(fileName + '.csv', 'w', encoding='utf-8', newline='')
    else:
        return None

# Save the data to the file
def save2file(fm, fd, data):
    if fm == 'txt':
        for item in data:
            fd.write('----------------------------------------\n')
            fd.write('title:' + str(item[0]) + '\n')
            fd.write('author:' + str(item[1]) + '\n')
            fd.write('link:' + str(item[2]) + '\n')
            fd.write('focus:' + str(item[3]) + '\n')
            fd.write('ctime:' + str(item[4]) + '\n')
    if fm == 'json':
        temp = ('title', 'author', 'link', 'focus', 'ctime')
        for item in data:
            json.dump(dict(zip(temp, item)), fd, ensure_ascii=False)
    if fm == 'csv':
        writer = csv.writer(fd)
        for item in data:
            writer.writerow(item)

# Start crawling the page
def crawl():
    kw = input('Please enter the Tieba name to crawl: ')
    base_url = 'http://tieba.baidu.com/f?kw=' + urllib.parse.quote(kw) + '&ie=utf-8&pn={page}'
    fm = input('Please enter the file format to save (txt, json, csv): ')
    while fm != 'txt' and fm != 'json' and fm != 'csv':
        fm = input('Invalid input, please re-enter the file format (txt, json, csv): ')
    fd = openfile(fm, kw)
    page = 0
    # Total number of threads on the board (taken from the first page of the listing),
    # used as the upper bound for the pn offset
    total_page = int(re.findall(r'共有主题数<span class="red_text">(\d+)</span>', get_page(base_url.format(page=str(0))))[0])
    print('Start crawling')
    while page < total_page:
        print('Crawling page', int(page/50+1), '.......')
        html = get_page(base_url.format(page=str(page)))
        data = parse_page(html)
        save2file(fm,fd,data)
        page += 50
        time.sleep(random.random())
    fd.close()
    print('End of crawl')

if __name__ == '__main__':
    crawl()
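One small caveat about the JSON output: save2file calls json.dump once per record, so the resulting file is a series of JSON objects written back to back rather than a single valid JSON document. If you prefer one object per line (the JSON Lines style, which is easy to read back with json.loads), a possible variant of the JSON branch looks like this (the function name save2jsonl is only illustrative):

import json

def save2jsonl(fd, data):
    # Write one JSON object per line ("JSON Lines")
    keys = ('title', 'author', 'link', 'focus', 'ctime')
    for item in data:
        fd.write(json.dumps(dict(zip(keys, item)), ensure_ascii=False) + '\n')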