GitHub address: github.com/pasca520/Py…

Preface

These crawler exercises come from my daily practice and are organized here for learning.

The projects are learning-oriented: they deliberately use as many different modules as possible for practice, rather than aiming for the optimal solution.

Crawler overview

Python libraries used in this example
Crawling module: requests
Parsing module: BeautifulSoup
Storage type: list (easy to save to a database)

Parsing

Code sample

# -*- coding: utf-8 -*-

import requests
from requests.exceptions import ReadTimeout, ConnectionError, RequestException
from bs4 import BeautifulSoup


# crawler body
def get_page(url):
    headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Referer': 'https://maoyan.com/board',
    }

    try:
        response = requests.get(url=url, headers=headers, timeout=10).text  # a timeout is needed for ReadTimeout to actually be raised
        return response
    except ReadTimeout:  # access timeout error
        print('Timeout')
    except ConnectionError:  # network disconnection error
        print('Connect error')
    except RequestException:  # superclass error
        print('Error')


# Parse web pages
def parse_page(html):
    soup = BeautifulSoup(html, 'lxml')
    grid = soup.find(name="ol", attrs={"class": "grid_view"})
    movie_list = grid.find_all("li")
    for movie in movie_list:
        rank = movie.find(name="em").getText()
        name = movie.find(name="span", attrs={"class": "title"}).getText()
        rating_num = movie.find(name="span", attrs={"class": "rating_num"}).getText()
        # Hair-raising string decomposition: since this exercise does not use re, native string handling
        # is a hassle. strip() removes surrounding whitespace and replace() turns the separators into
        # newlines, so that director, lead actor, release date, country and genre land in separate items.
        bd = movie.find(name="p").getText().strip().replace(' ', '\n').replace('... \n ', '... \n').replace('/', '\n').split('\n')

        # Some Douban entries are missing the lead actor... so, to keep it simple (if ugly),
        # pad the list with a placeholder so the later indexing stays aligned
        if len(bd) == 4:
            bd.insert(1, 'Not crawled')
        inq = movie.find(name="span", attrs={"class": "inq"})
        # handle the case where the inq span is missing
        if not inq:
            inq = 'None'
        else:
            inq = inq.getText()

        # Store the fields in a dictionary, which is convenient for saving to a database
        douBanDict['rank'] = rank
        douBanDict['name'] = name
        douBanDict['director'] = bd[0]
        douBanDict['actor'] = bd[1]
        douBanDict['release_time'] = bd[2].strip()  # some entries have extra spaces; strip() removes them
        douBanDict['country'] = bd[3]
        douBanDict['movie_types'] = bd[4]
        douBanDict['rating_num'] = rating_num
        douBanDict['inq'] = inq
        douBanList.append(str(douBanDict))  # convert the dict to a string before appending; otherwise every item would reference the same dict and keep changing
    return douBanList

if __name__ == '__main__':
    douBanList = []
    douBanDict = {}
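    # Douban Top 250 shows 25 movies per page, so start takes the values 0, 25, ..., 225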
    for start in range(0, 250, 25):
        url = 'https://movie.douban.com/top250?start={}&filter='.format(start)
        html = get_page(url)
        if html:  # skip pages that failed to download
            douBanList = parse_page(html)
    print(douBanList)


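The string decomposition of the movie's <p> tag is the part the comments above complain about; a regular expression can pull the same fields out more directly. Below is a rough sketch under my own assumptions about the typical layout of that text ("导演: ... 主演: ... <year> / <country> / <genres>"); the pattern and the parse_bd helper are not part of the original exercise.

import re

# rough pattern for the <p> text; the "主演" (lead actor) part is optional because some entries omit it
BD_PATTERN = re.compile(
    r'导演:\s*(?P<director>.+?)\s+(?:主演:\s*(?P<actor>.+?)\s+)?'
    r'(?P<release_time>\d{4}[^/]*)\s*/\s*(?P<country>[^/]+?)\s*/\s*(?P<movie_types>.+)')


def parse_bd(text):
    """Return the same five fields as the string-splitting version, or None if the layout differs."""
    match = BD_PATTERN.search(text)
    if not match:
        return None
    # a missing lead actor simply becomes the placeholder instead of needing an insert()
    return {key: (value or 'Not crawled').strip() for key, value in match.groupdict().items()}

With such a helper, the bd indexing in parse_page could be replaced by fields = parse_bd(movie.find(name="p").getText()) and lookups such as fields['director'].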

Data storage

Each movie's information is stored as a dictionary, and all the dictionaries are collected in a list.
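Because each entry is converted to a string with str() before being appended, it has to be parsed back into a dictionary before it can be written to a database. A minimal sketch of saving the result to SQLite follows; the save_to_sqlite helper, the table name and the column layout are my own assumptions, not part of the exercise.

import ast
import sqlite3


def save_to_sqlite(douBanList, db_path='douban_top250.db'):
    # turn the stringified dicts back into real dictionaries
    movies = [ast.literal_eval(item) for item in douBanList]

    conn = sqlite3.connect(db_path)
    conn.execute('''CREATE TABLE IF NOT EXISTS top250 (
        movie_rank TEXT, name TEXT, director TEXT, actor TEXT,
        release_time TEXT, country TEXT, movie_types TEXT,
        rating_num TEXT, inq TEXT)''')
    # named placeholders are looked up by key in each dictionary
    conn.executemany(
        'INSERT INTO top250 VALUES (:rank, :name, :director, :actor, '
        ':release_time, :country, :movie_types, :rating_num, :inq)',
        movies)
    conn.commit()
    conn.close()

save_to_sqlite(douBanList) could then be called right after the print() in the __main__ block.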

Done!