Github address: github.com/pasca520/Py…
Preface
This is a collection of crawler exercises from my daily practice, organized so that they can be used for learning.
The crawler projects are learning-oriented: they use as many different modules as possible for practice rather than aiming for the optimal solution.
Crawler profile
Component | Python library |
---|---|
Crawl module | requests |
Parsing module | BeautifulSoup |
Storage type | List (easy to store in a database) |
parsing
Code sample
# -*- coding: utf-8 -*-
import requests
from requests.exceptions import ReadTimeout, ConnectionError, RequestException
from bs4 import BeautifulSoup
# crawler body
def get_page(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout the request can block forever and ``ReadTimeout``
            would never be raised.

    Returns:
        The decoded response body, or ``None`` when the request fails.
        On failure a short message is printed instead of raising.
    """
    headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        # NOTE(review): the Referer names maoyan.com while the script entry
        # point requests movie.douban.com -- confirm this is intentional.
        'Referer': 'https://maoyan.com/board',
    }
    try:
        response = requests.get(url=url, headers=headers, timeout=timeout)
        # Treat HTTP error statuses as failures rather than returning an
        # error page as if it were the requested content.
        response.raise_for_status()
    except ReadTimeout:  # the server did not answer in time
        print('Timeout')
    except ConnectionError:  # the network connection failed
        print('Connect error')
    except RequestException:  # any other requests-level failure
        print('Error')
    else:
        return response.text
    return None
# Parse web pages
def parse_page(html):
    """Extract the movie entries from one listing page.

    Every ``<li>`` inside the ``<ol class="grid_view">`` element becomes a
    dictionary with the keys ``rank``, ``name``, ``director``, ``actor``,
    ``release_time``, ``country``, ``movie_types``, ``rating_num`` and
    ``inq``. The dictionary is stored as its ``str()`` form.

    Args:
        html: Page markup as text. ``None`` or an empty string is accepted
            and adds nothing.

    Returns:
        The module-level ``douBanList`` (created on first use if it does
        not exist yet) with this page's entries appended, so results
        accumulate over successive calls.
    """
    # Reuse the shared result list when the script has defined it; create
    # it otherwise so the function also works when imported.
    results = globals().setdefault('douBanList', [])
    if not html:
        return results

    soup = BeautifulSoup(html, 'lxml')
    grid = soup.find(name="ol", attrs={"class": "grid_view"})
    if grid is None:
        # The expected list container is absent; nothing to extract.
        return results

    for movie in grid.find_all("li"):
        rank = movie.find(name="em").getText()
        name = movie.find(name="span", attrs={"class": "title"}).getText()
        rating_num = movie.find(name="span", attrs={"class": "rating_num"}).getText()

        # Break the description paragraph into separate fields: director,
        # actor, release time, country and movie types.
        # NOTE(review): as written, the first replace turns every ordinary
        # space into a newline, so the second replace can never match. The
        # separators may originally have been non-breaking spaces lost in
        # copy/paste -- confirm against the live page markup.
        bd = (
            movie.find(name="p").getText().strip()
            .replace(' ', '\n')
            .replace('... \n ', '... \n')
            .replace('/', '\n')
            .split('\n')
        )
        # Some entries do not list an actor; insert a placeholder so the
        # remaining fields keep their positions.
        if len(bd) == 4:
            bd.insert(1, 'Did not climb')

        # The one-line quote is optional.
        inq_tag = movie.find(name="span", attrs={"class": "inq"})
        inq = "No." if not inq_tag else inq_tag.getText()

        # A fresh dictionary per movie, so earlier entries cannot be
        # overwritten through a shared object.
        record = {
            'rank': rank,
            'name': name,
            'director': bd[0],
            'actor': bd[1],
            'release_time': bd[2].strip(),  # some values carry stray spaces
            'country': bd[3],
            'movie_types': bd[4],
            'rating_num': rating_num,
            'inq': inq,
        }
        results.append(str(record))
    return results
if __name__ == '__main__':
    # Module-level containers shared with parse_page(): douBanList collects
    # the results, douBanDict may be used by parse_page() as scratch storage.
    douBanList = []
    douBanDict = {}

    total_movies = 250  # size of the ranking being crawled
    page_size = 25      # step of the ``start`` query parameter per page

    for start in range(0, total_movies, page_size):
        url = 'https://movie.douban.com/top250?start={}&filter='.format(start)
        html = get_page(url)
        if not html:
            # The download failed or returned nothing; skip this page
            # instead of handing an unusable value to the parser.
            continue
        douBanList = parse_page(html)

    # Show everything that was collected once all pages are processed.
    print(douBanList)
Copy the code
Data storage
The information for each movie is collected in a dictionary, which is converted to a string and appended to a list.
Done!