In this article, we will use Requests and XPath to crawl the Douban Movie Top250.
1. Web page analysis
(1) Analyzing the URL pattern
First, open the Douban Movie Top250 in the Chrome browser; it is easy to tell that the site is a static page.
Then we analyze the URL pattern of the site, so that we can fetch the content of every page by constructing the URLs ourselves.
Home page: movie.douban.com/top250
Page 2: movie.douban.com/top250?start=25&filter=
Page 3: movie.douban.com/top250?start=50&filter=
…
It is not hard to see that the URL generalizes to movie.douban.com/top250?start={page}&filter=, where the start parameter is the offset of the first entry on that page, i.e. 25 × (page number − 1).
Finally, we need to verify that the home page also fits this rule. It is easy to check that movie.douban.com/top250?start=0&filter= does return the home page.
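To double-check, we can generate all ten page URLs from this pattern with a few lines of Python (a minimal sketch; the pattern is the one derived above):

# Generate the ten Top250 page URLs from the pattern above
base = 'https://movie.douban.com/top250?start={page}&filter='
urls = [base.format(page=offset) for offset in range(0, 250, 25)]
for u in urls:
    print(u)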
The core code is as follows:
import requests

# Get the source code of a web page
def get_page(url):
    # Construct the request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    # Send the request and get the response
    response = requests.get(url=url, headers=headers)
    # Get the source code of the web page
    html = response.text
    # Return the page source code
    return html
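Before going further, a quick sanity check (a minimal sketch that reuses the get_page function above; the printed length and prefix are just rough signs that the download worked):

# Fetch the home page and peek at the result
html = get_page('https://movie.douban.com/top250')
print(len(html))    # a non-trivial length suggests the request succeeded
print(html[:100])   # the first characters should look like HTML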
(2) Analyzing the content pattern
Next we analyze the content of each page and work out how to extract the data we need from it.
Use the shortcut Ctrl+Shift+I to open the developer tools and select the Elements tab to inspect the page source.
The data to extract, each with an XPath expression that matches it (a quick interactive check follows the list):
- Detail link:
html.xpath('//div[@class="hd"]/a/@href')
- Movie title:
html.xpath('//div[@class="hd"]/a/span[1]/text()')
- Director/starring actors and release year/country/genre:
html.xpath('//div[@class="bd"]/p[1]//text()')
- Douban rating:
html.xpath('//div[@class="bd"]/div/span[2]/text()')
- Number of raters:
html.xpath('//div[@class="bd"]/div/span[4]/text()')
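Each expression should match exactly 25 items per page. Here is a minimal interactive check, assuming the get_page function defined above:

from lxml import etree

# Parse the home page and count the matches for one expression
html_elem = etree.HTML(get_page('https://movie.douban.com/top250'))
titles = html_elem.xpath('//div[@class="hd"]/a/span[1]/text()')
print(len(titles))  # expected: 25 per page
print(titles[:3])   # the first few movie titles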
The core code is as follows:
from lxml import etree

# Parse the web page source code
def parse_page(html):
    # Construct the _Element object
    html_elem = etree.HTML(html)
    # Detail links
    links = html_elem.xpath('//div[@class="hd"]/a/@href')
    # Movie titles
    titles = html_elem.xpath('//div[@class="hd"]/a/span[1]/text()')
    # Movie info (director/starring actors, release year/country/genre)
    infos = html_elem.xpath('//div[@class="bd"]/p[1]//text()')
    roles = [j for i, j in enumerate(infos) if i % 2 == 0]
    descriptions = [j for i, j in enumerate(infos) if i % 2 != 0]
    # Douban rating
    stars = html_elem.xpath('//div[@class="bd"]/div/span[2]/text()')
    # Number of raters
    comments = html_elem.xpath('//div[@class="bd"]/div/span[4]/text()')
    # Assemble the results
    data = zip(links, titles, roles, descriptions, stars, comments)
    # Return the results
    return data
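Why the parity filter? The p[1] node holds two text nodes per movie, the director/cast line and the year/country/genre line, so infos alternates between the two kinds. A tiny illustration with a hypothetical list:

# Hypothetical infos list mimicking the alternating text nodes
infos = ['Director A / Starring B', '1994 / USA / Drama',
         'Director C / Starring D', '1993 / China / Romance']
roles = [j for i, j in enumerate(infos) if i % 2 == 0]         # even indices
descriptions = [j for i, j in enumerate(infos) if i % 2 != 0]  # odd indices
print(roles)         # ['Director A / Starring B', 'Director C / Starring D']
print(descriptions)  # ['1994 / USA / Drama', '1993 / China / Romance']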
(3) Saving the data
We save the data as a TXT, JSON, or CSV file, depending on the user's choice.
import json
import csv

# Open a file in the chosen format
def openfile(fm):
    fd = None
    if fm == 'txt':
        fd = open('douban.txt', 'w', encoding='utf-8')
    elif fm == 'json':
        fd = open('douban.json', 'w', encoding='utf-8')
    elif fm == 'csv':
        # newline='' avoids blank rows when the csv module writes on Windows
        fd = open('douban.csv', 'w', encoding='utf-8', newline='')
    return fd

# Save the data to a file
def save2file(fm, fd, data):
    if fm == 'txt':
        for item in data:
            fd.write('----------------------------------------\n')
            fd.write('link: ' + str(item[0]) + '\n')
            fd.write('title: ' + str(item[1]) + '\n')
            fd.write('role: ' + str(item[2]) + '\n')
            fd.write('description: ' + str(item[3]) + '\n')
            fd.write('star: ' + str(item[4]) + '\n')
            fd.write('comment: ' + str(item[5]) + '\n')
    if fm == 'json':
        temp = ('link', 'title', 'role', 'description', 'star', 'comment')
        for item in data:
            json.dump(dict(zip(temp, item)), fd, ensure_ascii=False)
            fd.write('\n')  # one JSON object per line (JSON Lines)
    if fm == 'csv':
        writer = csv.writer(fd)
        for item in data:
            writer.writerow(item)
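The three pieces fit together like this (a minimal sketch that crawls a single page and saves it as CSV, reusing the functions above):

# Crawl one page and save it as CSV
html = get_page('https://movie.douban.com/top250')
data = parse_page(html)
fd = openfile('csv')
save2file('csv', fd, data)
fd.close()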
2. Code implementation
Here is the complete code; the whole crawler takes only a few dozen lines.
import requests
from lxml import etree
import json
import csv
import time
import random

# Get the source code of a web page
def get_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    response = requests.get(url=url, headers=headers)
    html = response.text
    return html

# Parse the web page source code
def parse_page(html):
    html_elem = etree.HTML(html)
    links = html_elem.xpath('//div[@class="hd"]/a/@href')
    titles = html_elem.xpath('//div[@class="hd"]/a/span[1]/text()')
    infos = html_elem.xpath('//div[@class="bd"]/p[1]//text()')
    roles = [j.strip() for i, j in enumerate(infos) if i % 2 == 0]
    descriptions = [j.strip() for i, j in enumerate(infos) if i % 2 != 0]
    stars = html_elem.xpath('//div[@class="bd"]/div/span[2]/text()')
    comments = html_elem.xpath('//div[@class="bd"]/div/span[4]/text()')
    data = zip(links, titles, roles, descriptions, stars, comments)
    return data

# Open a file in the chosen format
def openfile(fm):
    fd = None
    if fm == 'txt':
        fd = open('douban.txt', 'w', encoding='utf-8')
    elif fm == 'json':
        fd = open('douban.json', 'w', encoding='utf-8')
    elif fm == 'csv':
        fd = open('douban.csv', 'w', encoding='utf-8', newline='')
    return fd

# Save the data to a file
def save2file(fm, fd, data):
    if fm == 'txt':
        for item in data:
            fd.write('----------------------------------------\n')
            fd.write('link: ' + str(item[0]) + '\n')
            fd.write('title: ' + str(item[1]) + '\n')
            fd.write('role: ' + str(item[2]) + '\n')
            fd.write('description: ' + str(item[3]) + '\n')
            fd.write('star: ' + str(item[4]) + '\n')
            fd.write('comment: ' + str(item[5]) + '\n')
    if fm == 'json':
        temp = ('link', 'title', 'role', 'description', 'star', 'comment')
        for item in data:
            json.dump(dict(zip(temp, item)), fd, ensure_ascii=False)
            fd.write('\n')  # one JSON object per line
    if fm == 'csv':
        writer = csv.writer(fd)
        for item in data:
            writer.writerow(item)

# Start crawling
def crawl():
    url = 'https://movie.douban.com/top250?start={page}&filter='
    fm = input('Please enter the file format to save (txt, json, csv): ')
    while fm != 'txt' and fm != 'json' and fm != 'csv':
        fm = input('Invalid input, please re-enter the file format (txt, json, csv): ')
    fd = openfile(fm)
    print('Crawling started')
    for page in range(0, 250, 25):
        print('Crawling entries ' + str(page+1) + ' to ' + str(page+25) + '......')
        html = get_page(url.format(page=str(page)))
        data = parse_page(html)
        save2file(fm, fd, data)
        # Sleep for a random interval to be polite to the server
        time.sleep(random.random())
    fd.close()
    print('Crawling finished')

if __name__ == '__main__':
    crawl()
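If you picked the json format, save2file writes one JSON object per line (JSON Lines), so the file can be read back line by line. A minimal sketch:

import json

# Read back the JSON Lines file written by save2file
with open('douban.json', encoding='utf-8') as fd:
    records = [json.loads(line) for line in fd if line.strip()]
print(len(records))         # expected: 250
print(records[0]['title'])  # title of the top-ranked movie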
Supplement (2020.06.25)
Recently, for a project, I also crawled Douban Books Top250. The idea is very similar to the Douban movie crawler, so I am posting the code here for reference as well.
import requests
from lxml import etree
from string import Template
import time
import random
import json
import re

# Get the source code of a web page
def get_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    response = requests.get(url=url, headers=headers)
    html = response.text
    return html

# Parse the web page source code
def parse_page(html):
    html_elem = etree.HTML(html)
    # Cover image
    imgUrl = html_elem.xpath('//tr[@class="item"]/td[1]/a/img/@src')
    # Title
    name = html_elem.xpath('//tr[@class="item"]/td[2]/div[@class="pl2"]/a/@title')
    # Introduction
    info = html_elem.xpath('//tr[@class="item"]/td[2]/p[@class="pl"]/text()')
    # Rating
    star = html_elem.xpath('//div[contains(@class, "star")]/span[@class="rating_nums"]/text()')
    # Number of raters
    comments = html_elem.xpath('//div[contains(@class, "star")]/span[@class="pl"]/text()')
    # One-sentence quote
    quotes = html_elem.xpath('//span[@class="inq"]/text()')
    # Split the introduction into author, translator, price, publication date, and publisher
    author, translator, price, pubdate, press = [], [], [], [], []
    for item in info:
        splits = item.split('/')
        splits = [item.strip() for item in splits]
        # Special case: the price field itself contains a '/'
        # (the literal strings below stand in for the page's original Chinese text)
        if item == '[UK] Conan Doyle / Ding Zhonghua et al / Masses Publishing House / 1981-8 / 53.00 yuan/68.00 yuan':
            author.append(splits[0])
            translator.append(splits[1])
            press.append(splits[2])
            pubdate.append(splits[3])
            price.append(splits[4])
        # Special case: an extra series field appears before the date
        elif item == 'S.A. Alexievich / Fang Zufang / Huacheng Publishing House / Iron Gourd Books / 2014-6-15 / 34.80 yuan':
            author.append(splits[0])
            translator.append(splits[1])
            press.append(splits[2])
            pubdate.append(splits[4])
            price.append(splits[5])
        # General case
        else:
            author.append(splits[0])
            translator.append(' ' if len(splits) == 4 else splits[1])
            price.append(splits[-1])
            pubdate.append(splits[-2])
            press.append(splits[-3])
    # Pattern matching integers and floating point numbers
    pattern = r'[+-]?([0-9]*\.?[0-9]+|[0-9]+\.?[0-9]*)([eE][+-]?[0-9]+)?'
    star = [float(re.search(pattern, item).group()) for item in star]
    comments = [int(re.search(pattern, item.strip()).group()) for item in comments]
    price = [float(re.search(pattern, item).group()) for item in price]
    data = list(zip(imgUrl, name, star, comments, quotes, author, translator, price, pubdate, press))
    return data

# Save the data to a JSON file
def save2file(data):
    meta = ('imgUrl', 'name', 'star', 'comments', 'quotes', 'author', 'translator', 'price', 'pubdate', 'press')
    wrapper = [dict(zip(meta, item)) for item in data]
    fd = open('douban.json', 'w', encoding='utf-8')
    json.dump(wrapper, fd, ensure_ascii=False)
    fd.close()

# Start crawling
def crawl():
    # URL template whose $page parameter is substituted per page
    init_url = Template('https://book.douban.com/top250?start=$page')
    # All data
    all_data = []
    # Traverse the pages
    for page in range(0, 250, 25):
        # Current URL
        curr_url = init_url.substitute(page=str(page))
        # Fetch the page
        html = get_page(curr_url)
        # Parse the page
        data = parse_page(html)
        # Collect the data from each page
        all_data.extend(data)
        # Sleep for a random interval
        time.sleep(random.random())
    # Write all data to the file at once
    save2file(all_data)

if __name__ == '__main__':
    crawl()
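The number-matching regular expression is worth a quick demonstration (a minimal sketch; the sample strings are hypothetical but mimic the rating, comment-count, and price fields):

import re

# Pattern matching integers and floats, as used in parse_page above
pattern = r'[+-]?([0-9]*\.?[0-9]+|[0-9]+\.?[0-9]*)([eE][+-]?[0-9]+)?'
print(float(re.search(pattern, '9.6').group()))             # 9.6
print(int(re.search(pattern, '(123456 ratings)').group()))  # 123456 (hypothetical comment-count text)
print(float(re.search(pattern, '53.00 yuan').group()))      # 53.0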
(Figures from the original post: web page analysis; the final result.)