This article is participating in the Python Theme Month event; see the event link for details.
Preparation
Before we start, we need to set up the environment. Since Docker is already installed locally, we directly pull a JupyterHub image, run it locally, and write the code in JupyterHub.

- First, pull the image from the corresponding website.
- Run the image.
- Open it in a browser and log in. Account: `jovyan`, password: `hipaddle`. (The password can be found inside the container at `/etc/jupyter/jupyterhub_config.py`.)
The code
The resources we scrape come from Movie Heaven (dytt8.net). Before writing any code we need to get familiar with the site's pages and inspect their source, because the data here is obtained by parsing the elements in the page source.
Analyze requirements
- First, look at the home page and work out the `url` rule of the movie list pages (see the sketch after this list).
- Then open each movie's detail page and locate the desired elements in the page source.
- Assemble the elements of each movie into one `obj`, then move on to the next movie.
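For the first step, the rule is easy to spot: only the page number in the list URL changes. A minimal sketch, using the URL pattern from the complete code below:

```python
# List-page URL rule for the "latest movies" section: only the page number varies
base_url = 'http://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'

for page in range(2, 4):
    print(base_url.format(page))
# http://www.dytt8.net/html/gndy/dyzz/list_23_2.html
# http://www.dytt8.net/html/gndy/dyzz/list_23_3.html
```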
Write the code
- First, we fix the range of list-page numbers we want to loop over:
for index in range(2, 3):
    # 1. The url of the movie list page
    url = base_url.format(index)
    # 2. Get the detail-page addresses of all movies on the current page
    detail_urls = get_detail_urls(url)
    # 3. Parse the detail page of each movie
    for key, detail_url in enumerate(detail_urls):
        # print('index: ' + str(key) + ', address: ' + detail_url)
        # print('parsing detail page: ' + detail_url)
        film = parse_detail_page(detail_url)
        films.append(film)
    # 4. Sleep 1 second after crawling each page
    time.sleep(1)
- Extract the desired resources from the detail page:

# [Data - movie title]
title = html_element.xpath('//div[@class="title_all"]//font[@color="#07519a"]/text()')[0]
# Get the Zoom tag
zoom_element = html_element.xpath('//div[@id="Zoom"]')[0]
# [Data - movie cover and screenshots]
imgs = zoom_element.xpath(".//img/@src")
...
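If you have not used lxml's XPath API before, here is a minimal, self-contained sketch of the same extraction pattern; the HTML fragment below is made up for illustration and only mimics the structure of a real detail page:

```python
from lxml import etree

# Hypothetical detail-page fragment, for illustration only
html = '''
<div class="title_all"><font color="#07519a">Some Movie Title</font></div>
<div id="Zoom"><img src="cover.jpg"/><img src="screenshot.jpg"/></div>
'''

html_element = etree.HTML(html)
title = html_element.xpath('//div[@class="title_all"]//font[@color="#07519a"]/text()')[0]
zoom_element = html_element.xpath('//div[@id="Zoom"]')[0]
imgs = zoom_element.xpath('.//img/@src')
print(title, imgs)  # Some Movie Title ['cover.jpg', 'screenshot.jpg']
```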
- Assemble the corresponding information into a dict:

film = {
    'title': title,
    'cover': cover,
    'screen_shot': screen_shot,
    'year': year,
    'country': country,
    'type': type,
    'rating': rating,
    'duration': duration,
    'director': director,
    'actors': actors,
    'desc': desc,
    'download_url': download_url
}
...
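The spider only prints the assembled dicts at the end. If you would rather persist them, here is a minimal sketch; the use of the json module and the films.json filename are my additions, not part of the original code:

```python
import json

def save_films(films, path='films.json'):
    # Dump the list of film dicts to a UTF-8 JSON file, keeping Chinese text readable
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(films, f, ensure_ascii=False, indent=2)
```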
Screenshot of the result
Complete code:
import requests
from lxml import etree
import time

# Home page address
BASE_DOMAIN = 'http://www.dytt8.net'

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
}


def get_detail_urls(url):
    response = requests.get(url, headers=HEADERS)
    html_element = etree.HTML(response.text)
    # [Data - list of detail-page addresses]
    detail_urls = html_element.xpath('//table[@class="tbspan"]//a/@href')
    # Filter out the dirty data caused by [composite movie] entries;
    # iterate over a copy so removing items does not skip elements
    detail_urls_new = detail_urls.copy()
    for index, detail_url in enumerate(detail_urls_new):
        if detail_url == '/html/gndy/jddy/index.html':
            detail_urls.remove(detail_url)
    # Assemble the full detail-page addresses
    detail_urls = map(lambda x: BASE_DOMAIN + x, detail_urls)
    return detail_urls


def parse_detail_page(detail_url):
    response = requests.get(detail_url, headers=HEADERS)
    text = response.content.decode('gb18030')
    html_element = etree.HTML(text)
    # [Data - movie title]
    title = html_element.xpath('//div[@class="title_all"]//font[@color="#07519a"]/text()')[0]
    # Get the Zoom tag
    zoom_element = html_element.xpath('//div[@id="Zoom"]')[0]
    # [Data - movie cover and screenshots]
    imgs = zoom_element.xpath(".//img/@src")
    # Note: initialize everything up front so dirty data cannot crash the program
    # (desc is included as well, since not every page has a synopsis)
    year, country, type, rating, duration, director, actors, cover, screen_shot, download_url, desc = \
        '', '', '', '', '', '', '', '', '', '', ''
    # [Data - movie cover]
    if len(imgs) > 0:
        cover = imgs[0]
    # [Data - movie screenshot]
    if len(imgs) > 1:
        screen_shot = imgs[1]
    # Get all the text inside div[@id='Zoom']
    infos = zoom_element.xpath('.//text()')

    # Helper that strips the field label and surrounding whitespace
    def parse_info(info, rule):
        return info.replace(rule, '').strip()

    # Walk through every item in infos to pick out the useful data
    for key, info in enumerate(infos):
        # print('{} start'.format(key))
        # print(info)
        # print('end ==========================================')
        if info.startswith('◎年　代'):  # Year
            year = parse_info(info, '◎年　代')
        elif info.startswith('◎产　地'):  # Country of origin
            country = parse_info(info, '◎产　地')
        elif info.startswith('◎类　别'):  # Category
            type = parse_info(info, '◎类　别')
        elif info.startswith('◎豆瓣评分'):  # Douban rating
            rating = parse_info(info, '◎豆瓣评分')
        elif info.startswith('◎片　长'):  # Running time
            duration = parse_info(info, '◎片　长')
        elif info.startswith('◎导　演'):  # Director
            director = parse_info(info, '◎导　演')
        elif info.startswith('◎主　演'):  # Starring (this line holds the first actor)
            actor_first = parse_info(info, '◎主　演')
            actors = [actor_first]
            # The remaining actors follow on the lines below
            for index in range(key + 1, len(infos)):
                item = infos[index].strip()
                if item.startswith('◎简　介'):
                    break
                # Collect every actor
                # print(item)
                actors.append(item)
        elif info.startswith('◎简　介'):  # Synopsis
            for index in range(key + 1, len(infos)):
                item = infos[index].strip()
                if item.startswith('【下载地址】'):
                    break
                desc = item
    print(detail_url)
    # [Data - download address]
    if len(html_element.xpath('//td[@bgcolor="#fdfddf"]/a/text()')) > 0:
        download_url = html_element.xpath('//td[@bgcolor="#fdfddf"]/a/text()')[0]
    elif len(html_element.xpath('//td[@bgcolor="#fdfddf"]/text()')) > 0:
        download_url = html_element.xpath('//td[@bgcolor="#fdfddf"]/text()')[0]
    film = {
        'title': title,
        'cover': cover,
        'screen_shot': screen_shot,
        'year': year,
        'country': country,
        'type': type,
        'rating': rating,
        'duration': duration,
        'director': director,
        'actors': actors,
        'desc': desc,
        'download_url': download_url
    }
    return film


def spider():
    base_url = 'http://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
    films = []
    # Loop over the fixed range of list pages (only page 2 here)
    for index in range(2, 3):
        url = base_url.format(index)
        detail_urls = get_detail_urls(url)
        for key, detail_url in enumerate(detail_urls):
            # print('index: ' + str(key) + ', address: ' + detail_url)
            # print('parsing detail page: ' + detail_url)
            film = parse_detail_page(detail_url)
            films.append(film)
        # Sleep 1 second after crawling each page
        time.sleep(1)
    print(films)


if __name__ == '__main__':
    spider()
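As a closing note, the filter-then-map dance in get_detail_urls (copy the list, remove the composite-movie link, then map the domain on) can be collapsed into one list comprehension. A behavior-equivalent sketch:

```python
def get_detail_urls(url):
    response = requests.get(url, headers=HEADERS)
    html_element = etree.HTML(response.text)
    detail_urls = html_element.xpath('//table[@class="tbspan"]//a/@href')
    # Drop the [composite movie] index link and prepend the domain in one pass
    return [BASE_DOMAIN + u for u in detail_urls
            if u != '/html/gndy/jddy/index.html']
```

This also returns a plain list instead of a one-shot map iterator, so the result can be iterated more than once if needed.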