
Preparatory work

Before we start, we need to set up the environment.

Since Docker is already installed locally, we simply pull a JupyterHub image, run it locally, and write our code in JupyterHub.

  1. First, pull the image from the corresponding website.
  2. Run the image.
  3. Open it in a browser and log in. Account: jovyan, password: hipaddle. (The password can be viewed inside the container at /etc/jupyter/jupyterhub_config.py.)

The code

The resources we scrape come from Movie Heaven (dytt8.net). We first need to get familiar with its pages and inspect their source, because everything here is obtained by parsing elements out of the page source.
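Concretely, the pattern used throughout this article is: fetch a page with requests, build an element tree with lxml, and query it with XPath. A minimal sketch of that pattern, here simply listing a few links on the home page:

	import requests
	from lxml import etree

	# Fetch a page and parse it into an element tree
	response = requests.get('http://www.dytt8.net')
	html_element = etree.HTML(response.text)

	# XPath queries always return a list of matches
	links = html_element.xpath('//a/@href')
	print(links[:5])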

Analyze requirements

  1. First, look at the home page and work out the URL pattern of the movie list pages (see the sketch after this list).
  2. Open the detail page of each movie and locate the desired elements in the page source.
  3. Assemble each movie's elements into one object, then move on to the next movie.
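Since the list pages differ only in a trailing page number, they can all be generated from one template. A minimal sketch, using the same list URL that appears in the complete code below:

	base_url = 'http://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'

	# Generate the URLs of the first three list pages
	list_urls = [base_url.format(page) for page in range(1, 4)]
	print(list_urls)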

Write the code

  1. First, fix the range of page numbers we need to loop over:

		for index in range(2, 3):
			# 1. The URL of the movie list page
			url = base_url.format(index)
			# 2. Get all the movies on the current page
			detail_urls = get_detail_urls(url)
			# 3. Parse the detail page of each movie
			for key, detail_url in enumerate(detail_urls):
				# print('index: ' + str(key) + ', address: ' + detail_url)
				# print('parse detail page: ' + detail_url)
				film = parse_detail_page(detail_url)
				films.append(film)
			# 4. Sleep for 1 second after each page is crawled
			time.sleep(1)
  2. Get the resources on the detail page:

		# [Data - movie title]
		title = html_element.xpath('//div[@class="title_all"]//font[@color="#07519a"]/text()')[0]

		# Get the Zoom tag
		zoom_element = html_element.xpath('//div[@id="Zoom"]')[0]

		# [Data - movie cover and screenshots]
		imgs = zoom_element.xpath(".//img/@src")
		...
  3. Assemble the corresponding information:

		film = {
			'title': title,
			'cover': cover,
			'screen_shot': screen_shot,
			'year': year,
			'country': country,
			'type': type,
			'rating': rating,
			'duration': duration,
			'director': director,
			'actors': actors,
			'desc': desc,
			'download_url': download_url
		}
		...

Screenshot of the results

Complete code:

import requests
from lxml import etree
import time

# home address
BASE_DOMAIN = 'http://www.dytt8.net'

HEADERS = {
	'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
}


def get_detail_urls(url):
	response = requests.get(url, headers=HEADERS)
	html_element = etree.HTML(response.text)
	# [Data] - list of detail page address strings
	detail_urls = html_element.xpath('//table[@class="tbspan"]//a/@href')
	# Filter out dirty data caused by the [composite movie] entry
	# (iterate over a copy so that removing items does not skip elements)
	detail_urls_new = detail_urls[:]
	for index, detail_url in enumerate(detail_urls_new):
		if detail_url == '/html/gndy/jddy/index.html':
			detail_urls.remove(detail_url)
	# Assemble the full address of each detail page
	detail_urls = map(lambda x: BASE_DOMAIN + x, detail_urls)
	return detail_urls


def parse_detail_page(detail_url):
	response = requests.get(detail_url, headers=HEADERS)
	# The site is GBK-encoded; gb18030 (a superset) avoids decode errors
	text = response.content.decode('gb18030')
	html_element = etree.HTML(text)

	# [Data - movie title]
	title = html_element.xpath('//div[@class="title_all"]//font[@color="#07519a"]/text()')[0]

	# Get the Zoom tag
	zoom_element = html_element.xpath('//div[@id="Zoom"]')[0]

	# [Data - movie cover and screenshots]
	imgs = zoom_element.xpath(".//img/@src")

	# Note: initialize everything up front so dirty data cannot crash the application
	year, country, type, rating, duration, director, actors, cover, screen_shot, download_url = '', '', '', '', '', '', '', '', '', ''

	if len(imgs) > 0:
		cover = imgs[0]

	# [Data - movie screenshot]
	if len(imgs) > 1:
		screen_shot = imgs[1]

	# Walk through all the text nodes under div[@id="Zoom"]
	infos = zoom_element.xpath('.//text()')

	# Helper that strips the label prefix from an info line
	def parse_info(info, rule):
		return info.replace(rule, '').strip()

	# Walk through every item in infos to pick out the useful data
	for key, info in enumerate(infos):

		# print('index: {}'.format(key))
		# print(info)
		# print('end ========================================')

		if info.startswith('◎年　代'):  # year
			year = parse_info(info, '◎年　代')
		elif info.startswith('◎产　地'):  # country of origin
			country = parse_info(info, '◎产　地')
		elif info.startswith('◎类　别'):  # category
			type = parse_info(info, '◎类　别')
		elif info.startswith('◎豆瓣评分'):  # Douban rating
			rating = parse_info(info, '◎豆瓣评分')
		elif info.startswith('◎片　长'):  # running time
			duration = parse_info(info, '◎片　长')
		elif info.startswith('◎导　演'):  # director
			director = parse_info(info, '◎导　演')
		elif info.startswith('◎主　演'):  # actors [the first actor]
			actor_first = parse_info(info, '◎主　演')

			actors = [actor_first]

			# Continue traversing the lines that follow
			for index in range(key + 1, len(infos)):
				item = infos[index].strip()
				if item.startswith('◎简　介'):
					break
				# Collect all the remaining actors
				# print(item)
				actors.append(item)
		elif info.startswith('◎简　介'):  # synopsis
			# desc = parse_info(info, '◎简　介')

			for index in range(key + 1, len(infos)):
				item = infos[index].strip()
				if item.startswith('【下载地址】'):
					break
				desc = item

	print(detail_url)

	# Download address
	if len(html_element.xpath('//td[@bgcolor="#fdfddf"]/a/text()')) > 0:
		download_url = html_element.xpath('//td[@bgcolor="#fdfddf"]/a/text()')[0]
	elif len(html_element.xpath('//td[@bgcolor="#fdfddf"]/text()')) > 0:
		download_url = html_element.xpath('//td[@bgcolor="#fdfddf"]/text()')[0]

	film = {
		'title': title,
		'cover': cover,
		'screen_shot': screen_shot,
		'year': year,
		'country': country,
		'type': type,
		'rating': rating,
		'duration': duration,
		'director': director,
		'actors': actors,
		'desc': desc,
		'download_url': download_url
	}

	return film


def spider():
	base_url = 'http://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
	films = []
	for index in range(2, 3):
		url = base_url.format(index)
		detail_urls = get_detail_urls(url)
		for key, detail_url in enumerate(detail_urls):
			# print('index: ' + str(key) + ', address: ' + detail_url)
			# print('parse detail page: ' + detail_url)
			film = parse_detail_page(detail_url)

			films.append(film)
		time.sleep(1)

	print(films)

if __name__ == '__main__':
	spider()
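If you want to keep the scraped results rather than only printing them, one small extension (hypothetical, not part of the original script) is to dump the films list to a JSON file:

	import json

	def save_films(films, path='films.json'):
		# Persist the scraped list of film dicts as readable UTF-8 JSON
		with open(path, 'w', encoding='utf-8') as f:
			json.dump(films, f, ensure_ascii=False, indent=2)

Calling save_films(films) at the end of spider() writes every movie dict to films.json.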