1. Douban top250
This is the third day of my participation in the Gwen Challenge.
# douban top250
# Get movie titles, reviews, and images.
import requests
from lxml import etree
class DouBan:
    """Scraper for a single page of the Douban Top 250 movie list.

    Each instance is bound to one page URL. ``run`` downloads the page,
    saves the raw HTML to ``douban.html`` and prints the extracted data.
    """

    def __init__(self, url, timeout=10):
        """Store the page URL and the request settings.

        Args:
            url: Address of the list page to download.
            timeout: Seconds to wait for the server before giving up.
        """
        self.url = url
        self.timeout = timeout
        # A browser-like User-Agent; the value must be a well-formed UA
        # string (no stray words or spaces inside the product token).
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/80.0.3987.149 Safari/537.36',
        }

    def parse_url(self):
        """Download the page and return its source decoded as UTF-8 text."""
        response = requests.get(
            url=self.url, headers=self.headers, timeout=self.timeout
        )
        return response.content.decode()

    def parse_str(self, sound_code):
        """Extract title, review and poster URL for every movie on the page.

        Args:
            sound_code: HTML source of the list page.

        Returns:
            A flat list holding, for each movie that has a review, three
            consecutive entries: the title matches, the review matches and
            the image ``src`` matches (each one is the list returned by
            ``xpath``).
        """
        html = etree.HTML(sound_code)
        li_list = html.xpath('//*[@id="content"]/div/div[1]/ol/li')
        item = []
        for li in li_list:
            title = li.xpath('div/div[2]/div[1]/a/span[1]/text()')
            film_review = li.xpath('div/div[2]/div[2]/p[2]/span/text()')
            # xpath() returns an empty list (never None) when nothing
            # matches, so test for emptiness to skip movies without a review.
            if not film_review:
                continue
            img = li.xpath('div/div[1]/a/img/@src')
            item.extend([title, film_review, img])
        return item

    def save_html(self, content_list):
        """Write the page source to ``douban.html`` in the working directory.

        Args:
            content_list: Text to write; any existing file is overwritten.
        """
        with open("douban.html", "w", encoding="utf-8") as f:
            f.write(content_list)

    def run(self):
        """Download the page, save its source, then print the parsed data."""
        sound_code = self.parse_url()
        self.save_html(sound_code)
        content_list = self.parse_str(sound_code)
        print(content_list)
if __name__ == '__main__':
    # The list is paginated through the ``start`` query parameter; step
    # through offsets 0, 25, ..., 225 and scrape each page in turn.
    for i in range(0, 226, 25):
        url = "https://movie.douban.com/top250?start=" + str(i)
        spider = DouBan(url)
        spider.run()
Copy the code