As a Node.js beginner, I wrote a simple crawler for the Douban Movie Top 250. It is based on an earlier blog post about writing a crawler in Node.js, with the following important modifications:

  1. The original crawling method no longer works, because Douban has switched from HTTP to HTTPS and the DOM structure of the rating element on Douban pages has changed.
  2. The original blog post could only crawl the first page; here I use a JS generator together with a loop to crawl all of the pages.


First, import the required modules



var http = require('http'),
	https = require('https'),
	fs = require('fs'), 
	path = require('path'),
	cheerio = require('cheerio'); 
	Copy the code

Define the request options for the URL to crawl (this form applies when the site is served over plain HTTP).


// Request options describing the Top 250 page when reached over plain HTTP
// (port 80). The HTTPS crawl shown later builds its URL directly and does not
// read this object.
var opt = {
    hostname: 'movie.douban.com',
    path: '/top250',
    port: 80
};
Copy the code

Create an HTTPS GET request

/**
 * Fetch one page of the Douban Top 250 list, extract every movie entry on it,
 * start downloading each poster image and save the entries as JSON.
 *
 * The result is written to `./data<N>.json`, where N is `index / 25`.
 * Errors are logged rather than thrown.
 *
 * @param {number} index Value of the `start` query parameter, i.e. the offset
 *     of the first movie on the requested page.
 */
function spiderMovie(index) {
    const pageSize = 25;

    https.get('https://movie.douban.com/top250?start=' + index, function (res) {
        if (res.statusCode !== 200) {
            console.log('Request for start=' + index + ' failed with status ' + res.statusCode);
            // Drain the body so the underlying socket is released.
            res.resume();
            return;
        }

        let html = '';
        const movies = [];

        res.setEncoding('utf-8');
        res.on('data', function (chunk) {
            html += chunk;
        });
        res.on('end', function () {
            const $ = cheerio.load(html);

            $('.item').each(function () {
                const movie = {
                    title: $('.title', this).text(),
                    star: $('.info .star .rating_num', this).text(),
                    link: $('a', this).attr('href'),
                    picUrl: $('.pic img', this).attr('src')
                };
                movies.push(movie);

                // An entry without an <img src> has nothing to download; passing
                // undefined on to https.get would throw inside this handler.
                if (movie.picUrl) {
                    downloadImg('../img/', movie.picUrl);
                }
            });

            saveData('./data' + (index / pageSize) + '.json', movies);
        });
    }).on('error', function (err) {
        console.log(err);
    });
}

Download the pictures

/**
 * Download a single image over HTTPS and write it into a local directory.
 *
 * The file is named after the last path segment of `url`. Failures (request
 * error, non-200 status, write error) are logged rather than thrown.
 *
 * @param {string} imgDir Directory in which the image is stored.
 * @param {string} url URL of the image to download.
 */
function downloadImg(imgDir, url) {
    https.get(url, function (res) {
        if (res.statusCode !== 200) {
            console.log('Image request failed: ' + url + ' (status ' + res.statusCode + ')');
            // Drain the body so the underlying socket is released.
            res.resume();
            return;
        }

        // Keep the raw Buffer chunks instead of decoding them into a 'binary'
        // string; the bytes are written to disk unchanged.
        const chunks = [];
        res.on('data', function (chunk) {
            chunks.push(chunk);
        });
        res.on('end', function () {
            const fileName = path.basename(url);
            fs.writeFile(path.join(imgDir, fileName), Buffer.concat(chunks), function (err) {
                if (err) {
                    return console.log(err);
                }
                console.log('Image downloaded: ', fileName);
            });
        });
    }).on('error', function (err) {
        console.log(err);
    });
}

Save data locally

/**
 * Log the collected movie records and write them to a file as JSON.
 *
 * Write errors are logged rather than thrown.
 *
 * @param {string} path Destination file path. Inside this function the name
 *     shadows the `path` module, which is not needed here.
 * @param {Array} movies Movie records to serialize.
 */
function saveData(path, movies) {
    console.log(movies);
    fs.writeFile(path, JSON.stringify(movies, null, ' '), function (err) {
        if (err) {
            return console.log(err);
        }
        console.log('Data saved');
    });
}

Create a crawl generator

/**
 * Generator that walks the list in steps of 25 and crawls each page.
 *
 * It yields every offset `0, 25, 50, ...` that is below `x`. The crawl of an
 * offset is started only when the consumer resumes the generator after
 * receiving that offset, so the generator must be iterated to completion for
 * every page to be crawled.
 *
 * @param {number} x Exclusive upper bound for the offsets (e.g. 250).
 * @yields {number} The offset about to be crawled.
 */
function* doSpider(x) {
    var start = 0;
    console.log(start + ' -------------------------------');
    while (start < x) {
        yield start;
        // Runs on the consumer's next() call, i.e. after `start` was handed out.
        spiderMovie(start);
        start += 25;
    }
}

Run the crawl


// Drive the generator to completion: every iteration receives the next page
// offset and prints it; resuming the generator triggers the crawl of that page.
for (var x of doSpider(250)) {
    console.log(x);
}
Copy the code

Crawl results

That’s it!