A crawler case study

  • How a crawler works
    1. Fetch the data you want from the target site
    2. Parse the page content (cheerio)
    3. Extract the useful information, then download it or process it further
  • Straight to the code
    1. Import the core modules (http, https, fs)
    2. Install the third-party cheerio module, which lets you filter HTML tags with jQuery syntax
    3. Request the home page URL; the page content arrives in chunks, which are concatenated into one string
    4. Use cheerio to read the src attribute of every img tag
    5. Request each src URL in turn and save the response into the target directory
// First import the required modules: http, https, fs, cheerio

// Install the cheerio dependency in advance; it lets you use jQuery-style selectors to pick out content
// npm i cheerio

const http = require('http');
const https = require('https');
const fs = require('fs');
const cheerio = require('cheerio')
Copy the code
// Page whose <img> tags are crawled.
const nodeUrl = 'http://hotel.qunar.com/?kwid=47392517&cooperate=baidu';

// Directory the downloaded images are written to.
const imageDir = './Image';

/**
 * Turn the raw `src` attribute of an <img> tag into an absolute URL.
 *
 * Protocol-relative sources (`//host/a.jpg`) are fetched over HTTPS; relative
 * paths are resolved against the page URL; absolute http/https URLs are used
 * as they are.
 *
 * @param {string|undefined} src - Raw value of the `src` attribute.
 * @param {string} pageUrl - URL of the page the tag was found on.
 * @returns {URL|null} The absolute URL, or null when `src` is missing,
 *   malformed, or not an http(s) resource (e.g. a `data:` URI).
 */
function resolveImageUrl(src, pageUrl) {
    if (typeof src !== 'string' || src.trim() === '') {
        return null;
    }
    const trimmed = src.trim();
    const candidate = trimmed.startsWith('//') ? `https:${trimmed}` : trimmed;
    try {
        const resolved = new URL(candidate, pageUrl);
        return /^https?:$/.test(resolved.protocol) ? resolved : null;
    } catch (parseError) {
        // new URL() throws a TypeError on input it cannot parse.
        return null;
    }
}

/**
 * Download one image and save it as `age<index>.jpg` inside `imageDir`.
 * Failures are logged instead of thrown so one bad image does not stop the rest.
 *
 * @param {URL} imageUrl - Absolute http(s) URL of the image.
 * @param {number} index - Position of the image on the page; used in the file name.
 */
function downloadImage(imageUrl, index) {
    // Pick the client that matches the URL scheme; https.get rejects http: URLs.
    const client = imageUrl.protocol === 'http:' ? http : https;

    client.get(imageUrl, (response) => {
        if (response.statusCode !== 200) {
            console.log(`down fail:${imageUrl.href} responded with status ${response.statusCode}`);
            response.resume(); // Drain the response so the socket is released
            return;
        }

        // Keep the raw Buffers: decoding image bytes as text would corrupt them.
        const chunks = [];
        response.on('data', (chunk) => {
            chunks.push(chunk);
        });
        response.on('error', (err) => {
            console.log(`down fail:${err}`);
        });
        response.on('end', () => {
            fs.writeFile(`${imageDir}/age${index}.jpg`, Buffer.concat(chunks), (err) => {
                if (err) {
                    console.log(`down fail:${err}`);
                } else {
                    console.log('down success');
                }
            });
        });
    }).on('error', (err) => {
        console.log(`down fail:${err}`);
    });
}

http.get(nodeUrl, (res) => {
    // Check that the status code and content type of the response are acceptable
    const { statusCode } = res;
    const contentType = res.headers['content-type'];
    console.log(statusCode, contentType);
    let err = null;

    if (statusCode !== 200) {
        err = new Error('Request status code error');
    } else if (!/^text\/html/.test(contentType)) {
        err = new Error('Request type error');
    }
    if (err) {
        console.log(`Current request Error: ${err}`);
        res.resume(); // Drain the response so the socket is released
        return;
    }

    // Collect the page body.
    // Decoding at the stream level keeps multi-byte characters intact even
    // when they are split across two chunks.
    res.setEncoding('utf8');
    let rowData = '';
    res.on('data', (chunk) => {
        rowData += chunk;
    });
    res.on('error', (streamError) => {
        console.log(`Data transmission error:${streamError}`);
    });
    res.on('end', () => {
        console.log('Data transfer completed:');

        // Make sure the output directory exists before any image is written.
        fs.mkdir(imageDir, { recursive: true }, (mkdirError) => {
            if (mkdirError) {
                console.log(`down fail:${mkdirError}`);
                return;
            }

            const $ = cheerio.load(rowData);

            // Download every image referenced by the page
            $('img').each((index, el) => {
                const src = $(el).attr('src');
                console.log(src);

                const imageUrl = resolveImageUrl(src, nodeUrl);
                if (imageUrl === null) {
                    // No src, a data: URI, or an unparsable value: nothing to download.
                    return;
                }
                downloadImage(imageUrl, index);
            });
        });
    });
}).on('error', (error) => {
    console.log(error);
});
Copy the code

Picture crawl results