A web crawler case study
- The crawler workflow, in brief:
- Fetch the data to be crawled from the target site
- Parse the page content (cheerio)
- Extract the useful information and download it (or perform other operations on it)
- Straight to the code:
- Import the main modules
- Install the third-party module cheerio, which lets you filter HTML tags with jQuery-style syntax (a minimal sketch of that API follows this list)
- Request the home page URL, receive the whole page in chunks, and concatenate them into a single string
- Use cheerio to read the src attribute of each img tag
- Request each src URL in turn and save the images into a local folder
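Before the full script, here is a minimal sketch of cheerio's jQuery-style API; the inline HTML string is made up purely for illustration:

```js
const cheerio = require('cheerio');

// Load an HTML string and get back a jQuery-like selector function
const $ = cheerio.load('<div><img src="//a.com/1.jpg"><img src="//a.com/2.jpg"></div>');

// Filter tags with CSS selectors and read attributes, just like jQuery
$('img').each((index, el) => {
    console.log(index, $(el).attr('src')); // 0 //a.com/1.jpg, then 1 //a.com/2.jpg
});
```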
// First require the modules we need: http, https, fs, cheerio
// Install the dependency in advance so jQuery-style selectors can be used on the content
// npm i cheerio
const http = require('http');
const https = require('https');
const fs = require('fs');
const cheerio = require('cheerio');
const nodeUrl = 'http://hotel.qunar.com/?kwid=47392517&cooperate=baidu';
http.get(nodeUrl, (res) => {
    // Check whether the returned status code and content type are OK **************
    const { statusCode } = res;
    const contentType = res.headers['content-type'];
    console.log(statusCode, contentType);
    let err = null;
    if (statusCode !== 200) {
        err = new Error('Request status code error');
    } else if (!/^text\/html/.test(contentType)) {
        err = new Error('Request type error');
    }
    if (err) {
        console.log(`Current request error: ${err}`);
        res.resume(); // Drain the response data to free memory
        return false;
    }
    // Receive and process the page content ****************************************
    let rawData = '';
    res.on('data', (chunk) => {
        rawData += chunk;
        // console.log('Data chunk received OK');
    });
    res.on('error', (err) => {
        console.log('Data transmission error: ' + err);
    });
    res.on('end', () => {
        // Save the whole page as a single file *************************************
        // fs.writeFile('./qunale.html', rawData, (err) => {
        //     if (err) {
        //         console.log('err: ' + err);
        //     } else {
        //         console.log('Write succeeded');
        //     }
        // });
        let $ = cheerio.load(rawData);
        // rawData holds the crawled content; handle a single-file crawl ************
        // let arr = $('img').attr('src');
        // console.log(arr);
        // https.get(arr, (response) => {
        //     var imgData = "";
        //     response.setEncoding("binary");
        //     response.on("data", function (chunk) {
        //         imgData += chunk;
        //     });
        //     response.on("end", function () {
        //         fs.writeFile("./Image/age.jpg", imgData, "binary", function (err) {
        //             if (err) {
        //                 console.log("down fail: " + err);
        //             } else {
        //                 console.log("down success");
        //             }
        //         });
        //     });
        // });
        // Handle a multi-file crawl ************************************************
        $('img').each((index, el) => {
            console.log($(el).attr('src'));
            // Download each crawled image
            let str = $(el).attr('src');
            if (!str.startsWith('https')) {
                str = 'https:' + str;
            }
            https.get(str, (response) => {
                var imgData = "";
                response.setEncoding("binary"); // The response encoding must be binary, or the downloaded image cannot be opened
                response.on("data", function (chunk) {
                    imgData += chunk;
                });
                response.on("end", function () {
                    fs.writeFile(`./Image/age${index}.jpg`, imgData, "binary", function (err) {
                        if (err) {
                            console.log("down fail: " + err);
                        } else {
                            console.log("down success");
                        }
                    });
                });
            });
        });
        console.log('Data transfer completed');
    });
}).on('error', (error) => {
    console.log(error);
});
Image crawl results:
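Two caveats about the script above: it assumes the `./Image` folder already exists, and it accumulates binary data as a string via `setEncoding("binary")`, which works but is easy to get wrong. Here is a sketch of a sturdier download step using Buffers; the `downloadImage` helper name and the example URL are my own, not from the original:

```js
const https = require('https');
const fs = require('fs');

// Hypothetical helper: download one image into ./Image, collecting raw Buffers
function downloadImage(url, fileName) {
    fs.mkdirSync('./Image', { recursive: true }); // create the folder if it is missing
    https.get(url, (response) => {
        const chunks = [];
        // Without setEncoding, each chunk arrives as a Buffer
        response.on('data', (chunk) => chunks.push(chunk));
        response.on('end', () => {
            // Buffer.concat keeps the bytes intact, no binary-string round trip
            fs.writeFile(`./Image/${fileName}`, Buffer.concat(chunks), (err) => {
                console.log(err ? 'down fail: ' + err : 'down success');
            });
        });
    }).on('error', (err) => console.log('request error: ' + err));
}

// Example call with a made-up URL
downloadImage('https://example.com/some.jpg', 'age0.jpg');
```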