background
On Saturday morning I was woken up by a barrage of phone calls from Cuihua…
Such a nerve-racking (stupid) question to ask so early in the morning
I was going to get rid of it and resume my sleep, but…
Of course I’m going to help him!
I frowned at the red exclamation mark
When I opened the site, I found exactly what I expected: colorful GIF images that almost blinded my titanium dog eyes
After a few glances, call it a day
The main text starts here
The modules used
- http: creates servers and handles stream-related work (used here to send the requests)
- fs: operates on files and folders (reading and writing)
- cheerio: roughly speaking, jQuery for Node
Start by crawling the entire page
So that this article can be published without problems, the link Cuihua provided has been replaced with a link to the Webmaster's Home site (sc.chinaz.com) for the demonstration
// Import the required modules
var http = require('http');
var cheerio = require('cheerio');
var fs = require('fs');
// Define the crawl destination station
const Url = 'http://sc.chinaz.com/tupian/';

// Fetch the listing page, accumulate its HTML, and hand the complete markup
// to filterContent once the response has ended.
http.get(Url, function (res) {
  // Start from an empty string: a non-empty seed would make the buffer
  // truthy even for an empty response and hide the "html null" case.
  let htmlDate = '';
  // Decode at the stream level so a multi-byte character that is split
  // across two chunks is not corrupted by per-chunk string conversion.
  res.setEncoding('utf8');
  // Get page data
  res.on('data', function (chunk) {
    htmlDate += chunk;
  });
  // Data fetching is complete
  res.on('end', function () {
    // Filter out the desired elements
    filterContent(htmlDate);
  });
}).on('error', function (err) {
  // Include the reason so a failed request can actually be diagnosed.
  console.log('Error getting data! ', err.message);
});
Copy the code
filter
Analyze the structure of the page: the images we need are all inside the #container node, so get that node first
Iterate over the .box elements and get the src and alt of each a > img
/**
 * Filter page information: pull the image entries out of the listing page
 * HTML and start a download for each of them.
 *
 * Loads the markup with cheerio, walks every `.box` inside `#container`,
 * reads the `src2` and `alt` attributes of the `a > img` in each box, runs
 * both through formatUrl, passes them to download, and finally logs the
 * collected list. Logs 'html null' when no markup was supplied.
 *
 * @param {string} htmlDate - HTML of the page to scan.
 */
function filterContent(htmlDate) {
  if (!htmlDate) {
    console.log('html null');
    return;
  }

  const $ = cheerio.load(htmlDate);
  // Get the desired content
  const container = $('#container');
  // Store the information captured later
  const contentData = [];

  container.find('.box').each(function (_index, element) {
    const img = $(element).find('a').children('img');
    // Why src2? Reading `src` returned nothing; printing the element showed
    // that the image URL is carried by a `src2` attribute instead.
    const rawSrc = img.attr('src2');
    const rawName = img.attr('alt');
    // A box without an image (or without these attributes) yields undefined;
    // skip it instead of letting formatUrl throw and abort the whole crawl.
    if (!rawSrc || !rawName) {
      return;
    }
    const src = formatUrl(rawSrc);
    const name = formatUrl(rawName);
    // Pass the captured information to the download function for download
    download(src, name);
    // Put a copy here too
    contentData.push({
      src,
      name,
    });
  });

  // Store the captured image information
  console.log(contentData);
}
Crawled links containing _s are thumbnails, so a helper method is needed to convert them into the full-size links
/**
 * Turn a thumbnail link into the HD link by removing the `_s` marker.
 *
 * Only the first occurrence of `_s` is removed; a string without `_s` is
 * returned unchanged. The marker is replaced with an empty string, because
 * inserting any character in its place would produce an invalid URL.
 *
 * @param {string} imgUrl - Link as found on the page.
 * @returns {string} The link with the first `_s` removed.
 */
function formatUrl(imgUrl) {
  return imgUrl.replace('_s', '');
}
/**
 * Image download function: fetch one image over HTTP and save it as
 * ./images/<name>.jpg, creating the ./images folder if it does not exist.
 *
 * Logs `<name>---- download successful! ` once the file has been written,
 * and logs the error object when the request or the write fails.
 *
 * @param {string} url - Address of the image to fetch.
 * @param {string} name - Base file name (without extension) for the saved image.
 */
function download(url, name) {
  http.get(url, (res) => {
    // Anything other than 200 (redirect, error page, ...) is not image data;
    // saving it would leave a broken .jpg on disk.
    if (res.statusCode !== 200) {
      console.log(`${name}---- download failed, HTTP status ${res.statusCode}`);
      // Drain the response so the socket is released.
      res.resume();
      return;
    }

    // Collect the raw Buffer chunks; building a string here would require a
    // binary encoding round-trip and any seed character would end up as a
    // stray byte at the start of the file.
    const chunks = [];
    res.on('data', (chunk) => {
      chunks.push(chunk);
    });

    res.on('end', () => {
      // Create the folder when it is missing to prevent write errors
      if (!fs.existsSync('./images')) {
        fs.mkdirSync('./images');
      }
      // The name comes from scraped page content: replace path separators and
      // other characters that are invalid in file names so it cannot point
      // outside ./images or make the write fail.
      const safeName = String(name).replace(/[\\/:*?"<>|]/g, '_');
      fs.writeFile(`./images/${safeName}.jpg`, Buffer.concat(chunks), (error) => {
        if (error) {
          console.log(error);
        } else {
          console.log(`${name}---- download successful! `);
        }
      });
    });
  }).on('error', (error) => {
    // Without this handler a failed request raises an unhandled 'error'
    // event and terminates the whole process.
    console.log(error);
  });
}