Preface
With a few third-party open source libraries, Node.js makes it easy to write a simple web crawler.
Introduction to third-party libraries
- request: encapsulates HTTP requests
- cheerio: a Node (server-side) version of jQuery, used here to parse HTML
- mkdirp: creates nested directory trees in one call
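Before walking through the crawler, here is a minimal sketch of how the three libraries are typically used together; the URL and output directory below are just placeholders:

const request = require("request");
const cheerio = require("cheerio");
const mkdirp = require("mkdirp");

// Fetch a page, parse it like jQuery, and prepare a nested output directory
request("https://example.com", (err, res, body) => {
  if (err) return console.log(err);
  const $ = cheerio.load(body); // server-side jQuery-style parsing
  console.log($("title").text()); // e.g. print the page title
  mkdirp.sync("output/pages"); // create nested directories in one call
});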
Implementation approach
- Use request to fetch the content of the specified URL
- Use cheerio to find the link paths in the page (de-duplicated)
- Use mkdirp to create the directory for each page
- Use fs to create a file and write the fetched content into it
- Repeat the steps above for every path that has not yet been visited
Code implementation
const fs = require("fs");
const path = require("path");
const request = require("request");
const cheerio = require("cheerio");
const mkdirp = require("mkdirp");
// Define the entry URL
const homeUrl = "https://www.baidu.com";
// Define a set to store already accessed paths to avoid duplicate access
const set = new Set([homeUrl]);
function grab(url) {
// Validate the url
if (!url) return;
// Trim leading/trailing whitespace
url = url.trim();
// Automatically complete the URL path
if (url.endsWith("/")) {
url += "index.html";
}
const chunks = [];
// The URL may contain some symbols or Chinese characters, which can be encoded by encodeURI
request(encodeURI(url))
.on("error".(e) = > {
// Prints an error message
console.log(e);
})
.on("data".(chunk) = > {
// Receive the response content
chunks.push(chunk);
})
.on("end".() = > {
// Convert the corresponding content to text
const html = Buffer.concat(chunks).toString();
// No content was retrieved
if (!html) return;
// Parse the url
let { host, origin, pathname } = new URL(url);
pathname = decodeURI(pathname);
// Parsing HTML through cheerio
const $ = cheerio.load(html);
// use the path as a directory
const dir = path.dirname(pathname);
// Create directory
mkdirp.sync(path.join(__dirname, dir));
// Write to the file
fs.writeFile(path.join(__dirname, pathname), html, "utf-8", (err) => {
// Prints an error message
if (err) {
console.log(err);
return;
}
console.log(`[${url}] Saved successfully`);
});
// Get all the A elements in the page
const aTags = $("a");
Array.from(aTags).forEach((aTag) => {
// Get the path to the a tag
const href = $(aTag).attr("href");
// Validate the href here, or use it to limit the range of sites to crawl
// Exclude empty labels
if (!href) return;
// Exclude anchor links
if (href.startsWith("#")) return;
if (href.startsWith("mailto:")) return;
// Images can be filtered out if you do not want to save them
// if (/\.(jpg|jpeg|png|gif|bmp)$/.test(href)) return;
// Absolute hrefs must belong to the entry URL's domain
let reg = new RegExp(`^https?:\\/\\/${host}`);
if (/^https?:\/\//.test(href) && !reg.test(href)) return;
// More logic can be added as needed
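// For example (a hypothetical extra filter, not in the original code):
// skip links that use the javascript: pseudo-protocol
// if (href.startsWith("javascript:")) return;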
let newUrl = "";
if (/^https?:\/\//.test(href)) {
// Handle absolute paths
newUrl = href;
} else {
// Handle relative paths
newUrl = origin + path.join(dir, href);
}
// Check whether this URL has already been visited
if (set.has(newUrl)) return;
if (newUrl.endsWith("/") && set.has(newUrl + "index.html")) return;
if (newUrl.endsWith("/")) newUrl += "index.html";
set.add(newUrl);
grab(newUrl);
});
});
}
// Start fetching
grab(homeUrl);
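To run the crawler, save the code above as a file (for example crawler.js; the file name is just an assumption here), install the dependencies with npm install request cheerio mkdirp, and start it with node crawler.js. The fetched pages are written next to the script, mirroring each page's path on the site.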
Conclusion
The simple web crawler is now complete; try changing homeUrl to a site you want to crawl yourself. If you have any questions, you can add me on WeChat: RJJs1221