Most crawler tutorials today are based on Python or Node.js. In fact, as long as you have the Chrome browser, you can open DevTools with F12 and write a crawler anywhere, anytime, without installing any other runtime. In this article we will write a crawler that uses nothing but Chrome DevTools to crawl the novel at www.biqudu.com/31_31729/ and save it as a text file.

How to write crawler code in Chrome DevTools

DevTools provides Snippets, where we can write and run JavaScript code. The steps are as follows:

Step-by-step instructions

  1. Open the Sources tab
  2. Select the Snippets pane on the left
  3. Click New snippet to create a new snippet
  4. Write the crawler code
  5. Run the snippet
  6. View the output in the Console

Prepare the crawler utility functions

1. Load the third-party library

First, write a function that loads a third-party library from a URL by injecting a script tag, so the library can then be used in our code. Later we will use it to load async, an asynchronous concurrency control library; a usage example follows the code.

// Load a third-party script by URL and resolve once it has finished loading
async function loadLibrary(url) {
    return new Promise((resolve, reject) => {
        let script = document.createElement('script');
        script.onload = resolve;
        script.onerror = reject;
        script.src = url;
        document.body.appendChild(script);
    });
}
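For example, with loadLibrary defined in a snippet, async 2.x can be loaded from a CDN (the same URL that appears in the complete code later in this article):

(async () => {
    // After loading, the library is available as window.async
    await loadLibrary('https://cdn.bootcss.com/async/2.1.4/async.js');
    console.log(typeof window.async.queue); // should print "function"
})();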

2. Save a file to the local machine

Save a string as a text file and trigger a browser download.

// Save a string as a text file by creating a Blob and clicking a temporary download link
function saveFile(string, fileName) {
    var a = document.createElement('a');
    a.download = fileName;
    var blob = new Blob([string], {
        type: 'text/plain'
    });
    a.href = window.URL.createObjectURL(blob);
    a.click();
}
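A quick way to try it out from a snippet (the string and file name here are just examples):

saveFile('hello from DevTools', 'test.txt'); // the browser downloads a file named test.txt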

3. Download the HTML

The Fetch API is used to download the HTML text for a given URL, convert it into a DOM element, and return it. The returned element supports the DOM API, such as querySelector, which makes it easy to extract and analyze nodes.

// Fetch a page and parse its HTML text into a detached DOM element
async function getHtml(url) {
    let response = await fetch(url);
    let htmlText = await response.text();
    let html = document.createElement('html');
    html.innerHTML = htmlText;
    return html;
}
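For example, with getHtml defined, the current page's title can be read like this (wrapped in an async IIFE so it can be pasted into a snippet):

(async () => {
    let page = await getHtml(location.href);
    // The detached element supports the usual DOM API
    console.log(page.querySelector('title').innerText);
})();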

Prepare the crawler business functions

1. Get information about all the chapters of the novel

Analyzing the novel's home page www.biqudu.com/31_31729/, we can see that document.querySelectorAll('#list dd a') returns the anchor elements containing every chapter name and link.

async function getDirectory(url) {
    let page = await getHtml(url);
    let directory = Array.from(page.querySelectorAll('#list dd a'));
    // skip the first 12 anchor elements
    return directory.slice(12);
}
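To check the result, run something like this on the novel's home page (with getHtml and getDirectory defined in the snippet):

(async () => {
    let directory = await getDirectory(location.href);
    // Number of chapters, plus the title and link of the first one
    console.log(directory.length, directory[0].innerText, directory[0].href);
})();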

2. Get a chapter

Analyzing a chapter page such as www.biqudu.com/31_31729/21… shows that the chapter content sits in the div element with the id content.

async function getSection({href, innerText: title}) {
    console.log(`start fetching ${title}`);
    let html = await getHtml(href);
    let content = html.querySelector('#content');
    // remove any script tags mixed into the chapter content
    Array.from(content.querySelectorAll('script')).forEach(scriptTag => content.removeChild(scriptTag));
    var text = title + '\r\n' + content.innerText + '\r\n';
    return text;
}
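With the functions above defined, a single chapter can be fetched from the novel's home page like this:

(async () => {
    let directory = await getDirectory(location.href);
    let text = await getSection(directory[0]);
    console.log(text.slice(0, 200)); // preview the first 200 characters of the chapter
})();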

The complete code

Because the novel has a large number of chapters, downloading them one at a time would be too slow, and firing off all the requests at once is not an option either. So the async library's queue is used to control the crawling, with the number of concurrent requests set to 6. Setting it higher than 6 would not help, because Chrome limits the number of simultaneous requests to the same domain to 6. A minimal sketch of the queue pattern follows.
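Here is a minimal sketch of the async 2.x queue pattern used in the complete code below; the task values and messages are placeholders:

// Assumes async 2.x has already been loaded via loadLibrary
let q = window.async.queue(function (task, taskDone) {
    console.log('processing', task);
    taskDone(); // tell the queue this task is finished
}, 6); // at most 6 tasks run at the same time

// In async 2.x, drain is a property that is assigned a callback
q.drain = function () {
    console.log('all tasks finished');
};

q.push(['chapter 1', 'chapter 2', 'chapter 3']);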

Complete code run steps

  1. Open the novel's home page, e.g. www.biqudu.com/31_31729/, in Chrome
  2. Open DevTools on that page, create a new snippet, and paste in the complete code below
  3. Run the snippet to start crawling the novel
(async function () {
    // https://www.biqudu.com/31_31729/

    // Load a third-party script by URL
    async function loadLibrary(url) {
        return new Promise((resolve, reject) => {
            let script = document.createElement('script');
            script.onload = resolve;
            script.onerror = reject;
            script.src = url;
            document.body.appendChild(script);
        });
    }

    // Save a string as a text file and trigger a download
    function saveFile(string, fileName) {
        var a = document.createElement('a');
        a.download = fileName;
        var blob = new Blob([string], {
            type: 'text/plain'
        });
        a.href = window.URL.createObjectURL(blob);
        a.click();
    }

    // Fetch a page and parse it into a detached DOM element
    async function getHtml(url) {
        let response = await fetch(url);
        let htmlText = await response.text();
        let html = document.createElement('html');
        html.innerHTML = htmlText;
        return html;
    }

    // Collect the anchor elements for all chapters
    async function getDirectory(url) {
        let page = await getHtml(url);
        let directory = Array.from(page.querySelectorAll('#list dd a'));
        // skip the first 12 anchor elements
        return directory.slice(12);
    }

    // Download one chapter and return its title plus text
    async function getSection({ href, innerText: title }) {
        console.log(`start fetching ${title}`);
        let html = await getHtml(href);
        let content = html.querySelector('#content');
        // remove any script tags mixed into the chapter content
        Array.from(content.querySelectorAll('script')).forEach(scriptTag => content.removeChild(scriptTag));
        var text = title + '\r\n' + content.innerText + '\r\n';
        return text;
    }

    async function run() {
        let asyncLibUrl = 'https://cdn.bootcss.com/async/2.1.4/async.js';
        await loadLibrary(asyncLibUrl);
        let directory = await getDirectory(location.href);
        // crawl chapters with a concurrency of 6
        let q = window.async.queue(async function (section, taskDone) {
            try {
                section.text = await getSection(section);
            } catch (e) {
                console.error(e);
                section.text = 'section download failed: ' + e;
            } finally {
                taskDone();
            }
        }, 6);
        // once every chapter is done, join the text and save it as a file
        q.drain = function () {
            let name = document.querySelector('#maininfo h1').innerText + '.txt';
            console.log(`${name} download completed`);
            let content = "";
            directory.forEach(function ({ text }) {
                content += text;
            });
            saveFile(content, name);
        };
        q.push(directory);
    }

    await run();
}());

Summary of the pros and cons of crawling in Chrome DevTools

Advantages

  • You can analyze nodes directly using the browser’s native DOM API
  • Direct access to IndexedDB and WebSQL databases
  • Cookies are handled automatically by the browser
  • A full-featured development environment
  • Suitable for simple crawlers
  • …

Disadvantages

  • Restricted by the same-origin policy
  • Concurrency limit (Chrome allows at most 6 concurrent requests per domain)
  • …
