The previous article introduced Puppeteer; in this article we look at how to take screenshots with it.

Official Demo

const puppeteer = require('puppeteer');
// Minimal demo: launch a browser, open one page, and save a screenshot to baidu.png.
(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto('http://www.baidu.com');
  await page.screenshot({ path: 'baidu.png' });
  await browser.close();
})();

Application scenarios

1. Full-page screenshot

const puppeteer = require('puppeteer');
// Full-page screenshot: load the page, scroll once so lazily loaded content
// is requested, then capture the whole page to baidu.png.
(async () => {
  const options = {
    headless: true,
    timeout: 0,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
    ignoreHTTPSErrors: true
  };
  const browser = await puppeteer.launch(options);
  try {
    const page = await browser.newPage();
    await page.goto('http://www.baidu.com', {
      // https://pptr.dev/#?product=Puppeteer&version=v1.18.1&show=api-pagegotourl-options
      waitUntil: ['networkidle2'],
      timeout: 20 * 1000
    });

    // Even with waitUntil 'networkidle2', asynchronously loaded content may
    // not be fully loaded. Scroll manually or wait for a delay to handle it.
    await page.evaluate(() => {
      window.scrollTo(0, window.innerHeight);
    });

    // Alternative: wait for a fixed delay instead, e.g. `await page.waitFor(2000);`

    await page.screenshot({
      path: 'baidu.png',
      fullPage: true
    });
  } finally {
    // Always close the browser so a failed navigation or screenshot does not
    // leave a Chromium process running.
    await browser.close();
  }
})();

2. Screenshot of a specific element or of a specified region

const puppeteer = require('puppeteer');
// Element screenshot: measure the element matching a selector and capture
// only that rectangle to baidu.png.
(async () => {
  /**
   * Measure the first element matching a selector.
   *
   * @param {object} page Puppeteer page to query
   * @param {string} element CSS selector of the element to measure
   * @return {Promise<{left: number, top: number, width: number, height: number}>}
   *   the element's rectangle in page (document) coordinates
   */
  async function getElementBounding(page, element) {
    // The callback runs inside the page, equivalent to calling
    // document.querySelector(element).getBoundingClientRect() there.
    // https://pptr.dev/#?product=Puppeteer&version=v1.18.1&show=api-pageevalselector-pagefunction-args-1
    return page.$eval(element, (e) => {
      const { left, top, width, height } = e.getBoundingClientRect();
      // getBoundingClientRect() is relative to the viewport; add the scroll
      // offset so the rectangle stays correct after the page has been scrolled.
      return {
        left: left + window.scrollX,
        top: top + window.scrollY,
        width,
        height
      };
    });
  }

  const options = {
    headless: true,
    timeout: 0,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
    ignoreHTTPSErrors: true
  };
  const browser = await puppeteer.launch(options);
  try {
    const page = await browser.newPage();
    await page.goto('http://www.baidu.com', {
      waitUntil: ['networkidle2'],
      timeout: 20 * 1000
    });
    // Scroll once so lazily loaded content is requested before measuring.
    await page.evaluate(() => {
      window.scrollTo(0, window.innerHeight);
    });
    const pos = await getElementBounding(page, '.head_wrapper');
    await page.screenshot({
      path: 'baidu.png',
      // Adjust the clip region to suit your business scenario
      clip: {
        x: pos.left,
        y: pos.top,
        width: pos.width,
        height: pos.height
      }
    });
  } finally {
    // Always close the browser, even if navigation or measuring fails.
    await browser.close();
  }
})();

Performance optimization

When the workload is large and complex, there are often hundreds or thousands of pages to process. We usually use Promise.all() to run the asynchronous work in parallel and take screenshots in batches, but opening a large number of tabs at once causes machine performance to drop sharply.

So we make the number of browsers and the number of tabs per browser configurable. They can be tuned flexibly to suit machines with different specifications when running the task, which keeps the machine from being overwhelmed.

/** * startBrowser startBrowser * @param {array} urlList data source * @param {number} browserNum number of open browsers * @param {number} tabNum Tabs opened by each browser * @return {array} */

  async startBrowser(urlList, browserNum, tabNum) {
    // A single browser executes the process
    const action = async (startPosition, length) => {
      const options = {
        headless: true.timeout: 0.args: [ '--no-sandbox'.'--disable-setuid-sandbox'].ignoreHTTPSErrors: true
      };

      const platform = os.platform().toLocaleLowerCase();

      if (platform === 'linux') {
        // Install path for Chromium on the environment
        options.executablePath = path.join(__dirname, '.. /.. /.. /chrome-linux/chrome');
      }

      const browser = await puppeteer.launch(options);

      // Process data source urlList
      const promises = [];
      for (let i = 0; i < length; i++) {
        const groupIndex = parseInt(i / tabNum, 10);
        promises[groupIndex] = promises[groupIndex] ? promises[groupIndex] : [];
        promises[parseInt(i / tabNum, 10)].push(urlList[i + startPosition]);
      }

      const pagesGroups = [];
      for (let i = 0; i < promises.length; i++) {
        pagesGroups.push(
          await Promise.all(
            promises[i].map(async option => {
              // Execute the screenshot logic of a single TAB page and return the data after processing according to their respective business scenarios
              return await this.startPage(browser, option); }))); }// Process callback data
      const pages = [];
      pagesGroups.map(pagesGroup= > {
        pagesGroup.map(page= > {
          pages.push(page);
        });
      });

      await browser.close();
      return pages;
    };

    // Split the data to be captured based on the number of open browsers
    const result = [], promiseArr = [];
    for (let i = 0, len = urlList.length; i < browserNum; i++) {
      let SingleLen = parseInt(len / browserNum, 0);
      const startPosition = SingleLen * i;
      if (i === browserNum - 1) {
        SingleLen = len - SingleLen * i;
      }
      promiseArr.push(action(startPosition, SingleLen));
    }
    const allList = await Promise.all(promiseArr);
    allList.map(item= > {
      item.map(subItem= > {
        result.push(subItem);
      });
    });
    return result;
  }
Copy the code

You can write your own cluster.js to run Puppeteer across multiple processes, for example:

const cluster = require('cluster');
const startBrowser = require('./startBrowser');
const numCPUs = require('os').cpus().length;

const urls = require('./testData.js');

// The master forks one worker per CPU core; each worker processes its own
// slice of `urls` and then exits.
(async () => {
  if (cluster.isMaster) {
    for (let i = 0; i < numCPUs; i++) {
      cluster.fork();
    }
    return;
  }

  // Split the business data so each core runs its own share; the worker whose
  // id equals numCPUs also takes the remainder.
  let len = Math.floor(urls.length / numCPUs);
  const start = len * (cluster.worker.id - 1);
  if (cluster.worker.id === numCPUs) {
    len = urls.length - len * (numCPUs - 1);
  }

  try {
    // NOTE(review): assumes ./startBrowser exports a function taking
    // (urls, start, length) - confirm against that module's signature.
    await startBrowser(urls, start, len);
  } catch (err) {
    console.error(err);
    process.exitCode = 1;
  } finally {
    // Always shut the worker down, even when the job fails.
    cluster.worker.kill();
  }
})();
Copy the code

You can also use PM2 to manage the Node processes, so you do not have to write your own cluster.js logic.

// http://pm2.keymetrics.io/docs/usage/pm2-doc-single-page/
{
  "apps": [
    {
      "name": "screenshot-demo",
      "script": "./index.js", // Entry file
      "env": {
        "NODE_ENV": "production"
      },
      "error_file": "./logs/app-err.log",
      "out_file": "./logs/app-out.log",
      "log_date_format": "YYYY-MM-DD HH:mm Z",
      "instances": 4, // Number of processes to start; PM2 load-balances across them automatically, improving overall performance and stability
      "exec_mode": "cluster"
    }
  ]
}

How to start

pm2 start pm2.json
Copy the code

Contact email: [email protected]