Database preparation
- Use the MySQL relational database
- Install the dependency packages (an install command sketch follows the snippet below)
"dependencies": {
"@types/sequelize": "^ 4.28.9"."jquery": "^ 3.6.0".// Parse the DOM structure
"mysql2": "^ 2.2.5." "."puppeteer": "^ 9.1.1." "."sequelize": "6.5.0"."tedious": "^ 11.0.8"
}
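With the package.json above in place, a plain npm install pulls everything in; the explicit form below is just an alternative (package names copied from the snippet):

npm install
# or, equivalently, install the packages one by one:
npm install @types/sequelize jquery mysql2 puppeteer sequelize tedious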
Database connection
- Use Sequelize (an ORM framework) to connect to the database
const { Sequelize } = require('sequelize')
const sequelize = new Sequelize('database', 'root', 'root', { // change the database name, user, and password to your own
  host: 'localhost',
  dialect: 'mysql'
})
module.exports = sequelize
- Define the model
const sequelize = require('./db')
const { DataTypes } = require('sequelize')
module.exports = sequelize.define("Profile", {
url: {
type: DataTypes.STRING,
allowNull: false}})Copy the code
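Before any rows can be inserted, the table for the Profile model has to exist. A minimal sketch of how the connection and model above could be wired up, assuming the connection file is saved as ./db.js and the model as ./models/profile.js (the paths used elsewhere in this post):

// sync-db.js - hypothetical helper: verify the MySQL connection and create the table
const sequelize = require('./db')
const Profile = require('./models/profile')

;(async () => {
  try {
    await sequelize.authenticate() // throws if the credentials or host are wrong
    await sequelize.sync()         // creates the table for the Profile model if it does not exist
    console.log('database ready')
  } catch (err) {
    console.log(err)
  }
})()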
Puppeteer introduction
- A Node.js library that drives headless Chrome, used here as the crawler tool (a minimal example follows this list)
- See the Chinese documentation website for details
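For orientation, the whole Puppeteer workflow used in this post boils down to a few calls; a minimal sketch (the URL is a placeholder, everything else is the standard Puppeteer API):

const puppeteer = require('puppeteer')

;(async () => {
  const browser = await puppeteer.launch({ headless: true }) // no visible browser window
  const page = await browser.newPage()
  await page.goto('https://example.com', { waitUntil: 'networkidle2' })
  const html = await page.content() // full HTML of the rendered page
  console.log(html.length)
  await browser.close()
})()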
Use Puppeteer to crawl web content
const puppeteer = require('puppeteer')
const $ = require('jquery') // note: inside page.evaluate, $ refers to the jQuery already loaded by the target page

function sleep(delay) {
  return new Promise(resolve => setTimeout(resolve, delay));
}

// URL of the page to crawl
const url = `https://www.woyaogexing.com/touxiang/katong/index.html`;

module.exports = async () => {
  try {
    console.log('start visiting the target page');
    const browser = await puppeteer.launch({
      args: ['--no-sandbox'], // disable the sandbox
      dumpio: false,
      headless: false // whether to run headless; true means no browser window is opened, and true is the default
    });
    // args: extra arguments passed to the Chrome instance, e.g. '--ash-host-window-bounds=1024x768' to set the window size; the full flag list is in the Chromium documentation
    // dumpio: pipe the browser process's stdout and stderr into process.stdout and process.stderr; defaults to false
    const page = await browser.newPage();
    await page.goto(url, {
      waitUntil: 'networkidle2' // wait until the network is nearly idle, i.e. the page has finished loading
    });
    let arr = [] // holds the scraped content
    await sleep(3000);
    await page.waitForSelector('.page a:nth-last-of-type(1)'); // wait until the pagination link is rendered, otherwise asynchronously loaded elements cannot be found yet
    for (let i = 0; i < 1; i++) { // runs once; raise the limit to crawl more pages
      // wait before scraping each page
      await sleep(5000);
      let a1 = await pageSpider()
      a1.forEach(item => arr.push(item))
      await page.click('.page a:nth-last-of-type(1)'); // click the "next page" link
    }
    async function pageSpider() {
      const result = await page.evaluate(() => {
        var items = $('.pMain .txList a.img');
        var links = [];
        // check whether the list has any items
        if (items.length >= 1) {
          console.log(items, 'items')
          items.each((index, item) => {
            let it = $(item);
            let url = 'https:' + it.find('img').attr('src')
            links.push({
              url
            });
          });
        }
        return links.filter(item => Object.keys(item).length > 0);
      });
      return result
    }
    await browser.close();
    return arr
  } catch (err) {
    console.log(err);
  }
};
Note: the DOM structure differs from site to site, so the selectors above have to be analyzed separately for each target page (a selector-only alternative is sketched below).
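The extraction above relies on $ being available in the page context, i.e. on the jQuery that the target site itself loads. If a target page ships no jQuery, the same extraction can be written with plain DOM APIs inside page.evaluate; a sketch of a replacement body for pageSpider(), reusing the selectors from the code above (page is the Puppeteer page object):

const result = await page.evaluate(() => {
  // standard DOM APIs, no jQuery needed in the page
  const anchors = document.querySelectorAll('.pMain .txList a.img')
  const links = []
  anchors.forEach(a => {
    const img = a.querySelector('img')
    if (img && img.getAttribute('src')) {
      links.push({ url: 'https:' + img.getAttribute('src') })
    }
  })
  return links
})
return result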
Import the data into the database
// Load the crawler file
const Profile = require('./models/profile')
const result = require('./spider')
result().then(r => Profile.bulkCreate(r))
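bulkCreate returns a promise, so connection or schema errors are easy to miss with the one-liner above; a slightly more defensive version of the same import step, assuming the same file paths:

const Profile = require('./models/profile')
const spider = require('./spider')

spider()
  .then(rows => Profile.bulkCreate(rows))                   // insert every scraped { url } row in one query
  .then(created => console.log(`saved ${created.length} rows`))
  .catch(err => console.log(err))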
The data structure stored in the database
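Each stored row ends up holding the scraped url plus the id, createdAt, and updatedAt columns that Sequelize adds by default; a quick sketch for inspecting them from Node (same model file as above):

const Profile = require('./models/profile')

Profile.findAll({ raw: true })       // plain objects instead of model instances
  .then(rows => console.log(rows))   // e.g. [{ id: 1, url: 'https://...', createdAt: ..., updatedAt: ... }]
  .catch(err => console.log(err))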
Project address github.com/ch1ies/Spid…