Database preparation

  1. Use the MySQL relational database
  2. Install the dependency packages
  "dependencies": {
    "@types/sequelize": "^4.28.9",
    "jquery": "^3.6.0", // Parse the DOM structure
    "mysql2": "^2.2.5",
    "puppeteer": "^9.1.1",
    "sequelize": "6.5.0",
    "tedious": "^11.0.8"
  }
Copy the code

Database connection

  1. Use Sequelize (ORM framework) to connect
   const { Sequelize } = require('sequelize')
   // Shared Sequelize connection. The positional arguments are the database
   // name, the user name and the password - replace them with your own values.
   const sequelize = new Sequelize('database', 'root', 'root', {
     host: 'localhost',
     dialect: 'mysql',
   });

module.exports = sequelize
Copy the code
  2. Define the model
const sequelize = require('./db')
const { DataTypes } = require('sequelize')

module.exports = sequelize.define("Profile", {
url: {
  type: DataTypes.STRING,
  allowNull: false}})Copy the code

Introduction to Puppeteer

  • A crawler tool (a Node.js library that drives a Chrome/Chromium browser)
  • See the links below

Chinese Website

Use Puppeteer to crawl web content

const puppeteer = require('puppeteer');
// Functions passed to page.evaluate() execute inside the browser page, where
// this Node-side binding is not available.
const $ = require('jquery');
/**
 * Pause for a given amount of time.
 *
 * @param {number} delay - Time to wait, in milliseconds.
 * @returns {Promise<void>} Promise that resolves once the timer fires.
 */
function sleep(delay) {
  return new Promise((resolve) => setTimeout(resolve, delay));
}
// Address of the listing page the crawler starts from.
const url = 'https://www.woyaogexing.com/touxiang/katong/index.html';
module.exports = async() = > {try {
    console.log('start visit the target page');
    const browser = await puppeteer.launch({
      args: ['--no-sandbox'].// Not sandbox mode
      dumpio: false.headless: false // Whether to run in browser headless mode. True indicates that the browser is not opened. The default value is true
    });
    //args: Other parameters passed to the Chrome instance, such as "-- ash-host-window-bounds=1024x768" to set the browser window size. More parameter lists can be found here
    // Dumpio imports browser processes stdout and stderr into process.stdout and process.stderr. The default is false.
    const page = await browser.newPage();
    await page.goto(url, {
      waitUntil: 'networkidle2' // Wait for the page to stop moving, indicating that the load is complete
    });
    let arr = [] // Save the captured content
    await sleep(3000);
    await page.waitForSelector('.page a:nth-last-of-type(1)'); // Asynchronously, wait until the element is loaded, otherwise the asynchronously loaded element cannot be retrieved
    for (let i = 0; i < 1; i++) {
      // Set the wait time
      await sleep(5000);
      let a1 = await pageSpider()
      a1.forEach(item= > arr.push(item))
      await page.click('.page a:nth-last-of-type(1)'); // Click the button once
    }
    async function pageSpider() {
      const result = await page.evaluate(() = > {
        var items = $('.pMain .txList a.img ');
        var links = [];
        // Check if the list has a value
        if (items.length >= 1) {
          console.log(items, 'items')
          items.each((index, item) = > {
            let it = $(item);
            let url = 'https:' + it.find('img').attr('src')
            links.push({
              url
            });
          });
        }
        return links.filter(item= > Object.keys(item).length > 0);
      });
      return result
    }
    browser.close();
    return arr
  } catch (err) {
    console.log(err); }};Copy the code

Note: every website has its own DOM structure, so the selectors used above must be worked out separately for each page you want to crawl.

Import the data into the database

// Load the crawler file
const Profile = require('./models/profile')
const result = require('./spider')
// Run the crawler, then store everything it collected with one bulk insert.
result()
  .then((records) => {
    // The crawler logs its own failures and then resolves with undefined,
    // so there may be nothing to store.
    if (!Array.isArray(records) || records.length === 0) {
      console.log('no records crawled, nothing to import');
      return undefined;
    }
    return Profile.bulkCreate(records);
  })
  .catch((err) => {
    // Surface database errors instead of leaving the rejection unhandled.
    console.error(err);
  });

Copy the code

Structure of the imported data

Project address github.com/ch1ies/Spid…