Database preparation
- Use the MySQL relational database
- Install the dependency packages (an install command sketch follows the snippet below)
"dependencies": {
"@types/sequelize": "^ 4.28.9"."jquery": "^ 3.6.0".// Parse the DOM structure
"mysql2": "^ 2.2.5." "."puppeteer": "^ 9.1.1." "."sequelize": "6.5.0"."tedious": "^ 11.0.8"
}
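With the package.json above in place, a plain npm install pulls everything in; the explicit form below is just an alternative (package names copied from the snippet):

npm install
# or, equivalently, install the packages one by one:
npm install @types/sequelize jquery mysql2 puppeteer sequelize tedious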
Database connection
- Use Sequelize (an ORM framework) to connect to the database
const { Sequelize } = require('sequelize')
const sequelize = new Sequelize('database', 'root', 'root', { // change the database name, user, and password to your own
  host: 'localhost',
  dialect: 'mysql'
})
module.exports = sequelize
- Define the model
const sequelize = require('./db')
const { DataTypes } = require('sequelize')
module.exports = sequelize.define("Profile", {
url: {
type: DataTypes.STRING,
allowNull: false}})Copy the code
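Before any rows can be inserted, the table for the Profile model has to exist. A minimal sketch of how the connection and model above could be wired up, assuming the connection file is saved as ./db.js and the model as ./models/profile.js (the paths used elsewhere in this post):

// sync-db.js - hypothetical helper: verify the MySQL connection and create the table
const sequelize = require('./db')
const Profile = require('./models/profile')

;(async () => {
  try {
    await sequelize.authenticate() // throws if the credentials or host are wrong
    await sequelize.sync()         // creates the table for the Profile model if it does not exist
    console.log('database ready')
  } catch (err) {
    console.log(err)
  }
})()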
Puppeteer introduction
- A Node.js library that drives headless Chrome, used here as the crawler tool (a minimal example follows this list)
- See the Chinese documentation website for details
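For orientation, the whole Puppeteer workflow used in this post boils down to a few calls; a minimal sketch (the URL is a placeholder, everything else is the standard Puppeteer API):

const puppeteer = require('puppeteer')

;(async () => {
  const browser = await puppeteer.launch({ headless: true }) // no visible browser window
  const page = await browser.newPage()
  await page.goto('https://example.com', { waitUntil: 'networkidle2' })
  const html = await page.content() // full HTML of the rendered page
  console.log(html.length)
  await browser.close()
})()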
Use Puppeteer to crawl web content
const puppeteer = require('puppeteer')
const $ = require('jquery') // note: inside page.evaluate, $ refers to the jQuery already loaded by the target page

function sleep(delay) {
  return new Promise(resolve => setTimeout(resolve, delay));
}

// URL of the page to crawl
const url = `https://www.woyaogexing.com/touxiang/katong/index.html`;

module.exports = async () => {
  try {
    console.log('start visiting the target page');
    const browser = await puppeteer.launch({
      args: ['--no-sandbox'], // disable the sandbox
      dumpio: false,
      headless: false // whether to run headless; true means no browser window is opened, and true is the default
    });
    // args: extra arguments passed to the Chrome instance, e.g. '--ash-host-window-bounds=1024x768' to set the window size; the full flag list is in the Chromium documentation
    // dumpio: pipe the browser process's stdout and stderr into process.stdout and process.stderr; defaults to false
    const page = await browser.newPage();
    await page.goto(url, {
      waitUntil: 'networkidle2' // wait until the network is nearly idle, i.e. the page has finished loading
    });
    let arr = [] // holds the scraped content
    await sleep(3000);
    await page.waitForSelector('.page a:nth-last-of-type(1)'); // wait until the pagination link is rendered, otherwise asynchronously loaded elements cannot be found yet
    for (let i = 0; i < 1; i++) { // runs once; raise the limit to crawl more pages
      // wait before scraping each page
      await sleep(5000);
      let a1 = await pageSpider()
      a1.forEach(item => arr.push(item))
      await page.click('.page a:nth-last-of-type(1)'); // click the "next page" link
    }
    async function pageSpider() {
      const result = await page.evaluate(() => {
        var items = $('.pMain .txList a.img');
        var links = [];
        // check whether the list has any items
        if (items.length >= 1) {
          console.log(items, 'items')
          items.each((index, item) => {
            let it = $(item);
            let url = 'https:' + it.find('img').attr('src')
            links.push({
              url
            });
          });
        }
        return links.filter(item => Object.keys(item).length > 0);
      });
      return result
    }
    await browser.close();
    return arr
  } catch (err) {
    console.log(err);
  }
};
Note: the DOM structure differs from site to site, so the selectors above have to be analyzed separately for each target page (a selector-only alternative is sketched below).
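The extraction above relies on $ being available in the page context, i.e. on the jQuery that the target site itself loads. If a target page ships no jQuery, the same extraction can be written with plain DOM APIs inside page.evaluate; a sketch of a replacement body for pageSpider(), reusing the selectors from the code above (page is the Puppeteer page object):

const result = await page.evaluate(() => {
  // standard DOM APIs, no jQuery needed in the page
  const anchors = document.querySelectorAll('.pMain .txList a.img')
  const links = []
  anchors.forEach(a => {
    const img = a.querySelector('img')
    if (img && img.getAttribute('src')) {
      links.push({ url: 'https:' + img.getAttribute('src') })
    }
  })
  return links
})
return result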
Import the data into the database
// Load the crawler file
const Profile = require('./models/profile')
const result = require('./spider')
result().then(r => Profile.bulkCreate(r))
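bulkCreate returns a promise, so connection or schema errors are easy to miss with the one-liner above; a slightly more defensive version of the same import step, assuming the same file paths:

const Profile = require('./models/profile')
const spider = require('./spider')

spider()
  .then(rows => Profile.bulkCreate(rows))                   // insert every scraped { url } row in one query
  .then(created => console.log(`saved ${created.length} rows`))
  .catch(err => console.log(err))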
The data structure stored in the database
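Each stored row ends up holding the scraped url plus the id, createdAt, and updatedAt columns that Sequelize adds by default; a quick sketch for inspecting them from Node (same model file as above):

const Profile = require('./models/profile')

Profile.findAll({ raw: true })       // plain objects instead of model instances
  .then(rows => console.log(rows))   // e.g. [{ id: 1, url: 'https://...', createdAt: ..., updatedAt: ... }]
  .catch(err => console.log(err))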
Project address github.com/ch1ies/Spid…