First, the basic use of scheduled tasks

  1. Create a new schedule folder under the app folder and create watchfile.js inside it (the file name can be customized):
const Subscription = require('egg').Subscription;

let i = 0;

class WatchFile extends Subscription {
  static get schedule() {
    return {
      interval: '1s', // execute every second
      type: 'all', // specify that all worker processes execute the task
    };
  }

  async subscribe() {
    i++;
    console.log(i);
  }
}

module.exports = WatchFile;
  2. Once the app starts, the console will print the following every second:
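Illustrative output (with type: 'all', every worker process runs the task, so with multiple workers each value may appear once per worker):

1
2
3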

  3. Here is a shorthand version:
let i = 0;
module.exports = {
  schedule: {
    interval: '1s', // execute every second
    type: 'all', // specify that all workers need to be executed
  },
  async task(ctx) {
    i++;
    console.log(i);
  },
};
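Besides a plain interval, Egg's schedule config also accepts a cron expression (six fields, seconds first) and type: 'worker' to run the task on only one worker process. A minimal sketch under those assumptions:

module.exports = {
  schedule: {
    cron: '0 */10 * * * *', // second minute hour day month weekday: every 10 minutes
    type: 'worker', // only one worker process runs the task
  },
  async task(ctx) {
    ctx.logger.info('cron task fired');
  },
};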

Second, periodically crawl the content of a specified web page

  1. Install the cheerio module

This module parses the contents of HTML pages using jQuery-like syntax.

npm install cheerio
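As a quick illustration of that jQuery-like API (the HTML snippet below is invented for this example):

const cheerio = require('cheerio');

const $ = cheerio.load('<ul><li class="news">foo</li><li class="news">bar</li></ul>');

$('.news').each(function() {
  console.log($(this).text()); // prints "foo" then "bar"
});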
  2. Import the module:
const cheerio = require('cheerio');
  3. Define the crawler module spider.js in the service folder (app/service):
'use strict';

const Service = require('egg').Service;

class SpiderService extends Service {
  async requestUrl(url) {
    const result = await this.ctx.curl(url);
    return result;
  }
}

module.exports = SpiderService;
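By default, ctx.curl resolves with result.data as a Buffer, which is why the next step calls toString(). As a sketch of an alternative, assuming the standard urllib options that ctx.curl passes through, the service could request text directly and bound the request time:

const Service = require('egg').Service;

class SpiderService extends Service {
  async requestUrl(url) {
    // dataType: 'text' returns result.data as a string instead of a Buffer;
    // timeout fails the request if it takes longer than 5 seconds
    return this.ctx.curl(url, { dataType: 'text', timeout: 5000 });
  }
}

module.exports = SpiderService;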
  4. The scheduled task parses the content fetched by the service:
const cheerio = require('cheerio');

module.exports = {
  schedule: {
    interval: '1s', // execute every second
    type: 'all', // specify that all workers need to be executed
  },
  async task(ctx) {
    const url = 'https://news.baidu.com';
    const result = await ctx.service.spider.requestUrl(url);
    const htmlData = result.data.toString();
    const $ = cheerio.load(htmlData, { decodeEntities: false });

    $('.hotnews a').each(function() {
      console.log($(this).html());
    });
  },
};
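To collect just the headline text instead of raw anchor markup, a small variation on the same task (still assuming Baidu's .hotnews markup stays as above):

const cheerio = require('cheerio');

module.exports = {
  schedule: {
    interval: '1s',
    type: 'all',
  },
  async task(ctx) {
    const result = await ctx.service.spider.requestUrl('https://news.baidu.com');
    const $ = cheerio.load(result.data.toString(), { decodeEntities: false });

    // .map()/.get() turn the matched anchors into a plain array of strings
    const titles = $('.hotnews a')
      .map(function() {
        return $(this).text();
      })
      .get();
    ctx.logger.info(titles);
  },
};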