[toc]

Case description

Use SpringBoot, Mybatis-Plus, and WebMagic to crawl job postings from 51job and save them to a MySQL database.

Create a project

Import Maven dependencies

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.2.5.RELEASE</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>
    <groupId>com.hg</groupId>
    <artifactId>spider-demo</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>spider-demo</name>
    <description/>

    <properties>
        <java.version>1.8</java.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>cn.hutool</groupId>
            <artifactId>hutool-all</artifactId>
            <version>5.1.0</version>
        </dependency>
        <!-- Druid database connection pool -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>druid-spring-boot-starter</artifactId>
            <version>1.1.10</version>
        </dependency>
        <!-- MySQL connector -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <scope>runtime</scope>
        </dependency>
        <!-- Mybatis-Plus -->
        <dependency>
            <groupId>com.baomidou</groupId>
            <artifactId>mybatis-plus-boot-starter</artifactId>
            <version>3.0.5</version>
        </dependency>
        <!-- WebMagic -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
        </dependency>
        <!-- Guava, required by the Bloom filter duplicate remover -->
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>16.0</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>
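One note on the dependencies: webmagic-extension's BloomFilterDuplicateRemover (used later in the Service) is built on Guava's BloomFilter, which is why guava is declared explicitly. The article also never shows the Spring Boot entry class; assuming the usual Initializr layout, a minimal sketch (class name and @MapperScan package are assumptions, not from the original) would be:

package com.hg.spider;

import org.mybatis.spring.annotation.MapperScan;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

// Assumed entry class; @MapperScan registers the Dao interface as a MyBatis mapper
@SpringBootApplication
@MapperScan("com.hg.spider.dao")
public class SpiderDemoApplication {

    public static void main(String[] args) {
        SpringApplication.run(SpiderDemoApplication.class, args);
    }
}

Without @MapperScan (or an @Mapper annotation on the interface), the JobInfoDao bean would not be registered and the Service below would fail to autowire.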

Table creation statement

Create a database named spider, then create the table job_info:

CREATE TABLE `job_info` (
  `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT 'Primary key id',
  `company_name` varchar(100) DEFAULT NULL COMMENT 'Company name',
  `company_addr` varchar(200) DEFAULT NULL COMMENT 'Company contact information',
  `job_name` varchar(100) DEFAULT NULL COMMENT 'Job title',
  `job_addr` varchar(50) DEFAULT NULL COMMENT 'Work location',
  `salary` varchar(50) DEFAULT NULL COMMENT 'Salary range',
  `url` varchar(150) DEFAULT NULL COMMENT 'Job detail page URL',
  `time` varchar(10) DEFAULT NULL COMMENT 'Post time',
  `job_detail` text COMMENT 'Job details',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=2 DEFAULT CHARSET=utf8 COMMENT='Job postings';
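One caveat worth knowing: MySQL's utf8 charset only covers 3-byte UTF-8 sequences. If job descriptions can contain emoji or other 4-byte characters, utf8mb4 is the safer choice, especially for the job_detail column.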

Add a configuration file

Create application.yml:

spring:
  application:
    name: spider-service
  jackson:
    time-zone: GMT+8
    date-format: yyyy-MM-dd HH:mm:ss
  datasource:
    driver-class-name: com.mysql.cj.jdbc.Driver
    url: jdbc:mysql://localhost:3306/spider?useUnicode=true&characterEncoding=utf8&autoReconnect=true&useSSL=false
    username: root
    password: root
    type: com.alibaba.druid.pool.DruidDataSource
    druid:
      initialSize: 10
      minIdle: 10
      maxActive: 50
      maxWait: 60000
      timeBetweenEvictionRunsMillis: 60000
      minEvictableIdleTimeMillis: 300000
      validationQuery: SELECT 1 FROM DUAL
      testWhileIdle: true
      testOnBorrow: false
      testOnReturn: false
      poolPreparedStatements: true
      maxPoolPreparedStatementPerConnectionSize: 20
      filters: stat,wall
      connectionProperties: druid.stat.mergeSql=true;druid.stat.slowSqlMillis=5000

# mybatis
mybatis-plus:
  mapper-locations: classpath:mapper/**/*.xml
  typeAliasesPackage: com.hg.*.entity
  global-config:
    db-config:
      id-type: auto
      field-strategy: not_empty
      table-underline: true
      db-type: mysql
    refresh: true
  configuration:
    map-underscore-to-camel-case: true
    cache-enabled: false

logging:
  level:
    org.springframework.web: info
    org.apache.http: info
    us.codecraft.webmagic: info
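Note that com.mysql.cj.jdbc.Driver is the Connector/J 8.x driver class (for the 5.x connector it would be com.mysql.jdbc.Driver). Depending on the MySQL server's time zone configuration, Connector/J 8.x may also require appending serverTimezone=GMT%2B8 (or similar) to the JDBC URL.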

Write a POJO

package com.hg.spider.entity;

import com.baomidou.mybatisplus.annotation.TableId;
import com.baomidou.mybatisplus.annotation.TableName;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;

@Data
@TableName("job_info")
@Slf4j
public class JobInfo {

    @TableId
    private Long id;
    /**
     * Company name
     */
    private String companyName;
    /**
     * Company address
     */
    private String companyAddr;
    /**
     * Job title
     */
    private String jobName;
    /**
     * Work location
     */
    private String jobAddr;
    /**
     * Job details
     */
    private String jobDetail;
    /**
     * Salary
     */
    private String salary;
    /**
     * Crawled URL
     */
    private String url;
    /**
     * Job posting time
     */
    private String time;
}

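No column-mapping annotations are needed on the fields: with map-underscore-to-camel-case: true and table-underline: true in application.yml, the camelCase fields (companyName, jobAddr, and so on) map automatically to the snake_case columns (company_name, job_addr, and so on) of job_info, and @TableId combined with id-type: auto marks id as the auto-increment primary key.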

Write the Dao

package com.hg.spider.dao;

import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import com.hg.spider.entity.JobInfo;

/**
 * @Author skh
 * @Date 2020/3/21 16:27
 * @Desc
 */
public interface JobInfoDao extends BaseMapper<JobInfo> {
}
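Because JobInfoDao extends BaseMapper&lt;JobInfo&gt;, MyBatis-Plus generates the common single-table CRUD methods (insert, deleteById, selectById, selectList with a QueryWrapper, and so on) at runtime, so this example needs no mapper XML even though mapper-locations is configured.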

Write the Service

package com.hg.spider.service;

import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import com.hg.spider.dao.JobInfoDao;
import com.hg.spider.entity.JobInfo;
import com.hg.spider.webmagic.JobProcessor;
import com.hg.spider.webmagic.MysqlPipeline;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;

import java.util.List;

/**
 * @Author skh
 * @Date 2020/3/21 12:10
 * @Desc
 */
@Service
@Slf4j
public class JobInfoService extends ServiceImpl<JobInfoDao, JobInfo> {

    // The URL to begin the crawl from (a 51job search results page; the long
    // encoded segment is the search keyword "房产经纪人", double URL-encoded)
    String url = "https://search.51job.com/list/080200,000000,0000,00,9,99,%25E6%2588%25BF%25E4%25BA%25A7%25E7%25BB%258F%25E7%25BA%25AA%25E4%25BA%25BA,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=";

    @Autowired
    private MysqlPipeline mysqlPipeline;

    @Autowired
    private JobProcessor jobProcessor;

    public void getJobInfo() {
        log.info("Start crawling data.");

        // Set the crawler configuration
        Spider.create(jobProcessor)
                .addUrl(url) // Set the initial url to crawl
                // Use bloom filter to filter duplicate urls, need to import guava package
                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(100000)))
                .thread(50) // Set the number of threads
                .addPipeline(mysqlPipeline) // Set persistence
                .run();
    }

    public List<JobInfo> selectJobInfoByUrl(String url) {
        QueryWrapper<JobInfo> wrapper = new QueryWrapper<>();
        wrapper.eq("url", url);
        List<JobInfo> jobInfos = this.baseMapper.selectList(wrapper);
        return jobInfos;
    }
}
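One thing to be aware of here: Spider.run() is synchronous, so the HTTP request that triggers getJobInfo() will block until the entire crawl finishes. If you would rather have the endpoint return immediately, WebMagic also provides runAsync(), which starts the spider on its own thread. A minimal variation of the call above (my sketch, not from the original code):

        // Same configuration as above, but non-blocking: runAsync() starts the
        // crawl on a background thread and returns immediately
        Spider.create(jobProcessor)
                .addUrl(url)
                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(100000)))
                .thread(50)
                .addPipeline(mysqlPipeline)
                .runAsync();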

Write the Controller

package com.hg.spider.controller;

import com.hg.spider.service.JobInfoService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RestController;

/**
 * @Author skh
 * @Date 2020/3/21
 * @Desc
 */
@RestController
public class JobInfoController {

    @Autowired
    private JobInfoService jobInfoService;

    @GetMapping("/getJobInfo")
    public String getJobInfo() {
        jobInfoService.getJobInfo();
        return "success";
    }
}

Implement PageProcessor to define the page-parsing logic

package com.hg.spider.webmagic;

import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.util.StrUtil;
import com.hg.spider.entity.JobInfo;
import com.hg.spider.service.JobInfoService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

import java.util.List;

/**
 * @Author skh
 * @Date 2020/3/20 22:56
 * @Desc Parse the page
 */
@Component
@Slf4j
public class JobProcessor implements PageProcessor {

    @Autowired
    private JobInfoService jobInfoService;

    /**
     * Parse the page
     * @param page
     */
    @Override
    public void process(Page page) {

        // Parse the list page
        List<Selectable> nodes = page.getHtml().css("div#resultList div.el").nodes();

        if (CollUtil.isEmpty(nodes)) {
            // Empty means this is a job detail page: parse it and save the job details
            try {
                this.saveJobInfo(page);
            } catch (Exception e) {
                log.error("Parse exception, cause: {}", e.getMessage(), e);
            }
        } else {
            // Not empty means this is a list page: parse out the detail page URLs and queue them
            for (Selectable node : nodes) {
                // Get the detail page URL
                String jobInfoUrl = node.css("p.t1 span a").links().toString();
                if (StrUtil.isNotBlank(jobInfoUrl)) {
                    // Check whether the record already exists
                    List<JobInfo> jobInfoList = jobInfoService.selectJobInfoByUrl(jobInfoUrl);
                    if (CollUtil.isEmpty(jobInfoList)) {
                        // Put the URL in the task queue
                        page.addTargetRequest(jobInfoUrl);
                    } else {
                        log.info("Record exists, URL: {}", jobInfoUrl);
                    }
                }
            }
            // Get the next page URL
            List<String> all = page.getHtml().css("div.p_in li.bk a").links().all();
            String bkUrl = all.get(all.size() - 1);
            log.info("Next URL: {}", bkUrl);
            // A link to page 11 means 10 pages have been queued; stop there
            if (StrUtil.containsAny(bkUrl, "11.html")) {
                System.out.println("10 pages of data crawled, stopping here.");
                return;
            }
            page.addTargetRequest(bkUrl);
        }
    }

    /**
     * Parse the job detail page
     * @param page
     */
    private void saveJobInfo(Page page) {
        // Parse the page
        Html html = page.getHtml();
        String companyName = html.css("div.cn p.cname a", "text").get();
        List<String> text = html.css("div.bmsg.inbox p.fp", "text").all();
        String companyAddr = text.get(text.size() - 1);
        String jobName = html.css("div.cn h1", "text").get();
        String jobStr = html.css("p.msg.ltype", "text").get();
        // Split the summary line (location | experience | education | headcount | post date).
        // The delimiter was lost in the original; "|", the separator shown on the page, is assumed here.
        String[] s = StrUtil.split(jobStr, "|");
        String jobAddr = s[0];
        String time = "";
        for (String s1 : s) {
            // The page appends "发布" ("published") to the post date; strip it
            if (StrUtil.containsAny(s1, "发布")) {
                time = StrUtil.removeAll(s1, "发布");
                break;
            }
        }
        String jobDetail = html.css("div.bmsg.job_msg.inbox", "allText").get();
        String url = page.getUrl().get();
        String salary = html.css("div.in div.cn strong", "text").get();

        JobInfo jobInfo = new JobInfo();
        jobInfo.setJobName(jobName);
        jobInfo.setJobAddr(jobAddr);
        jobInfo.setJobDetail(jobDetail);
        jobInfo.setSalary(salary);
        jobInfo.setUrl(url);
        jobInfo.setTime(time);
        jobInfo.setCompanyName(companyName);
        jobInfo.setCompanyAddr(companyAddr);

        // Save the result to resultItems for persistence
        page.putField("jobInfo", jobInfo);

    }

    // Configure crawler information
    private Site site = Site.me()
            .setUserAgent("Mozilla / 5.0 (Windows NT 10.0; Win64; X64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36")
            .setCharset("gbk")
            .setTimeOut(10 * 1000)
            .setRetryTimes(3)
            .setRetrySleepTime(3000);

    @Override
    public Site getSite() {
        return site;
    }
}

Implement Pipeline to save the data to the database

package com.hg.spider.webmagic;

import com.hg.spider.entity.JobInfo;
import com.hg.spider.service.JobInfoService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

/**
 * @Author skh
 * @Date 2020/3/21 16:18
 * @Desc
 */
@Component
@Slf4j
public class MysqlPipeline implements Pipeline {
    @Autowired
    private JobInfoService jobInfoService;

    @Override
    public void process(ResultItems resultItems, Task task) {
        // Get the encapsulated data
        JobInfo jobInfo = resultItems.get("jobInfo");
        if (jobInfo != null) {
            jobInfoService.save(jobInfo);
        }
    }
}
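Note that save() is not defined anywhere in this project: it is MyBatis-Plus's IService.save(), inherited through ServiceImpl, and it performs a single-row insert via the underlying BaseMapper. WebMagic invokes process() once per successfully parsed page, so each job detail page ends up as one row in job_info.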

Test

Run the project, then visit:

http://localhost:8080/getJobInfo

The application will then start crawling the data.

Conclusion

The above is only a simple crawler example built with WebMagic, but it can serve as an introduction for learning the framework.