[toc]
Case description
Use Spring Boot, MyBatis-Plus, and WebMagic to crawl job postings from 51job and save them to a MySQL database.
Create a project
Introducing Maven dependencies
```xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.2.5.RELEASE</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>
    <groupId>com.hg</groupId>
    <artifactId>spider-demo</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>spider-demo</name>
    <description/>
    <properties>
        <java.version>1.8</java.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>cn.hutool</groupId>
            <artifactId>hutool-all</artifactId>
            <version>5.1.0</version>
        </dependency>
        <!-- Druid database connection pool -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>druid-spring-boot-starter</artifactId>
            <version>1.1.10</version>
        </dependency>
        <!-- MySQL connector -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <scope>runtime</scope>
        </dependency>
        <!-- MyBatis-Plus -->
        <dependency>
            <groupId>com.baomidou</groupId>
            <artifactId>mybatis-plus-boot-starter</artifactId>
            <version>3.0.5</version>
        </dependency>
        <!-- WebMagic -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
        </dependency>
        <!-- Guava (required by the Bloom filter) -->
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>16.0</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>
```
Create the table
Create a database named spider, then create the job_info table:
```sql
CREATE TABLE `job_info` (
  `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT 'primary key id',
  `company_name` varchar(100) DEFAULT NULL COMMENT 'Company Name',
  `company_addr` varchar(200) DEFAULT NULL COMMENT 'Company Contact information',
  `job_name` varchar(100) DEFAULT NULL COMMENT 'Job Title',
  `job_addr` varchar(50) DEFAULT NULL COMMENT 'Place of Work',
  `salary` varchar(50) DEFAULT NULL COMMENT 'Salary range',
  `url` varchar(150) DEFAULT NULL COMMENT 'Recruitment Details Page',
  `time` varchar(10) DEFAULT NULL COMMENT 'Last Post time',
  `job_detail` text COMMENT 'Job Details',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=2 DEFAULT CHARSET=utf8 COMMENT='Wanted Information';
```
Adding a Configuration File
Create application.yml:
```yaml
spring:
  application:
    name: spider-service
  jackson:
    time-zone: GMT+8
    date-format: yyyy-MM-dd HH:mm:ss
  datasource:
    driver-class-name: com.mysql.cj.jdbc.Driver
    url: jdbc:mysql://localhost:3306/spider?useUnicode=true&characterEncoding=utf8&autoReconnect=true&useSSL=false
    username: root
    password: root
    type: com.alibaba.druid.pool.DruidDataSource
    druid:
      initialSize: 10
      minIdle: 10
      maxActive: 50
      maxWait: 60000
      timeBetweenEvictionRunsMillis: 60000
      minEvictableIdleTimeMillis: 300000
      validationQuery: SELECT 1 FROM DUAL
      testWhileIdle: true
      testOnBorrow: false
      testOnReturn: false
      poolPreparedStatements: true
      maxPoolPreparedStatementPerConnectionSize: 20
      filters: stat,wall
      connectionProperties: druid.stat.mergeSql=true;druid.stat.slowSqlMillis=5000

#mybatis
mybatis-plus:
  mapper-locations: classpath:mapper/**/*.xml
  typeAliasesPackage: com.hg.*.entity
  global-config:
    db-config:
      id-type: auto
      field-strategy: not_empty
      table-underline: true
      db-type: mysql
    refresh: true
  configuration:
    map-underscore-to-camel-case: true
    cache-enabled: false

logging:
  level:
    org.springframework.web: info
    org.apache.http: info
    us.codecraft.webmagic: info
```
Write a POJO
```java
package com.hg.spider.entity;

import com.baomidou.mybatisplus.annotation.TableId;
import com.baomidou.mybatisplus.annotation.TableName;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;

@Data
@TableName("job_info")
@Slf4j
public class JobInfo {

    @TableId
    private Long id;
    /**
     * Company name
     */
    private String companyName;
    /**
     * Company address
     */
    private String companyAddr;
    /**
     * Job title
     */
    private String jobName;
    /**
     * Work address
     */
    private String jobAddr;
    /**
     * Job details
     */
    private String jobDetail;
    /**
     * Salary
     */
    private String salary;
    /**
     * Crawled URL
     */
    private String url;
    /**
     * Job publish time
     */
    private String time;
}
```
Write the Dao
```java
package com.hg.spider.dao;

import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import com.hg.spider.entity.JobInfo;

/**
 * @Author skh
 * @Date 2020/3/21 16:27
 * @Desc
 */
public interface JobInfoDao extends BaseMapper<JobInfo> {
}
```
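For `JobInfoDao` to be picked up as a MyBatis mapper, it must be scanned at startup. The application entry class is not shown in this post; a minimal sketch (class name assumed, package matching the ones above) could look like this:

```java
package com.hg.spider;

import org.mybatis.spring.annotation.MapperScan;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

@SpringBootApplication
@MapperScan("com.hg.spider.dao") // registers JobInfoDao as a MyBatis mapper
public class SpiderDemoApplication {

    public static void main(String[] args) {
        SpringApplication.run(SpiderDemoApplication.class, args);
    }
}
```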
Write the Service
```java
package com.hg.spider.service;

import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import com.hg.spider.dao.JobInfoDao;
import com.hg.spider.entity.JobInfo;
import com.hg.spider.webmagic.JobProcessor;
import com.hg.spider.webmagic.MysqlPipeline;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;

import java.util.List;

/**
 * @Author skh
 * @Date 2020/3/21 12:10
 * @Desc
 */
@Service
@Slf4j
public class JobInfoService extends ServiceImpl<JobInfoDao, JobInfo> {

    // The URL to start crawling from (a 51job search result page)
    String url = "https://search.51job.com/list/080200,000000,0000,26,9,99,%25e6%2588%25bf%25e4%25ba%25a7%25e7%25bb%258f%25e7%25ba%25aa%25e4%25ba%25ba,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=";

    @Autowired
    private MysqlPipeline mysqlPipeline;

    @Autowired
    private JobProcessor jobProcessor;

    public void getJobInfo() {
        log.info("Start crawling data");
        // Set up the crawler
        Spider.create(jobProcessor)
                .addUrl(url) // Set the initial URL to crawl
                // Use a Bloom filter to drop duplicate URLs; requires the guava dependency
                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(100000)))
                .thread(50) // Set the number of threads
                .addPipeline(mysqlPipeline) // Set the persistence pipeline
                .run();
    }

    public List<JobInfo> selectJobInfoByUrl(String url) {
        QueryWrapper<JobInfo> wrapper = new QueryWrapper<>();
        wrapper.eq("url", url);
        return this.baseMapper.selectList(wrapper);
    }
}
```
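Note that `run()` blocks the calling thread, so an HTTP request that triggers `getJobInfo()` will not return until the whole crawl has finished. If that is undesirable, WebMagic's `Spider` can also be started asynchronously; a sketch of the same method with only the final call changed:

```java
public void getJobInfo() {
    log.info("Start crawling data");
    // Same configuration as above, but start() (an asynchronous variant of run())
    // returns immediately while the crawl continues on background threads
    Spider.create(jobProcessor)
            .addUrl(url)
            .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(100000)))
            .thread(50)
            .addPipeline(mysqlPipeline)
            .start();
}
```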
Write the Controller
```java
package com.hg.spider.controller;

import com.hg.spider.service.JobInfoService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RestController;

/**
 * @Author skh
 * @Date 2020/3/21
 * @Desc
 */
@RestController
public class JobInfoController {

    @Autowired
    private JobInfoService jobInfoService;

    @GetMapping("/getJobInfo")
    public String getJobInfo() {
        jobInfoService.getJobInfo();
        return "success";
    }
}
```
Implement PageProcessor to define the page parsing logic
```java
package com.hg.spider.webmagic;

import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.util.StrUtil;
import com.hg.spider.entity.JobInfo;
import com.hg.spider.service.JobInfoService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

import java.util.List;

/**
 * @Author skh
 * @Date 2020/3/20 22:56
 * @Desc Parse the page
 */
@Component
@Slf4j
public class JobProcessor implements PageProcessor {

    @Autowired
    private JobInfoService jobInfoService;

    /**
     * Parse the page
     *
     * @param page
     */
    @Override
    public void process(Page page) {
        // Try to parse the page as a result list page
        List<Selectable> nodes = page.getHtml().css("div#resultList div.el").nodes();
        if (CollUtil.isEmpty(nodes)) {
            // Empty means this is a job detail page: parse it and save the data
            try {
                this.saveJobInfo(page);
            } catch (Exception e) {
                log.error("Parse exception, cause: {}", e.getMessage(), e);
            }
        } else {
            // Not empty means this is a list page: extract the detail page URLs and queue them
            for (Selectable node : nodes) {
                // Get the detail page URL
                String jobInfoUrl = node.css("p.t1 span a").links().toString();
                if (StrUtil.isNotBlank(jobInfoUrl)) {
                    // Check whether the record already exists
                    List<JobInfo> jobInfoList = jobInfoService.selectJobInfoByUrl(jobInfoUrl);
                    if (CollUtil.isEmpty(jobInfoList)) {
                        // Put the URL into the task queue
                        page.addTargetRequest(jobInfoUrl);
                    } else {
                        log.info("Record already exists, URL: {}", jobInfoUrl);
                    }
                }
            }
            // Get the next page URL
            List<String> all = page.getHtml().css("div.p_in li.bk a").links().all();
            String bkUrl = all.get(all.size() - 1);
            log.info("Next URL: {}", bkUrl);
            if (StrUtil.containsAny(bkUrl, "11.html")) {
                System.out.println("10 pages of data crawled, no need to continue indefinitely");
                return;
            }
            page.addTargetRequest(bkUrl);
        }
    }

    /**
     * Parse the job detail page
     *
     * @param page
     */
    private void saveJobInfo(Page page) {
        // Parse the page
        Html html = page.getHtml();
        String companyName = html.css("div.cn p.cname a", "text").get();
        List<String> text = html.css("div.bmsg.inbox p.fp", "text").all();
        String companyAddr = text.get(text.size() - 1);
        String jobName = html.css("div.cn h1", "text").get();
        String jobStr = html.css("p.msg.ltype", "text").get();
        // The info line looks like "上海-浦东新区 | 5-7年经验 | 本科 | 招5人 | 03-21发布";
        // the separator is assumed to be the page's non-breaking space (\u00a0)
        String[] s = StrUtil.split(jobStr, "\u00a0");
        String jobAddr = s[0];
        String time = "";
        for (String s1 : s) {
            // "发布" means "published"; the segment "03-21发布" carries the publish date
            if (StrUtil.containsAny(s1, "发布")) {
                time = StrUtil.removeAll(s1, "发布");
                break;
            }
        }
        String jobDetail = html.css("div.bmsg.job_msg.inbox", "allText").get();
        String url = page.getUrl().get();
        String salary = html.css("div.in div.cn strong", "text").get();
        JobInfo jobInfo = new JobInfo();
        jobInfo.setJobName(jobName);
        jobInfo.setJobAddr(jobAddr);
        jobInfo.setJobDetail(jobDetail);
        jobInfo.setSalary(salary);
        jobInfo.setUrl(url);
        jobInfo.setTime(time);
        jobInfo.setCompanyName(companyName);
        jobInfo.setCompanyAddr(companyAddr);
        // Put the result into resultItems so the pipeline can persist it
        page.putField("jobInfo", jobInfo);
    }

    // Crawler configuration
    private Site site = Site.me()
            .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36")
            .setCharset("gbk")
            .setTimeOut(10 * 1000)
            .setRetryTimes(3)
            .setRetrySleepTime(3000);

    @Override
    public Site getSite() {
        return site;
    }
}
```
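To make the selector chains above easier to follow, here is a standalone sketch (with made-up HTML, not taken from 51job) showing how `css(...)` and `links()` behave:

```java
import us.codecraft.webmagic.selector.Html;

public class SelectorDemo {

    public static void main(String[] args) {
        // A tiny fragment mimicking one row of the 51job result list
        Html html = new Html("<div id=\"resultList\">"
                + "<div class=\"el\"><p class=\"t1\"><span>"
                + "<a href=\"https://jobs.51job.com/shanghai/123.html\">Agent</a>"
                + "</span></p></div></div>");
        // Same chain as in JobProcessor: select the anchor, then extract its href
        String link = html.css("p.t1 span a").links().toString();
        System.out.println(link); // https://jobs.51job.com/shanghai/123.html
    }
}
```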
Implement Pipeline to save the data to the database
```java
package com.hg.spider.webmagic;

import com.hg.spider.entity.JobInfo;
import com.hg.spider.service.JobInfoService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

/**
 * @Author skh
 * @Date 2020/3/21 16:18
 * @Desc
 */
@Component
@Slf4j
public class MysqlPipeline implements Pipeline {

    @Autowired
    private JobInfoService jobInfoService;

    @Override
    public void process(ResultItems resultItems, Task task) {
        // Get the data put into resultItems by the processor
        JobInfo jobInfo = resultItems.get("jobInfo");
        if (jobInfo != null) {
            jobInfoService.save(jobInfo);
        }
    }
}
```
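List pages reach this pipeline too, just with an empty `ResultItems` (nothing was put via `putField`), which is why the null check is needed. If you would rather stop them earlier, WebMagic also offers `setSkip`; a one-line sketch for the list-page branch of `JobProcessor.process(...)`:

```java
// Tell WebMagic not to hand this page's (empty) ResultItems to any pipeline
page.setSkip(true);
```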
Test
Run the project, then visit:

http://localhost:8080/getJobInfo

The crawler will start fetching data in the background.
Conclusion
The above is just a simple crawler example built with WebMagic; it can serve as a starting point for learning the framework.