Environment set up
Create a SpringBoot project.
configuration
<properties>
<java.version>1.8</java.version>
<!-- Override the Elasticsearch version so that it matches the locally installed one -->
<elasticsearch.version>7.6.2</elasticsearch.version>
</properties>
<dependencies>
<!-- Parse web pages (jsoup can only parse web pages) -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
<!-- fastjson -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.62</version>
</dependency>
<!-- Elasticsearch -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-elasticsearch</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-thymeleaf</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-devtools</artifactId>
<scope>runtime</scope>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-configuration-processor</artifactId>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
<exclusions>
<exclusion>
<groupId>org.junit.vintage</groupId>
<artifactId>junit-vintage-engine</artifactId>
</exclusion>
</exclusions>
</dependency>
</dependencies>
Copy the code
## Disable the Thymeleaf cache
spring.thymeleaf.cache=false
Copy the code
Import the front-end page resources. A Baidu Cloud link to them is provided in the Elasticsearch overview article.
controller
@Controller
public class IndexController {
@RequestMapping({"/","/index"})
public String index(a){
return "index"; }}Copy the code
Start the project and visit http://localhost:8080/
Jsoup parsing
<!-- Parse web pages (jsoup can only parse web pages) -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
Copy the code
Create the utils package and create the HtmlParseUtil class
By analyzing its website, you should be able to understand the following code
@Component// Use @autoWired injection. Use new it instead of handing it over to Spring
public class HtmlParseUtil {
public static void main(String[] args) throws IOException {
// Get the request. Need to connected to the Internet
String url = "https://search.jd.com/Search?keyword=java";
// Parse the page. The Document object returned by Jsoup is the browser's Document object
Document document = Jsoup.parse(new URL(url), 30000);
// All the operations that Document can do in js can be performed in the next operation
Element element = document.getElementById("J_goodsList");
System.out.println(element.html());// Print the HTML source code under the J_goodList tag
// Get all the li elements
Elements elements = element.getElementsByTag("li");
// Get all the contents of the element
for (Element e1 : elements) {
// Image lazy loading
String img = e1.getElementsByTag("img").eq(0).attr("data-lazy-img");
String price = e1.getElementsByClass("p-price").eq(0).text();
String title = e1.getElementsByClass("p-name").eq(0).text();
System.out.println("= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = ="); System.out.println(img); System.out.println(price); System.out.println(title); }}}Copy the code
The corresponding information is obtained successfully. Next, encapsulate it: start by creating a POJO class named Content.
Then encapsulate it into a parseJD method.
@Component// Use @autoWired injection. Use new it instead of handing it over to Spring
public class HtmlParseUtil {
public static void main(String[] args) throws IOException {
new HtmlParseUtil().parseJD("Vue").forEach(System.out::println);
}
public ArrayList<Content> parseJD(String keywords) throws IOException {
// Get the request. Need to connected to the Internet
String url = "https://search.jd.com/Search?keyword=" + keywords;
System.out.println(url);
// Parse the page. The Document object returned by Jsoup is the browser's Document object
Document document = Jsoup.parse(new URL(url), 30000);
// All the operations that Document can do in js can be performed in the next operation
Element element = document.getElementById("J_goodsList");
//System.out.println(element.html());
// Get all the li elements
Elements elements = element.getElementsByTag("li");
ArrayList<Content> goodsList = new ArrayList<>();
// Get all the contents of the element
for (Element e1 : elements) {
String img = e1.getElementsByTag("img").eq(0).attr("data-lazy-img");
String price = e1.getElementsByClass("p-price").eq(0).text();
String title = e1.getElementsByClass("p-name").eq(0).text();
Content content = new Content();
content.setImg(img);
content.setTitle(title);
content.setPrice(price);
goodsList.add(content);
}
returngoodsList; }}Copy the code
Business writing
The Elasticsearch client configuration class is still required.
@Configuration
public class ElasticSearchClientConfig {
@Bean
public RestHighLevelClient restHighLevelClient(a) {
RestHighLevelClient client = new RestHighLevelClient(
RestClient.builder(
new HttpHost("localhost".9200."http")));
returnclient; }}Copy the code
Write the Service business class
@Service
public class ContentService {
@Autowired
RestHighLevelClient restHighLevelClient;
//1. Place parsed data into es index
public Boolean parseContent(String keywords) throws IOException {
ArrayList<Content> contents = new HtmlParseUtil().parseJD(keywords);
// Put the query data into es
BulkRequest bulkRequest = new BulkRequest();
bulkRequest.timeout("2m");// The expiration time is two minutes
for (int i = 0; i < contents.size(); i++){ System.out.println(JSON.toJSONString(contents.get(i))); bulkRequest.add(new IndexRequest("jd_goods").source(JSON.toJSONString(contents.get(i)), XContentType.JSON));
}
BulkResponse bulk = restHighLevelClient.bulk(bulkRequest, RequestOptions.DEFAULT);
return! bulk.hasFailures();// Returns whether the insertion was successful}}Copy the code
Write the controller
@RestController
public class ContentController {
@Autowired
ContentService contentService;
@GetMapping("/parse/{keyword}")
public Boolean parse(@PathVariable("keyword") String keyword) throws IOException {
Boolean result = contentService.parseContent(keyword);
returnresult; }}Copy the code
Start the project and test http://localhost:8080/parse/java
The Java-related product information was added successfully.
Then we continued to write the Service, adding a paging search for data in ES.
@Service
public class ContentService {
@Autowired
RestHighLevelClient restHighLevelClient;
//1. Place parsed data into es index
public Boolean parseContent(String keywords) throws IOException {
ArrayList<Content> contents = new HtmlParseUtil().parseJD(keywords);
// Put the query data into es
BulkRequest bulkRequest = new BulkRequest();
bulkRequest.timeout("2m");// The expiration time is two minutes
for (int i = 0; i < contents.size(); i++){ System.out.println(JSON.toJSONString(contents.get(i))); bulkRequest.add(new IndexRequest("jd_goods").source(JSON.toJSONString(contents.get(i)), XContentType.JSON));
}
BulkResponse bulk = restHighLevelClient.bulk(bulkRequest, RequestOptions.DEFAULT);
return! bulk.hasFailures();// Returns whether the insertion was successful
}
//2. Obtain these data for search function
public List<Map<String,Object>> searchPage(String keyword,int pageNo,int pageSize) throws IOException {
if (pageNo<=1){
pageNo = 1;
}
// Conditional search
SearchRequest searchRequest = new SearchRequest("jd_goods");
SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
/ / paging
sourceBuilder.from(pageNo);// Start data
sourceBuilder.size(pageSize);// Page size
// Precisely match the keyword
TermQueryBuilder termQueryBuilder = QueryBuilders.termQuery("title", keyword);
sourceBuilder.query(termQueryBuilder);
sourceBuilder.timeout(new TimeValue(60, TimeUnit.SECONDS));// Timeout control
// Perform a search
searchRequest.source(sourceBuilder);
SearchResponse searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);
ArrayList<Map<String,Object>> list = new ArrayList<>();
SearchHit[] hits = searchResponse.getHits().getHits();// Get the hits array object
for (SearchHit documentFields : hits){
list.add(documentFields.getSourceAsMap());// Add to list
}
returnlist; }}Copy the code
Then add a request to the Controller
@RestController
public class ContentController {
@Autowired
ContentService contentService;
@GetMapping("/parse/{keyword}")
public Boolean parse(@PathVariable("keyword") String keyword) throws IOException {
Boolean result = contentService.parseContent(keyword);
return result;
}
@GetMapping("/search/{keyword}/{pageNo}/{pageSize}")
public List<Map<String,Object>> search(@PathVariable("keyword") String keyword,
@PathVariable("pageNo") int pageNo,
@PathVariable("pageSize") int pageSize) throws IOException {
List<Map<String, Object>> list = contentService.searchPage(keyword, pageNo, pageSize);
returnlist; }}Copy the code
Start the project and test http://localhost:8080/search/java/1/20
The front page
Import Vue and Axios, I’m using the online version here
<script src="https://cdn.staticfile.org/vue/2.6.2/vue.min.js"></script>
<script src="https://unpkg.com/axios/dist/axios.min.js"></script>
Modify our Index page.
Start the project to see the effect. (I have parsed the VUE data and added it to ES)
Highlighting function
Let’s modify the code in the business class Service.
Replace the title in _source with the highlighted version of the field.
@Service
public class ContentService {
@Autowired
RestHighLevelClient restHighLevelClient;
//1. Place parsed data into es index
public Boolean parseContent(String keywords) throws IOException {
ArrayList<Content> contents = new HtmlParseUtil().parseJD(keywords);
// Put the query data into es
BulkRequest bulkRequest = new BulkRequest();
bulkRequest.timeout("2m");// The expiration time is two minutes
for (int i = 0; i < contents.size(); i++){ System.out.println(JSON.toJSONString(contents.get(i))); bulkRequest.add(new IndexRequest("jd_goods").source(JSON.toJSONString(contents.get(i)), XContentType.JSON));
}
BulkResponse bulk = restHighLevelClient.bulk(bulkRequest, RequestOptions.DEFAULT);
return! bulk.hasFailures();// Returns whether the insertion was successful
}
//2. Obtain these data for search function
public List<Map<String,Object>> searchPage(String keyword,int pageNo,int pageSize) throws IOException {
if (pageNo<=1){
pageNo = 1;
}
// Conditional search
SearchRequest searchRequest = new SearchRequest("jd_goods");
SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
/ / paging
sourceBuilder.from(pageNo);// Start data
sourceBuilder.size(pageSize);// Page size
// Precisely match the keyword
TermQueryBuilder termQueryBuilder = QueryBuilders.termQuery("title", keyword);
sourceBuilder.query(termQueryBuilder);
sourceBuilder.timeout(new TimeValue(60, TimeUnit.SECONDS));// Timeout control
/ / highlight
HighlightBuilder highlightBuilder = new HighlightBuilder();
highlightBuilder.field("title");// Highlight the field
highlightBuilder.requireFieldMatch(false);// Turn off multiple highlights. For example, if there are multiple Vues in the title, highlight only one
highlightBuilder.preTags("<span style='color:red'>");// Pre-label
highlightBuilder.postTags("</span>");// Back label
sourceBuilder.highlighter(highlightBuilder);// Add highlight
// Perform a search
searchRequest.source(sourceBuilder);
SearchResponse searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);
ArrayList<Map<String,Object>> list = new ArrayList<>();
SearchHit[] hits = searchResponse.getHits().getHits();// Get the hits array object
for (SearchHit hit : hits){
Map<String, HighlightField> highlightFields = hit.getHighlightFields();
Map<String, Object> sourceAsMap = hit.getSourceAsMap();// The original result
HighlightField title = highlightFields.get("title");
// Parse the highlighted field, replacing the original field with the highlighted field
if(title! =null){
Text[] fragments = title.fragments();
String hTitle = "";
for (Text text : fragments) {
hTitle += text;
}
sourceAsMap.put("title",hTitle);// Replace the highlighted field with the original content
}
list.add(sourceAsMap);
}
returnlist; }}Copy the code
<p class="productTitle">
<a v-html="result.title"> </a>
</p>
Copy the code
Restart the service and access the test. http://localhost:8080/
Done!