Initialize the project
Fetch the page returned by the request and extract the data we want
Initialize the project and import the dependencies
The code
Write an entity class that encapsulates an object
package com.shoukailiang;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Entity holding the data scraped for one product entry: image, title and price.
 *
 * <p>The Lombok annotations generate the boilerplate: {@code @Data} supplies the
 * getters/setters, {@code equals}/{@code hashCode} and {@code toString};
 * {@code @AllArgsConstructor} and {@code @NoArgsConstructor} supply the constructors.
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public class Content {
/** Image address taken from the first {@code img} tag of the product entry. */
private String img;
/** Product title: the text of the entry's {@code p-name} element. */
private String title;
/** Product price, kept as the raw text of the entry's {@code p-price} element. */
private String price;
}
Copy the code
Crawl data
If you crawl a JD (Jingdong) search page, you can see that searching by keyword returns the product data. Find the div with id J_goodsList and iterate over each li inside it.
// Look up the product-list container by its id.
// NOTE(review): getElementById presumably returns null when the page has no such id —
// confirm against the Jsoup docs and guard before dereferencing.
Element element = document.getElementById("J_goodsList");
// Print the container so its markup can be inspected.
System.out.println(element);
// Collect every li element found beneath the container.
Elements elements = element.getElementsByTag("li");
Copy the code
ArrayList<Content> goodList = new ArrayList<>();
// Build one Content per li element: image address, price text and title text.
for (Element el : elements) {
    Elements firstImg = el.getElementsByTag("img").eq(0);
    // Images are lazy-loaded, so "src" can be empty in the fetched markup;
    // in that case fall back to the "data-lazy-img" attribute.
    String img = firstImg.attr("src");
    if (img.isEmpty()) {
        img = firstImg.attr("data-lazy-img");
    }
    String price = el.getElementsByClass("p-price").eq(0).text();
    String title = el.getElementsByClass("p-name").eq(0).text();
    Content content = new Content();
    content.setImg(img);
    content.setTitle(title);
    content.setPrice(price);
    goodList.add(content);
}
Copy the code
The complete code
package com.shoukailiang;
import java.io.IOException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class HtmlParseUtil {
public List<Content> parseJD(String keywords) throws IOException {
/ / get the request at https://search.jd.com/Search?keyword=java
String url = "https://search.jd.com/Search?keyword=" + keywords;
// Parse the web page Jsoup returns Document as the browser's Document object
Document document = Jsoup.parse(new URL(url), 30000);
// All the methods that can be used in js are available here
Element element = document.getElementById("J_goodsList");
System.out.println(element);
// Get all the li elements
Elements elements = element.getElementsByTag("li");
ArrayList<Content> goodList = new ArrayList<>();
// Get the contents of the element
for (Element el : elements) {
// All images are lazy-loaded if not available via SRC.
// The attributes of our attr may need to be retrieved via data-lazy-img
String img = el.getElementsByTag("img").eq(0).attr("src");
String price = el.getElementsByClass("p-price").eq(0).text();
String title = el.getElementsByClass("p-name").eq(0).text();
Content content = new Content();
content.setImg(img);
content.setTitle(title);
content.setPrice(price);
goodList.add(content);
}
return goodList;
}
public static void main(String[] args) throws IOException {
new HtmlParseUtil().parseJD("Monitor").forEach(System.out::println); }}Copy the code