Result preview
Contents
Result preview
Introduction
Code
Dependencies
Fetching the page data
Parsing the page
Parsing notes
The complete code
Introduction
1. The crawl is implemented with org.jsoup and Apache HttpClients.
2. Multiple pages are crawled in a loop.
3. The fetched pages are parsed with Jsoup.
4. The extracted data is saved to a file on the local PC.
5. Turning that file into an Excel workbook is covered in my other article: yushen.blog.csdn.net/article/det…
6. Finally, just grab the complete code at the end; a compact sketch of the overall flow follows this list.
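Before going through the individual pieces, here is a minimal sketch of how they fit together. It is my own outline, not part of the original program: it reuses the method names defined in the code below (crawlingData, spider), and the URL and page count are placeholders rather than the real target site.

// Flow sketch only: method names follow the article's code, URL and page count are placeholders
public static void main(String[] args) throws Exception {
    List<List<String>> pages = new ArrayList<>();
    for (int i = 1; i <= 3; i++) {                                  // 2. loop over pages
        String url = "https://www.example.com/search?pn=" + i;      // placeholder URL
        pages.add(crawlingData(url));                               // 1 + 3. fetch with HttpClient, parse with Jsoup
    }
    JSONArray ja = new JSONArray();
    for (List<String> page : pages) {
        ja.addAll(page);                                            // flatten all records into one array
    }
    spider(ja);                                                     // 4. persist to a local file / Excel
}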
Code
Dependencies
The imports below come from Jsoup, Apache HttpClient, Apache POI (for the Excel export), and fastjson.
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import com.alibaba.fastjson.JSONArray;
The crawl loop
/**
 * Crawl task 001: crawl xx data, up to 100 pages
 *
 * @throws InterruptedException
 */
public static void crawlingTask001() throws InterruptedException {
    // ---------------------------------- task start ----------------------------------
    List<List<String>> list0 = new ArrayList<List<String>>();

    // ---------------------------------- crawl task ----------------------------------
    System.out.println(new Date() + " crawl task started!");
    for (int i = 1; i < 2; i++) {
        Thread.sleep(500);
        String url = "https://www.xx.com/sf/vsearch?pd=igI&rsv_bp=1&f=8&async=1&pn=" + i;
        List<String> list = crawlingData(url);
        list0.add(list);
    }
    System.out.println(new Date() + " crawl task finished!");

    // ---------------------------------- read crawled data ----------------------------------
    System.out.println(new Date() + " read crawled data, task started!");
    JSONArray ja = new JSONArray();
    for (int i = 0; i < list0.size(); i++) {
        List<String> list01 = list0.get(i);
        System.out.println("page " + (i + 1) + ": " + list01.size() + " records");
        for (int j = 0; j < list01.size(); j++) {
            System.out.println(j + " " + list01.get(j));
            ja.add(list01.get(j)); // TODO 1. Read each page 2. ...
        }
    }
    System.out.println(new Date() + " read crawled data, task finished!");

    // ---------------------------------- persist data to disk ----------------------------------
    System.out.println(new Date() + " persist data to disk, task started!");
    spider(ja); // persistence method, see the Excel article linked in the introduction
    System.out.println(new Date() + " persist data to disk, task finished!");

    // ---------------------------------- task end ----------------------------------
}
Fetching the page data
// Get the HTML content of the page at the given URL.
// The fetch logic from the previous section is wrapped into a method for reuse.
public static String getHTMLContent(String url) throws IOException {
    // Create a new HTTP client
    CloseableHttpClient httpClient = HttpClients.createDefault();
    // Build the GET request
    HttpGet httpGet = new HttpGet(url);
    httpGet.setHeader("User-Agent",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0");
    // Execute the request
    CloseableHttpResponse response = httpClient.execute(httpGet);
    // Get the returned entity and read it as a String
    HttpEntity entity = response.getEntity();
    String content = EntityUtils.toString(entity);
    // Consume (close) the HttpEntity stream
    EntityUtils.consume(entity);
    return content;
}
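A note on resource handling: the version above never closes the CloseableHttpClient or the response object. A variant (my own sketch, not part of the original article) that uses try-with-resources so both are closed even when an exception is thrown:

// Sketch only: same fetch, but client and response are auto-closed
public static String getHTMLContentSafely(String url) throws IOException {
    try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0");
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // EntityUtils.toString fully consumes the entity, so no separate consume() call is needed
            return EntityUtils.toString(response.getEntity());
        }
    }
}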
Parsing the page
/**
 * Crawl and parse the data at the given url
 *
 * @param url
 */
public static List<String> crawlingData(String url) {
    String rawHTML = null;
    try {
        rawHTML = getHTMLContent(url);
    } catch (IOException e) {
        e.printStackTrace();
        System.out.println(e.toString());
    }
    // System.out.println("fetched page: " + rawHTML);

    // Convert the current page into a Jsoup Document object
    Document doc = Jsoup.parse(rawHTML);
    // Elements blogList = doc.select("p[class=content]");
    Elements blogList = doc.select("div[class=video_list video_short]");
    // System.out.println("parsed list: " + rawHTML);

    List<String> list = new ArrayList<String>();
    // Parse each entry and print it
    for (Element element : blogList) {
        String href = element.select("a[class=small_img_con border-radius]").attr("href");
        String vidoeTime = element.select("div[class=time_con]").select("span[class=video_play_timer]").text();
        String name = element.select("div[class=video_small_intro]").select("a[class=video-title c-link]").text();
        String ly = element.select("div[class=c-color-gray2 video-no-font]").select("span[class=wetSource c-font-normal]").text();
        String uploadtime = element.select("div[class=c-color-gray2 video-no-font]").select("span[class=c-font-normal]").text();
        list.add(href + ",__," + name + ",__," + uploadtime + ",__," + vidoeTime + ",__," + href + ",__," + href + ",__," + ly);
        // System.out.println(href + ",__," + name + ",__," + datetime + ",__," + uploadname + ",__," + uploadnamemasterurl);
    }
    return list;
}
Parsing notes
1. element.select("div[class=time_con]") works much like a jQuery selector in JavaScript.
2. Parent and child elements can be reached by chaining select() calls.
3. Use attr("xx") to read an attribute value and text() to read the element's text content (see the small example after this list).
4. The Maven dependencies for Jsoup and POI are not tied to a particular release; any recent version works.
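To see points 1–3 in isolation, here is a small self-contained example. It is my own sketch: the HTML snippet is made up to mirror the class names the article's parsing code selects against.

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class SelectorDemo {
    public static void main(String[] args) {
        // Made-up HTML mirroring the structure used in the parsing code above
        String html = "<div class=\"video_list video_short\">"
                + "<a class=\"small_img_con border-radius\" href=\"/v/123\">link</a>"
                + "<div class=\"time_con\"><span class=\"video_play_timer\">03:15</span></div>"
                + "</div>";
        Document doc = Jsoup.parse(html);
        for (Element element : doc.select("div[class=video_list video_short]")) {
            // attr("href") reads an attribute, text() reads the element's text content
            String href = element.select("a[class=small_img_con border-radius]").attr("href");
            String time = element.select("div[class=time_con]").select("span[class=video_play_timer]").text();
            System.out.println(href + " -> " + time);   // prints: /v/123 -> 03:15
        }
    }
}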
The complete code
package com.superman.test;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import com.alibaba.fastjson.JSONArray;

/**
 * Crawler for xx video search results (the site name is masked as xx in this article)
 */
public class DataLearnerCrawlerSPWZv2 {

    public static void main(String[] args) throws InterruptedException {
        crawlingTask001();
    }

    /**
     * Crawl task 001: crawl xx data, up to 100 pages
     *
     * @throws InterruptedException
     */
    public static void crawlingTask001() throws InterruptedException {
        // ---------------------------------- task start ----------------------------------
        List<List<String>> list0 = new ArrayList<List<String>>();

        // ---------------------------------- crawl task ----------------------------------
        System.out.println(new Date() + " crawl task started!");
        for (int i = 1; i < 2; i++) {
            Thread.sleep(500);
            String url = "https://www.xx.com/sf/vsearch?pd=video&tn=vsearch&lid=b3581e7000011f50&ie=utf-8&rsv_pq=b3581e7000011f50&wdxhJiHsELdcqFigI&rsv_bp=1&f=8&async=1&pn=" + i;
            List<String> list = crawlingData(url);
            list0.add(list);
        }
        System.out.println(new Date() + " crawl task finished!");

        // ---------------------------------- read crawled data ----------------------------------
        System.out.println(new Date() + " read crawled data, task started!");
        JSONArray ja = new JSONArray();
        for (int i = 0; i < list0.size(); i++) {
            List<String> list01 = list0.get(i);
            System.out.println("page " + (i + 1) + ": " + list01.size() + " records");
            for (int j = 0; j < list01.size(); j++) {
                System.out.println(j + " " + list01.get(j));
                ja.add(list01.get(j)); // TODO 1. Read each page 2. ...
            }
        }
        System.out.println(new Date() + " read crawled data, task finished!");

        // ---------------------------------- persist data to disk ----------------------------------
        System.out.println(new Date() + " persist data to disk, task started!");
        spider(ja); // persistence method, covered in the Excel article (a sketch also follows below)
        System.out.println(new Date() + " persist data to disk, task finished!");

        // ---------------------------------- task end ----------------------------------
    }

    /**
     * Crawl and parse the data at the given url
     *
     * @param url
     */
    public static List<String> crawlingData(String url) {
        String rawHTML = null;
        try {
            rawHTML = getHTMLContent(url);
        } catch (IOException e) {
            e.printStackTrace();
            System.out.println(e.toString());
        }
        // System.out.println("fetched page: " + rawHTML);

        // Convert the current page into a Jsoup Document object
        Document doc = Jsoup.parse(rawHTML);
        // Elements blogList = doc.select("p[class=content]");
        Elements blogList = doc.select("div[class=video_list video_short]");
        // System.out.println("parsed list: " + rawHTML);

        List<String> list = new ArrayList<String>();
        // Parse each entry and print it
        for (Element element : blogList) {
            String href = element.select("a[class=small_img_con border-radius]").attr("href");
            String vidoeTime = element.select("div[class=time_con]").select("span[class=video_play_timer]").text();
            String name = element.select("div[class=video_small_intro]").select("a[class=video-title c-link]").text();
            String ly = element.select("div[class=c-color-gray2 video-no-font]").select("span[class=wetSource c-font-normal]").text();
            String uploadtime = element.select("div[class=c-color-gray2 video-no-font]").select("span[class=c-font-normal]").text();
            list.add(href + ",__," + name + ",__," + uploadtime + ",__," + vidoeTime + ",__," + href + ",__," + href + ",__," + ly);
            // System.out.println(href + ",__," + name + ",__," + datetime + ",__," + uploadname + ",__," + uploadnamemasterurl);
        }
        return list;
    }

    // Get the HTML content of the page at the given URL.
    // The fetch logic from the previous section is wrapped into a method for reuse.
    public static String getHTMLContent(String url) throws IOException {
        // Create a new HTTP client
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // Build the GET request
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0");
        // Execute the request
        CloseableHttpResponse response = httpClient.execute(httpGet);
        // Get the returned entity and read it as a String
        HttpEntity entity = response.getEntity();
        String content = EntityUtils.toString(entity);
        // Consume (close) the HttpEntity stream
        EntityUtils.consume(entity);
        return content;
    }
}
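One gap to be aware of: the complete code calls spider(ja), but that method lives in the Excel article linked in the introduction and is not reproduced here. A minimal sketch of what it could look like, using the POI classes already imported above; the column layout and the output file name are my own assumptions, not the article's.

// Sketch only: write each ",__,"-delimited record from the JSONArray into one row of an .xlsx file
public static void spider(JSONArray ja) {
    XSSFWorkbook workbook = new XSSFWorkbook();
    XSSFSheet sheet = workbook.createSheet("data");
    for (int i = 0; i < ja.size(); i++) {
        // Each record was built as field1,__,field2,__,... in crawlingData
        String[] fields = ja.getString(i).split(",__,");
        XSSFRow row = sheet.createRow(i);
        for (int j = 0; j < fields.length; j++) {
            XSSFCell cell = row.createCell(j);
            cell.setCellValue(fields[j]);
        }
    }
    // Assumed output path; change as needed
    File outFile = new File("crawl-result.xlsx");
    try (OutputStream os = new FileOutputStream(outFile)) {
        workbook.write(os);
        workbook.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
}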
Many places in the article are masked as "xx"; swap in the site you actually want to crawl and everything works as-is.
If you need anything, feel free to leave a comment any time ^_^
ok
Continuously updated