Jianshu no longer provides the ability to search for articles by title.

Although Jianshu provides a batch-download feature, the downloaded articles are all in Markdown format and do not contain the articles' links, which does not meet my needs.

Since I'm a programmer, if the feature isn't there, I'll implement it myself.

Opening my Jianshu home page, I noticed that only 8 articles are displayed by default. Scrolling to the bottom of the page triggers a lazy-loading event that fetches more entries from the backend, so the article list is paginated on the server side.

Open Chrome Developer Tools and observe the network requests. In the request URL, 99b8712e8850 is my Jianshu user ID, and page=2, 3, 4 are the page numbers.
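For example, the second page of the article list is requested with a URL of this form (the same pattern used for the PAGE constant in the code below):

https://www.jianshu.com/u/99b8712e8850?order_by=shared_at&page=2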

The content of each page is returned in the response as an HTML fragment.

All I care about is each article's title and its link.
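Judging from the parsing code below, each article entry in the fragment is an li element whose div child contains an a element holding the title text and a relative link of the form /p/xxx. Roughly sketched (simplified; attributes other than href are omitted and not taken from the real page):

<li>
  <div>
    <a href="/p/5c1d0319dc42">article title</a>
    ...
  </div>
</li>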

I started by writing a NodeJS application with the following code:

var request = require('request');
var jsdom = require("jsdom");
var JSDOM = jsdom.JSDOM;
const PREFIX = "https://www.jianshu.com";
const PAGE = "https://www.jianshu.com/u/99b8712e8850?order_by=shared_at&page=";
const MAX = 2;

var mArticleResult = new Map();
var pageNumber;
/* a given article: https://www.jianshu.com/p/963cd23fb092 value got from API: /p/5c1d0319dc42 */
var lastPageReached = false;
var url = "";

var aHandlers = [];

// use limited for loop to ease testing
for(var i = 0; i < MAX; i++){
  pageNumber = i + 1;
  var url = PAGE + pageNumber;
  // console.log("current page: " + url);
  var pageOptions = {
        url: url,
        method: "GET",
        headers: {
            "Accept": "text/html"
        }
  };
  aHandlers.push(getArticles(pageOptions, pageNumber));
  if (lastPageReached)
    break;
}

console.log("promise handler size: " + aHandlers.length);

Promise.all(aHandlers).then(function(){
  var articleIndex = 0;
  for (var [key, value] of mArticleResult) {
    console.log("Article[" + articleIndex++ + "]." + key + "=" + value);
  }
  console.log("done"); });function getArticles(pageOptions, pageNumber) {
  return new Promise(function(resolve,reject){
      var requestC = request.defaults({jar: true});

      requestC(pageOptions,function(error,response,body){
        if (error) {
          console.log("error: " + error);
          resolve(error);
          return;
        }
        var document = new JSDOM(body).window.document;
        var content = document.getElementsByTagName("li");

        for( var i =0; i < content.length; i++){
          var li = content[i];
          var children = li.childNodes;
          for( var j = 0; j < children.length; j++){
              var eachChild = children[j];
              if (eachChild.nodeName == "DIV") {
                var grandChild = eachChild.childNodes;
                for( var k = 0; k < grandChild.length; k++){
                  var grand = grandChild[k];
                  if (grand.nodeName == "A") {
                    var fragment = grand.getAttribute("href");
                    if( fragment.indexOf("/p") < 0)
                      continue;
                    console.log("title: " + grand.text);
                    var wholeURL = PREFIX + fragment;
                    console.log("url: " + wholeURL);
                    if( mArticleResult.has(grand.text)){
                      lastPageReached = true;
                      console.log("article size: " + mArticleResult.size);
                      resolve(pageNumber);
                    }
                    mArticleResult.set(grand.text, wholeURL);
                  }
                }
              }
          }
        }// end of outer loop
        resolve(pageNumber);
      }); 
     });
}
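To try it out, save the script under any name (say fetchArticles.js, a made-up filename) and run node fetchArticles.js; with MAX set to 2 it only walks the first two pages, which keeps testing quick, and it prints each collected title and URL before logging "done".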

The idea is to use the NodeJS request module to fire multiple concurrent requests at Jianshu, each request reading one page of my article list.

It turns out this approach breaks down once the number of concurrent requests exceeds 10: Jianshu rejects the extra requests and returns HTTP 429 (Too Many Requests) status codes.
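The rejection is visible as the status code in the request callback; below is a minimal standalone sketch of such a check (not part of the final solution, and the page number is arbitrary):

var request = require('request');

// Sketch: fetch a single page of the article list and report whether
// Jianshu throttled the request. A lone request normally succeeds; the
// 429 only shows up when many pages are fetched concurrently.
function fetchPage(pageNumber) {
  var url = "https://www.jianshu.com/u/99b8712e8850?order_by=shared_at&page=" + pageNumber;
  request({ url: url, headers: { "Accept": "text/html" } }, function (error, response, body) {
    if (error) {
      console.log("error: " + error);
    } else if (response.statusCode === 429) {
      console.log("page " + pageNumber + " rejected: HTTP 429 Too Many Requests");
    } else {
      console.log("page " + pageNumber + " ok, " + body.length + " bytes");
    }
  });
}

fetchPage(11);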

So I ended up using the simplest possible synchronous implementation, issuing the requests one by one in a loop with the sync-request module.

var request = require("sync-request");
var jsdom = require("jsdom");
var JSDOM = jsdom.JSDOM;
var textEncoding = require('text-encoding'); 
var textDecoder = textEncoding.TextDecoder;

const PREFIX = "https://www.jianshu.com";
const PAGE = "https://www.jianshu.com/u/99b8712e8850?order_by=shared_at&page=";
const MAX = 100;

var mArticleResult = new Map();
var lastPageReached = false;
var pageNumber;
/* a given article: https://www.jianshu.com/p/963cd23fb092 value got from API: /p/5c1d0319dc42 */

try {
    // use limited for loop to ease testing
    for (var i = 0; i < MAX; i++) {
        if( lastPageReached)
          break;
        pageNumber = i + 1;
        var url = PAGE + pageNumber;
        console.log("current page: " + url);
        var response = request('GET', url);
        var html = new textDecoder("utf-8").decode(response.body);
        handleResponseHTML(html);
    }
} catch (e) {
    console.log("error: " + e);
}

var articleIndex = 0;
var resultHTML = "<html>";

const fs = require('fs');

var index = 1;
for (var [key, value] of mArticleResult) {
    // key: full article URL, value: article title
    var article = "<p><a href=\"" + key + "\" >" + index++ + "." + value + "</a></p>" + "\n";
    resultHTML = resultHTML + article;
    console.log("Article[" + articleIndex++ + "]." + value + "=" + key);
}

resultHTML = resultHTML + "</html>";

var pwd = process.cwd() + "/jianshu.html";
fs.appendFileSync(pwd, resultHTML);
console.log("done");

function handleResponseHTML(html) {
    var document = new JSDOM(html).window.document;
    var content = document.getElementsByTagName("li");

    for (var i = 0; i < content.length; i++) {
        var li = content[i];
        var children = li.childNodes;
        for (var j = 0; j < children.length; j++) {
            var eachChild = children[j];
            if (eachChild.nodeName == "DIV") {
                var grandChild = eachChild.childNodes;
                for (var k = 0; k < grandChild.length; k++) {
                    var grand = grandChild[k];
                    if (grand.nodeName == "A") {
                        var fragment = grand.getAttribute("href");
                        if (fragment.indexOf("/p") < 0)
                            continue;
                        // console.log("title: " + grand.text);
                        var wholeURL = PREFIX + fragment;
                        // console.log("url: " + wholeURL);
                        if (mArticleResult.has(wholeURL)) {
                            // a URL seen before means Jianshu returned no new articles,
                            // i.e. the last page has been reached
                            lastPageReached = true;
                            console.log("article size: " + mArticleResult.size);
                            return;
                        }
                        mArticleResult.set(wholeURL, grand.text);
                    }
                }
            }
        }
    }
}

When executed, the NodeJS application generates an HTML file locally that contains the title and hyperlink of each article.
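Based on the generation code above, jianshu.html ends up looking roughly like this (the titles are placeholders; the URLs are the two example article links mentioned earlier):

<html><p><a href="https://www.jianshu.com/p/963cd23fb092" >1.some article title</a></p>
<p><a href="https://www.jianshu.com/p/5c1d0319dc42" >2.another article title</a></p>
...
</html>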

For more of Jerry's original articles, please follow the WeChat public account "Wang Zixi".