Author: Ji Zhi

ParserBlock

As we discussed in the previous ParserCore article, the normalize rule function is called first to unify the Linux ("\n") and Windows ("\r\n") newline characters into "\n". We then move on to the ParserBlock.parse process. This step mainly produces tokens whose block property is true, as shown below:

module.exports = function block(state) {
  var token;

  if (state.inlineMode) {
    token          = new state.Token('inline', '', 0);
    token.content  = state.src;
    token.map      = [ 0, 1 ];
    token.children = [];
    state.tokens.push(token);
  } else {
    state.md.block.parse(state.src, state.md, state.env, state.tokens);
  }
};

The parse function takes four arguments:

  1. state.src is the source string passed in by the user
  2. state.md is the current md instance, mainly so that properties and methods on md can be accessed conveniently
  3. state.env is extra data injected in the call to md.parse; it defaults to {} and you won't need it unless you are doing some custom development
  4. state.tokens is the tokens array reference. Note: you cannot replace the tokens reference inside a rule function; you must make sure all rule functions manipulate the same tokens array (a small calling sketch follows this list).
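
For reference, here is a minimal sketch (assuming markdown-it is installed; the parsed source and the token types in the comment are illustrative) of calling md.block.parse directly with these four arguments, exactly the way the core block rule above does:

var md = require('markdown-it')();

var tokens = [];
md.block.parse('# hi\n\nsome text', md, {}, tokens);

console.log(tokens.map(function (t) { return t.type; }));
// [ 'heading_open', 'inline', 'heading_close',
//   'paragraph_open', 'inline', 'paragraph_close' ]
// Note: the children of the inline tokens are still empty here, because ParserInline has not run yet.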

Let’s focus on the logic inside ParserBlock, which is located in lib/parser_block.js.

var _rules = [
  [ 'table',      require('./rules_block/table'),      [ 'paragraph', 'reference' ] ],
  [ 'code',       require('./rules_block/code') ],
  [ 'fence',      require('./rules_block/fence'),      [ 'paragraph', 'reference', 'blockquote', 'list' ] ],
  [ 'blockquote', require('./rules_block/blockquote'), [ 'paragraph', 'reference', 'blockquote', 'list' ] ],
  [ 'hr',         require('./rules_block/hr'),         [ 'paragraph', 'reference', 'blockquote', 'list' ] ],
  [ 'list',       require('./rules_block/list'),       [ 'paragraph', 'reference', 'blockquote' ] ],
  [ 'reference',  require('./rules_block/reference') ],
  [ 'heading',    require('./rules_block/heading'),    [ 'paragraph', 'reference', 'blockquote' ] ],
  [ 'lheading',   require('./rules_block/lheading') ],
  [ 'html_block', require('./rules_block/html_block'), [ 'paragraph', 'reference', 'blockquote' ] ],
  [ 'paragraph',  require('./rules_block/paragraph') ]
];

function ParserBlock() {

  this.ruler = new Ruler();

  for (var i = 0; i < _rules.length; i++) {
    this.ruler.push(_rules[i][0], _rules[i][1], { alt: (_rules[i][2] || []).slice() });
  }
}

ParserBlock.prototype.tokenize = function (state, startLine, endLine) {
  var ok, i,
      rules = this.ruler.getRules(''),
      len = rules.length,
      line = startLine,
      hasEmptyLines = false,
      maxNesting = state.md.options.maxNesting;

  while (line < endLine) {
    state.line = line = state.skipEmptyLines(line);
    if (line >= endLine) { break; }

    if (state.sCount[line] < state.blkIndent) { break; }

    if (state.level >= maxNesting) {
      state.line = endLine;
      break;
    }
    for (i = 0; i < len; i++) {
      ok = rules[i](state, line, endLine, false);
      if (ok) { break; }
    }

    state.tight = !hasEmptyLines;

    if (state.isEmpty(state.line - 1)) {
      hasEmptyLines = true;
    }

    line = state.line;

    if (line < endLine && state.isEmpty(line)) {
      hasEmptyLines = true;
      line++;
      state.line = line;
    }
  }
};

ParserBlock.prototype.parse = function (src, md, env, outTokens) {
  var state;

  if (!src) { return; }

  state = new this.State(src, md, env, outTokens);

  this.tokenize(state, state.line, state.lineMax);
};


ParserBlock.prototype.State = require('./rules_block/state_block');

As you can see from the constructor, ParserBlock has 11 kinds of rules: table, code, fence, blockquote, hr, list, reference, heading, lheading, html_block, and paragraph. After going through the rule chain made up of these rules, tokens of the corresponding types are produced. That is exactly what ParserBlock is for. The Ruler manages all rules and the chains each rule belongs to.

for (var i = 0; i < _rules.length; i++) {
  this.ruler.push(_rules[i][0], _rules[i][1], { alt: (_rules[i][2] || []).slice() });
}

_rules is a two-dimensional array; each of its elements is itself an array, which we will call ruleConfig for now. The first element of ruleConfig is the rule name, the second is the rule's fn, and the third is the rule's alt, i.e. the additional rule chains the rule belongs to. If alt is ['paragraph', 'reference'], then a call to ruler.getRules('paragraph') returns [fn], and a call to ruler.getRules('reference') also returns [fn], because fn's alt array contains both chains.
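
To make the alt behavior concrete, here is a minimal sketch (assuming markdown-it is installed and requiring its internal lib/ruler module directly; the two rules are stand-ins for illustration):

var Ruler = require('markdown-it/lib/ruler');

var ruler = new Ruler();
// fence declares two alt chains; paragraph declares none
ruler.push('fence',     function fence()     { /* ... */ }, { alt: [ 'paragraph', 'reference' ] });
ruler.push('paragraph', function paragraph() { /* ... */ });

console.log(ruler.getRules('').length);          // 2 -> every rule belongs to the default '' chain
console.log(ruler.getRules('paragraph').length); // 1 -> only fence listed 'paragraph' in its alt
console.log(ruler.getRules('reference').length); // 1 -> only fence listed 'reference' in its alt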

Look again at the parse method.

ParserBlock.prototype.parse = function (src, md, env, outTokens) {
  var state;

  if (!src) { return; }

  state = new this.State(src, md, env, outTokens);

  this.tokenize(state, state.line, state.lineMax);
};
ParserBlock.prototype.State = require('./rules_block/state_block');

Now for ParserBlock's State. Remember the State of ParserCore? Each Parser has a State class whose instance manages its state during parse. ParserBlock's State is located in lib/rules_block/state_block.js.

function StateBlock(src, md, env, tokens) {
  var ch, s, start, pos, len, indent, offset, indent_found;

  this.src = src;
  this.md  = md;
  this.env = env;

  this.tokens = tokens;

  this.bMarks = [];
  this.eMarks = [];
  this.tShift = [];
  this.sCount = [];

  this.bsCount = [];

  this.blkIndent  = 0;

  this.line       = 0;
  this.lineMax    = 0;
  this.tight      = false;
  this.ddIndent   = -1;
  this.parentType = 'root';

  this.level = 0;

  this.result = '';

  s = this.src;
  indent_found = false;

  for (start = pos = indent = offset = 0, len = s.length; pos < len; pos++) {
    ch = s.charCodeAt(pos);

    if (!indent_found) {
      if (isSpace(ch)) {
        indent++;

        if (ch === 0x09) {
          offset += 4 - offset % 4;
        } else {
          offset++;
        }
        continue;
      } else {
        indent_found = true;
      }
    }

    if (ch === 0x0A || pos === len - 1) {
      if (ch !== 0x0A) { pos++; }
      this.bMarks.push(start);
      this.eMarks.push(pos);
      this.tShift.push(indent);
      this.sCount.push(offset);
      this.bsCount.push(0);

      indent_found = false;
      indent = 0;
      offset = 0;
      start = pos + 1;
    }
  }

  this.bMarks.push(s.length);
  this.eMarks.push(s.length);
  this.tShift.push(0);
  this.sCount.push(0);
  this.bsCount.push(0);

  this.lineMax = this.bMarks.length - 1;
}

It is critical to understand the role of the properties on State, because these attributes are the information that tokenize depends on next. Focus on the following attributes (a small sketch after this list shows their values for a concrete source string):

  • tokens

    An array of tokens after tokenize

  • bMarks

    Stores the starting offset of each line, since the parse procedure scans src line by line based on newlines

  • eMarks

    Stores the ending offset of each line

  • tShift

    Stores the offset of the first non-space character on each line (a TAB counts as length 1 here)

  • sCount

    Stores the indentation width of each line (a TAB expands to the next multiple of 4)

  • bsCount

    Generally 0

  • blkIndent

    Generally 0

  • line

    The current line number; tokenize scans line by line

  • lineMax

    The number of lines src is split into
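
To make these concrete, here is a minimal sketch (assuming markdown-it is installed; the internal module path may differ between versions) that prints the caches StateBlock builds for a two-line source:

var MarkdownIt = require('markdown-it');
var StateBlock = require('markdown-it/lib/rules_block/state_block');

var md    = new MarkdownIt();
var state = new StateBlock('  abc\ndef', md, {}, []);

console.log(state.bMarks);  // [ 0, 6, 9 ] -> start offset of each line (plus a trailing fake entry)
console.log(state.eMarks);  // [ 5, 9, 9 ] -> end offset of each line
console.log(state.tShift);  // [ 2, 0, 0 ] -> offset of the first non-space character per line
console.log(state.sCount);  // [ 2, 0, 0 ] -> indent width per line (tabs expanded to 4)
console.log(state.lineMax); // 2           -> number of real lines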

These attributes are all heavily used during tokenize. Now take a look at the tokenize process, which generates tokens with block set to true.

ParserBlock.prototype.tokenize = function (state, startLine, endLine) {
  var ok, i,
      rules = this.ruler.getRules(''),
      len = rules.length,
      line = startLine,
      hasEmptyLines = false,
      maxNesting = state.md.options.maxNesting;

  while (line < endLine) {
    state.line = line = state.skipEmptyLines(line);
    if (line >= endLine) { break; }

    if (state.sCount[line] < state.blkIndent) { break; }

    if (state.level >= maxNesting) {
      state.line = endLine;
      break;
    }

    for (i = 0; i < len; i++) {
      ok = rules[i](state, line, endLine, false);
      if (ok) { break; }
    }

    state.tight = !hasEmptyLines;

    if (state.isEmpty(state.line - 1)) {
      hasEmptyLines = true;
    }

    line = state.line;

    if (line < endLine && state.isEmpty(line)) {
      hasEmptyLines = true;
      line++;
      state.line = line;
    }
  }
};

The execution flow of the function is as follows:

  1. getRules('') returns all the rule functions registered in the ParserBlock constructor; as specified in the Ruler class, every rule belongs to the default rule chain whose name is the empty string. The constructor also registers other rule chains (paragraph, reference, blockquote, and list), but they are not used at this point. A number of local variables are initialized at the same time.

  2. It then enters a while loop that walks src line by line, because state_block stores per-line information about the src string, such as the start position of each line, the end position of each line, and the offset of the first non-space character of each line. Individual rules rely on this information. The first part of the while body skips empty lines and guards against exceeding the maximum nesting level.

for (i = 0; i < len; i++) {
  ok = rules[i](state, line, endLine, false);
  if (ok) { break; }
}

In this loop, the rule chain is executed against the current line of src to generate tokens; as soon as one of the rules returns true, the loop breaks and the next line is tokenized. Let's see what these rules do. They are all located under the lib/rules_block folder.

  • table.js
module.exports = function table(state, startLine, endLine, silent) {
  var ch, lineText, pos, i, nextLine, columns, columnCount, token,
      aligns, t, tableLines, tbodyLines;

  if (startLine + 2 > endLine) { return false; }

  nextLine = startLine + 1;

  if (state.sCount[nextLine] < state.blkIndent) { return false; }

  if (state.sCount[nextLine] - state.blkIndent >= 4) { return false; }

  pos = state.bMarks[nextLine] + state.tShift[nextLine];
  if (pos >= state.eMarks[nextLine]) { return false; }

  ch = state.src.charCodeAt(pos++);
  if (ch !== 0x7C/* | */ && ch !== 0x2D/* - */ && ch !== 0x3A/* : */) { return false; }

  while (pos < state.eMarks[nextLine]) {
    ch = state.src.charCodeAt(pos);

    if (ch !== 0x7C/* | */ && ch !== 0x2D/* - */ && ch !== 0x3A/* : */ && !isSpace(ch)) { return false; }

    pos++;
  }

  lineText = getLine(state, startLine + 1);

  columns = lineText.split('|');
  aligns = [];
  for (i = 0; i < columns.length; i++) {
    t = columns[i].trim();
    if (!t) {
      if (i === 0 || i === columns.length - 1) {
        continue;
      } else {
        return false;
      }
    }

    if (!/^:?-+:?$/.test(t)) { return false; }
    if (t.charCodeAt(t.length - 1) === 0x3A/* : */) {
      aligns.push(t.charCodeAt(0) === 0x3A/* : */ ? 'center' : 'right');
    } else if (t.charCodeAt(0) === 0x3A/* : */) {
      aligns.push('left');
    } else {
      aligns.push('');
    }
  }

  lineText = getLine(state, startLine).trim();
  if (lineText.indexOf('|') === -1) { return false; }
  if (state.sCount[startLine] - state.blkIndent >= 4) { return false; }
  columns = escapedSplit(lineText.replace(/^\||\|$/g, ''));

  columnCount = columns.length;
  if (columnCount > aligns.length) { return false; }

  if (silent) { return true; }

  token     = state.push('table_open', 'table', 1);
  token.map = tableLines = [ startLine, 0 ];

  token     = state.push('thead_open', 'thead', 1);
  token.map = [ startLine, startLine + 1 ];

  token     = state.push('tr_open', 'tr', 1);
  token.map = [ startLine, startLine + 1 ];

  for (i = 0; i < columns.length; i++) {
    token          = state.push('th_open', 'th', 1);
    token.map      = [ startLine, startLine + 1 ];
    if (aligns[i]) {
      token.attrs  = [ [ 'style', 'text-align:' + aligns[i] ] ];
    }

    token          = state.push('inline', '', 0);
    token.content  = columns[i].trim();
    token.map      = [ startLine, startLine + 1 ];
    token.children = [];

    token          = state.push('th_close', 'th', -1);
  }

  token     = state.push('tr_close', 'tr', -1);
  token     = state.push('thead_close', 'thead', -1);

  token     = state.push('tbody_open', 'tbody', 1);
  token.map = tbodyLines = [ startLine + 2, 0 ];

  for (nextLine = startLine + 2; nextLine < endLine; nextLine++) {
    if (state.sCount[nextLine] < state.blkIndent) { break; }

    lineText = getLine(state, nextLine).trim();
    if (lineText.indexOf('|') === -1) { break; }
    if (state.sCount[nextLine] - state.blkIndent >= 4) { break; }
    columns = escapedSplit(lineText.replace(/^\||\|$/g, ''));

    token = state.push('tr_open', 'tr', 1);
    for (i = 0; i < columnCount; i++) {
      token          = state.push('td_open', 'td', 1);
      if (aligns[i]) {
        token.attrs  = [ [ 'style', 'text-align:' + aligns[i] ] ];
      }

      token          = state.push('inline', '', 0);
      token.content  = columns[i] ? columns[i].trim() : '';
      token.children = [];

      token          = state.push('td_close', 'td', -1);
    }
    token = state.push('tr_close', 'tr', -1);
  }
  token = state.push('tbody_close', 'tbody', -1);
  token = state.push('table_close', 'table', -1);

  tableLines[1] = tbodyLines[1] = nextLine;
  state.line = nextLine;
  return true;
};

The table rule is used to generate the table HTML string. The internal parsing follows the markdown table-writing syntax; the detailed logic will not be expanded here. If you are interested, you can write a demo and step through it with breakpoints.
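
As a quick usage sketch (assuming markdown-it is installed; the token list in the comment is abbreviated), a pipe/dash table produces the token sequence built above and renders to a <table>:

var md = require('markdown-it')();

var src = [
  '| name | age |',
  '| ---- | --: |',
  '| foo  | 42  |'
].join('\n');

console.log(md.parse(src, {}).map(function (t) { return t.type; }));
// [ 'table_open', 'thead_open', 'tr_open', 'th_open', 'inline', 'th_close', ...,
//   'tbody_open', 'tr_open', 'td_open', 'inline', 'td_close', ..., 'table_close' ]

console.log(md.render(src));
// <table>... with <td style="text-align:right">42</td> for the right-aligned column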

  • code.js
module.exports = function code(state, startLine, endLine/*, silent*/) {
  var nextLine, last, token;

  if (state.sCount[startLine] - state.blkIndent < 4) { return false; }

  last = nextLine = startLine + 1;

  while (nextLine < endLine) {
    if (state.isEmpty(nextLine)) {
      nextLine++;
      continue;
    }

    if (state.sCount[nextLine] - state.blkIndent >= 4) {
      nextLine++;
      last = nextLine;
      continue;
    }
    break;
  }

  state.line = last;

  token         = state.push('code_block', 'code', 0);
  token.content = state.getLines(startLine, last, 4 + state.blkIndent, true);
  token.map     = [ startLine, state.line ];

  return true;
};

The code rule is also very simple: any line indented by more than 3 spaces (i.e. 4 or more) is treated as part of a code_block, like the one below.

    I am now a code_block
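
Here is a small sketch (assuming markdown-it is installed) showing that a line indented by four or more spaces is emitted as a single code_block token and rendered as <pre><code>:

var md = require('markdown-it')();

var tokens = md.parse('    I am now a code_block', {});
console.log(tokens[0].type);    // 'code_block'
console.log(tokens[0].content); // 'I am now a code_block\n'

console.log(md.render('    I am now a code_block'));
// <pre><code>I am now a code_block
// </code></pre>
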
  • fence.js
module.exports = function fence(state, startLine, endLine, silent) {
  var marker, len, params, nextLine, mem, token, markup,
      haveEndMarker = false,
      pos = state.bMarks[startLine] + state.tShift[startLine],
      max = state.eMarks[startLine];

  if (state.sCount[startLine] - state.blkIndent >= 4) { return false; }

  if (pos + 3 > max) { return false; }

  marker = state.src.charCodeAt(pos);

  if (marker !== 0x7E/* ~ */ && marker !== 0x60/* ` */) {
    return false;
  }

  mem = pos;
  pos = state.skipChars(pos, marker);

  len = pos - mem;

  if (len < 3) { return false; }

  markup = state.src.slice(mem, pos);
  params = state.src.slice(pos, max);

  if (params.indexOf(String.fromCharCode(marker)) >= 0) { return false; }

  // Since start is found, we can report success here in validation mode
  if (silent) { return true; }

  // search end of block
  nextLine = startLine;

  for (;;) {
    nextLine++;
    if (nextLine >= endLine) {
      break;
    }

    pos = mem = state.bMarks[nextLine] + state.tShift[nextLine];
    max = state.eMarks[nextLine];

    if (pos < max && state.sCount[nextLine] < state.blkIndent) {
      break;
    }

    if (state.src.charCodeAt(pos) !== marker) { continue; }

    if (state.sCount[nextLine] - state.blkIndent >= 4) {
      continue;
    }

    pos = state.skipChars(pos, marker);

    if (pos - mem < len) { continue; }

    pos = state.skipSpaces(pos);

    if (pos < max) { continue; }

    haveEndMarker = true;
    // found!
    break;
  }

  len = state.sCount[startLine];

  state.line = nextLine + (haveEndMarker ? 1 : 0);

  token         = state.push('fence', 'code', 0);
  token.info    = params;
  token.content = state.getLines(startLine + 1, nextLine, len, true);
  token.markup  = markup;
  token.map     = [ startLine, state.line ];

  return true;
};


The fence rule is similar to the code rule; it represents a code block with a language type, such as javascript, shell, css, stylus, and so on. Here's an example:

```shell
echo 'done'
```

This parses into a token whose type is fence, whose info is shell, and whose markup is ``` (three backticks).
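
A small sketch (assuming markdown-it is installed) of the fence token described above:

var md = require('markdown-it')();

var src = '```shell\necho \'done\'\n```';
var token = md.parse(src, {})[0];

console.log(token.type);    // 'fence'
console.log(token.info);    // 'shell'
console.log(token.markup);  // '```'
console.log(token.content); // "echo 'done'\n"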

  • blockquote.js

The code is too long, so it is not posted here. blockquote is used to generate tokens whose markup is >. Here is a blockquote:

> i am a blockquote
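
A quick sketch (assuming markdown-it is installed) of the tokens this line produces:

var md = require('markdown-it')();

console.log(md.parse('> i am a blockquote', {}).map(function (t) { return t.type; }));
// [ 'blockquote_open', 'paragraph_open', 'inline', 'paragraph_close', 'blockquote_close' ]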

  • hr.js
module.exports = function hr(state, startLine, endLine, silent) {
  var marker, cnt, ch, token,
      pos = state.bMarks[startLine] + state.tShift[startLine],
      max = state.eMarks[startLine];

  // if it's indented more than 3 spaces, it should be a code block
  if (state.sCount[startLine] - state.blkIndent >= 4) { return false; }

  marker = state.src.charCodeAt(pos++);

  // Check hr marker
  if (marker !== 0x2A/* * */ && marker !== 0x2D/* - */ && marker !== 0x5F/* _ */) {
    return false;
  }

  // markers can be mixed with spaces, but there should be at least 3 of them

  cnt = 1;
  while (pos < max) {
    ch = state.src.charCodeAt(pos++);
    if (ch !== marker && !isSpace(ch)) { return false; }
    if (ch === marker) { cnt++; }
  }

  if (cnt < 3) { return false; }

  if (silent) { return true; }

  state.line = startLine + 1;

  token        = state.push('hr', 'hr', 0);
  token.map    = [ startLine, state.line ];
  token.markup = Array(cnt + 1).join(String.fromCharCode(marker));

  return true;
};

An hr rule is also simple: it generates a token of type hr. The markup can be ***, ---, or ___; all three syntaxes are parsed into the <hr> tag from the MD file.
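
A small sketch (assuming markdown-it is installed) showing all three markers producing an hr token:

var md = require('markdown-it')();

[ '***', '---', '___' ].forEach(function (src) {
  var token = md.parse(src, {})[0];
  console.log(token.type, token.markup); // 'hr' followed by '***', '---' or '___'
});

console.log(md.render('***')); // <hr>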

  • list.js

The list rule is used to parse ordered and unordered lists. The detailed logic is fairly complex; if you need to understand it, you can step through a demo with breakpoints. A small usage sketch follows.
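
A small usage sketch (assuming markdown-it is installed; note that in tight lists the paragraph tokens are emitted but marked hidden):

var md = require('markdown-it')();

console.log(md.parse('- a\n- b', {}).map(function (t) { return t.type; }));
// [ 'bullet_list_open',
//   'list_item_open', 'paragraph_open', 'inline', 'paragraph_close', 'list_item_close',
//   'list_item_open', 'paragraph_open', 'inline', 'paragraph_close', 'list_item_close',
//   'bullet_list_close' ]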

  • reference.js

The reference rule is used to parse link reference definitions, whose md syntax looks like [Reference]: http://www.baidu.con; reference-style links such as [text][Reference] are later resolved against these definitions.
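
A small sketch (assuming markdown-it is installed; the exact shape of env.references is an implementation detail and may vary between versions):

var md  = require('markdown-it')();
var env = {};

md.parse('[ref]: http://example.com\n\nsee [text][ref]', env);
console.log(env.references);
// roughly { REF: { title: '', href: 'http://example.com' } } -- the label is normalized (upper-cased)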

  • heading.js
module.exports = function heading(state, startLine, endLine, silent) {
  var ch, level, tmp, token,
      pos = state.bMarks[startLine] + state.tShift[startLine],
      max = state.eMarks[startLine];

  // if it's indented more than 3 spaces, it should be a code block
  if (state.sCount[startLine] - state.blkIndent >= 4) { return false; }

  ch  = state.src.charCodeAt(pos);

  if (ch !== 0x23/* # */ || pos >= max) { return false; }

  // count heading level
  level = 1;
  ch = state.src.charCodeAt(++pos);
  while (ch === 0x23/* # */ && pos < max && level <= 6) {
    level++;
    ch = state.src.charCodeAt(++pos);
  }

  if (level > 6 || (pos < max && !isSpace(ch))) { return false; }

  if (silent) { return true; }

  // Let's cut tails like ' ### ' from the end of string

  max = state.skipSpacesBack(max, pos);
  tmp = state.skipCharsBack(max, 0x23, pos); // #
  if (tmp > pos && isSpace(state.src.charCodeAt(tmp - 1))) {
    max = tmp;
  }

  state.line = startLine + 1;

  token        = state.push('heading_open', 'h' + String(level), 1);
  token.markup = '########'.slice(0, level);
  token.map    = [ startLine, state.line ];

  token          = state.push('inline', '', 0);
  token.content  = state.src.slice(pos, max).trim();
  token.map      = [ startLine, state.line ];
  token.children = [];

  token        = state.push('heading_close', 'h' + String(level), -1);
  token.markup = '########'.slice(0, level);

  return true;
};

heading is used to parse the heading tags (h1 to h6). Its syntax is #, ##, and so on.
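
A small sketch (assuming markdown-it is installed) of the heading tokens:

var md = require('markdown-it')();

var tokens = md.parse('## title', {});
console.log(tokens.map(function (t) { return t.type + ':' + t.tag; }));
// [ 'heading_open:h2', 'inline:', 'heading_close:h2' ]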

  • lheading.js

lheading parses setext-style headings, where the heading text is underlined with a delimiter line, as shown below:

This is a heading
========

// It will render as <h1>This is a heading</h1>
  • html_block.js

html_block parses HTML. If you write an HTML tag inside md (with the html option enabled), you still get the HTML string back. For example, if you write a string like this:

let src = "<p>234</p>"

// Get the following token

let token = [
  {
    "type": "html_block",
    "tag": "",
    "attrs": null,
    "map": [ 0, 1 ],
    "nesting": 0,
    "level": 0,
    "children": null,
    "content": "<p>234</p>",
    "markup": "",
    "info": "",
    "meta": null,
    "block": true,
    "hidden": false
  }
]

// The final output string is likewise `<p>234</p>`
  • paragraph.js
module.exports = function paragraph(state, startLine/*, endLine*/) {
  var content, terminate, i, l, token, oldParentType,
      nextLine = startLine + 1,
      terminatorRules = state.md.block.ruler.getRules('paragraph'),
      endLine = state.lineMax;

  oldParentType = state.parentType;
  state.parentType = 'paragraph';

  // jump line-by-line until empty one or EOF
  for (; nextLine < endLine && !state.isEmpty(nextLine); nextLine++) {
    // this would be a code block normally, but after paragraph
    // it's considered a lazy continuation regardless of what's there
    if (state.sCount[nextLine] - state.blkIndent > 3) { continue; }

    // quirk for blockquotes, this line should already be checked by that rule
    if (state.sCount[nextLine] < 0) { continue; }

    // Some tags can terminate paragraph without empty line.
    terminate = false;
    for (i = 0, l = terminatorRules.length; i < l; i++) {
      if (terminatorRules[i](state, nextLine, endLine, true)) {
        terminate = true;
        break;
      }
    }
    if (terminate) { break; }
  }

  content = state.getLines(startLine, nextLine, state.blkIndent, false).trim();

  state.line = nextLine;

  token          = state.push('paragraph_open', 'p', 1);
  token.map      = [ startLine, state.line ];

  token          = state.push('inline', '', 0);
  token.content  = content;
  token.map      = [ startLine, state.line ];
  token.children = [];

  token          = state.push('paragraph_close', 'p', -1);

  state.parentType = oldParentType;

  return true;
};

The paragraph rule is very simple and frequently used; it generates p tags.
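
A small sketch (assuming markdown-it is installed) of the paragraph tokens and output:

var md = require('markdown-it')();

console.log(md.parse('hello world', {}).map(function (t) { return t.type; }));
// [ 'paragraph_open', 'inline', 'paragraph_close' ]

console.log(md.render('hello world')); // <p>hello world</p>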

Conclusion

In summary, the ParserBlock process is complicated and tedious. First, it has its own StateBlock, which stores much of the information that ParserBlock needs during tokenize, computed line by line from the newlines in src. The tokenize process then applies the different rule functions to the string line by line to generate tokens of the corresponding types, which completes ParserBlock's parse process.

After ParserBlock processing, tokens of type inline may be generated. A token of this type is not yet fully parsed. Here's an example:

const src = '__ad__'

// After parse

const generatedTokens = [
  {
    "type": "paragraph_open"."tag": "p". }, {"type": "inline"."tag": ""."attrs": null."map": [
      0.1]."nesting": 0."level": 1."children": [{"type": "text"."tag": "". }, {"type": "strong_open"."tag": "strong". }, {"type": "text"."tag": "". }, {"type": "strong_close"."tag": "strong". }, {"type": "text"."tag": "". }]."content": "__ad__"."markup": ""."info": ""."meta": null."block": true."hidden": false
  },
  {
    "type": "paragraph_close". }]// The second token of the array is inline, notice that it has a children attribute
// The token on the children attribute is derived from the token.
Copy the code

__ad__ is the content attribute of the second token, which means the bold syntax has not been parsed yet, so ParserBlock alone is not sufficient. We also need more fine-grained tokens, and that is where ParserInline comes in: it compiles tokens of type inline and places finer-grained tokens on their children property, which is where the children value of the second item of generatedTokens comes from.