此文已由作者杨介正授权网易云社区发布。
欢迎访问网易云社区,了解更多网易技术产品运营经验。
只是读书总结,并不是发明,但伴随一点点创造,不对之处欢迎大家交流指正。开门见山,实现虚拟树需要分两步:第一步是写一个虚拟DOM类,第二步是写一个html解析器去解析html字符串,然后以虚拟DOM为节点建立一棵虚拟树。
<div id="a">hello</div>
如何解析上面这段html字符串呢,也就是说如何去验证上面这段html的正确性并提取出有用的关键信息如<div、id="a"等然后转化成我们想要的虚拟DOM树。开动脑筋想,方法肯定是N多。这里介绍一种学来加上自己简化的方法:分为两步,用lexer(词法分析器)去提取关键词<div、id="a"、>、hello、</div>,并验证html的正确性。然后用parser(语法分析器)去分析lexer提取来的关键词并根据html标准解析成虚拟DOM树。为什么要分两步,不能边提取边解析吗?也可以。但是html的结构可能本身是错误的,如果把这个错误在lex阶段就中断掉,不是更好吗?那么问题来了,lexer怎么写?简单地说就是遍历html字符串,根据“当前状态”提取关键词。遍历html字符串可以用while循环加string.charAt方法,这里的当前状态重点解释一下。如果大家接触过状态机,就会很好理解,没有接触过的话(我也没接触过),也没关系,我通俗地解释一下。比如有beginStartTag(开始标签开始状态,如<div)、attribute(属性状态,如id="a")、endStartTag(开始标签结束状态,如>)、beginEndTag(结束标签开始状态,如</)、endEndTag(结束标签结束状态,如>)5个状态。这5个状态之间的关系如下图
[
{
type: 'startTag',
tagName: 'div',
nodeType: '1'
},
{
type: 'attribute',
name: 'id',
value: 'a',
nodeType: '2'
},
{
type: 'endTag'
},
]
/**
 * AMD module that turns an HTML string into a tree of VDom nodes.
 *
 * Two classes are built with EventEmitter._$extend:
 *   - Lexer: a character-level state machine that emits a flat token list
 *     ('startTag' / 'attribute' / 'endTag').
 *   - HtmlParser: consumes that token list and links VDom nodes together
 *     through appendChild / setAttribute.
 *
 * Only HtmlParser is returned from the module factory.
 */
define([
    './eventEmitter',
    '../util/util',
    './vDom'
],function(EventEmitter, u, VDom){
    var
        Lexer = EventEmitter._$extend({
            /**
             * Puts the lexer in its initial state.
             * @param {Object} [options] forwarded untouched to the parent initializer
             */
            __init: function(options){
                this.__super(options);
                this.__state = 'beginStartTag';
                // stack of currently open tag names, innermost first
                this.__tagName = [];
            },

            /**
             * Reports a syntax error: marks the lexer as failed and throws.
             * The message carries up to 20 characters of input around the cursor.
             * @param {String} type label of the state/branch that detected the problem
             * @throws {Error} always
             */
            _error: function(type){
                var
                    context = '',
                    ch,
                    offset;
                for(offset = -10; offset < 10; offset++){
                    ch = this._peek(offset);
                    // _peek returns undefined past the end of input; skipping it keeps
                    // the literal text "undefined" out of the message
                    if(ch !== undefined){
                        context += ch;
                    }
                }
                // must happen before the throw, otherwise it is never executed
                this.__state = 'error';
                throw new Error('html syntax error at position' + this.__index + ' and constructor:' + context + ' status: ' + type);
            },

            /**
             * State handlers, keyed by state name. Each one is invoked by _$lex with
             * `this` bound to the lexer and the current (non-whitespace) character.
             * Cursor convention: a handler leaves __index on the LAST character it
             * consumed; _$lex then advances by one.
             */
            _states: {
                /**
                 * Between tags: the current character starts a text run, an opening
                 * tag ("<name") or a closing tag ("</").
                 */
                beginStartTag: function(ch){
                    var
                        name;
                    if(ch != '<'){
                        // Text run: emitted as a startTag/endTag pair so the parser can
                        // treat it like any other node.
                        // NOTE(review): nodeType is the number 3 here but the string '1'
                        // for elements; left as-is because VDom's expectations are not
                        // visible from this file.
                        this.__tokens.push({
                            type: 'startTag',
                            nodeType: 3,
                            tagName: 'text',
                            text: this.__readIdent(ch, function(c){
                                return c != '<';
                            })
                        });
                        this.__tokens.push({
                            type: 'endTag',
                            tagName: 'text'
                        });
                        this.__state = 'beginStartTag';
                    }else if(this._peek(1) == '/'){
                        this.__state = 'beginEndTag';
                        // step onto '/', so the next handler starts on the tag name
                        this.__index ++;
                    }else if(this._isIdent(this._peek(1))){
                        name = this.__readIdent();
                        this.__tokens.push({
                            type: 'startTag',
                            tagName: name,
                            nodeType: '1'
                        });
                        this.__state = 'attribute';
                        this.__tagName.unshift(name);
                    }else {
                        this._error('beginStartTag');
                    }
                },

                /**
                 * Inside an opening tag, after the tag name. Handles the end of the
                 * tag ('>' or, for img/input, '/>') or reads one attribute: a name
                 * optionally followed by = and a single- or double-quoted value.
                 * A name without a value yields an attribute token with value ''.
                 */
                attribute: function(ch){
                    var
                        tagName = this.__tagName[0],
                        canSelfClose = (tagName == 'img' || tagName == 'input'),
                        name,
                        value = '',
                        beginAttributeValue = false,
                        quote = '';

                    if(ch == '/' && this._peek(1) == '>' && canSelfClose){
                        this.__state = 'endSelfEndTag';
                        return;
                    }
                    if(ch == '>'){
                        this.__state = 'beginStartTag';
                        return;
                    }
                    if(!this._isIdent(ch)){
                        // without this guard a token with an undefined name was emitted
                        this._error('attribute');
                    }
                    name = this.__readIdent(ch);

                    while(this.__index < this.__length){
                        // always one character of lookahead: the cursor stays on the
                        // last consumed character
                        ch = this._peek(1);
                        if(quote){
                            // Checked first so that '/' or '>' inside a quoted value are
                            // treated as data rather than as the end of the tag.
                            value = this.__readIdent('', function(c){
                                return c != quote;
                            });
                            if(this._peek(1) != quote){
                                // input ended before the closing quote
                                this._error('beginAttributeValue');
                            }
                            // step onto the closing quote
                            this.__index ++;
                            this.__state = 'attribute';
                            break;
                        }
                        if(ch == '/' && this._peek(2) == '>' && canSelfClose){
                            // step onto '/', so that endSelfEndTag is handed the '>'
                            this.__index ++;
                            this.__state = 'endSelfEndTag';
                            break;
                        }
                        if(ch == '>'){
                            this.__state = 'endStartTag';
                            break;
                        }
                        if(this._isWhite(ch)){
                            this.__index ++;
                            continue;
                        }
                        if(!beginAttributeValue){
                            if(this._isIdent(ch)){
                                // the next attribute starts: this one has no value
                                this.__state = 'attribute';
                                break;
                            }
                            if(ch != '='){
                                this._error('!beginAttributeValue');
                            }
                            beginAttributeValue = true;
                            this.__index ++;
                        }else{
                            if(ch != '\'' && ch != '\"'){
                                // unquoted values are not supported
                                this._error('beginAttributeValue');
                            }
                            quote = ch;
                            this.__index ++;
                        }
                    }
                    this.__tokens.push({
                        type: 'attribute',
                        name: name,
                        value: value
                    });
                },

                /**
                 * Entered when the attribute loop saw '>' right after a name or '=';
                 * consumes the character it is given and returns to the between-tags state.
                 */
                endStartTag: function(ch){
                    this.__state = 'beginStartTag';
                },

                /**
                 * First character after "</": reads the closing tag's name and checks
                 * it against the innermost open tag.
                 */
                beginEndTag: function(ch){
                    // declared locally; it used to leak as an implicit global
                    var
                        name = this.__readIdent(ch);
                    if(name == this.__tagName[0]){
                        this.__state = 'endEndTag';
                    }else{
                        this._error('beginEndTag');
                    }
                },

                /**
                 * Expects the '>' that finishes a closing tag, emits the endTag token
                 * and pops the open-tag stack.
                 */
                endEndTag: function(ch){
                    if(ch == '>'){
                        this.__state = 'beginStartTag';
                        this.__tokens.push({
                            type: 'endTag',
                            tagName: this.__tagName[0]
                        });
                        this.__tagName.shift();
                    }else{
                        this._error('endEndTag');
                    }
                },

                /**
                 * Consumes the '>' of a self-closing "/>" and closes the innermost
                 * tag as if an explicit end tag had been seen.
                 */
                endSelfEndTag: function(ch){
                    this.__state = 'beginStartTag';
                    this.__tokens.push({
                        type: 'endTag',
                        tagName: this.__tagName[0]
                    });
                    this.__tagName.shift();
                }
            },

            /**
             * Looks at a character relative to the cursor without moving it.
             * @param  {Number} [num=0] offset from the current index
             * @return {String|undefined} the character, or undefined at or past the end
             */
            _peek: function(num){
                num = num || 0;
                if(this.__index + num < this.__text.length){
                    return this.__text.charAt(this.__index + num);
                }
                return undefined;
            },

            /**
             * @param  {String} chars candidate set
             * @param  {String} [ch]  value to look for; falls back to the current character
             * @return {Boolean} whether chars.indexOf finds the value
             */
            _is: function(chars, ch){
                return chars.indexOf(ch || this.__ch) >= 0;
            },

            /** @return {Boolean} true for space, \n, \r, \t and \v */
            _isWhite: function(ch){
                return ch == ' ' || ch == '\n' || ch == '\r' || ch == '\t' || ch == '\v';
            },

            /** @return {Boolean} negation of _isWhite */
            _isNotWhite: function(ch){
                return !this._isWhite(ch);
            },

            /** @return {Boolean} true for '0'..'9' */
            _isNumber: function(ch){
                return ch >= '0' && ch <= '9';
            },

            /** @return {Boolean} true for a single or double quote character */
            _isString: function(ch){
                return ch == '\'' || ch == '\"';
            },

            /** @return {Boolean} true for characters that may START a name: a-z, A-Z, _ */
            _isIdent: function(ch){
                return ch >= 'a' && ch <= 'z' || ch >= 'A' && ch <= 'Z' || ch == '_';
            },

            /**
             * @return {Boolean} true for characters that may CONTINUE a name: anything
             *                   accepted by _isIdent plus digits and '-', so names such
             *                   as h1 or data-id are read as a whole
             */
            _isIdentPart: function(ch){
                return this._isIdent(ch) || this._isNumber(ch) || ch == '-';
            },

            /**
             * Tokenizes an HTML string.
             * Whitespace met between handler calls is skipped here; whitespace inside
             * a text run or a quoted value is read by the handlers and kept.
             * @param  {String} text HTML source
             * @return {Array} token objects, in source order
             * @throws {Error} on the first syntax error (see _error)
             */
            _$lex: function(text){
                this.__tokens = [];
                this.__text = text;
                this.__length = text.length;
                this.__index = 0;
                this.__ch = undefined;
                // start every run from a clean state so an instance can be reused,
                // including after a previous run ended in an error
                this.__state = 'beginStartTag';
                this.__tagName = [];
                while(this.__index < this.__length && this.__state != 'error'){
                    this.__ch = this._peek();
                    if(!this._isWhite(this.__ch)){
                        this._states[this.__state].call(this, this.__ch);
                    }
                    this.__index ++;
                }
                return this.__tokens;
            },

            /**
             * Reads a run of characters that follow the cursor.
             * @param  {String}   [ch]  text to start the result with (normally the
             *                          character under the cursor)
             * @param  {Function} [_is] predicate deciding whether the next character
             *                          belongs to the run; defaults to _isIdentPart
             * @return {String} the collected text; the cursor is left on the last
             *                  character that was consumed
             */
            __readIdent: function(ch, _is){
                var
                    collected = ch || '',
                    accepts = _is || this._isIdentPart,
                    next;
                // stop before stepping past the input: _peek would return undefined
                // there, which a predicate such as `c != '<'` would accept
                while(this.__index + 1 < this.__length){
                    next = this._peek(1);
                    if(!accepts.call(this, next)){
                        break;
                    }
                    collected += next;
                    this.__index ++;
                }
                return collected;
            }
        });

    var
        HtmlParser = EventEmitter._$extend({
            __init: function(){
                this.__super();
            },

            /**
             * Parses an HTML string into a VDom tree.
             * @param  {String} text HTML source
             * @return {VDom|undefined} the node closed (or, failing that, opened) last;
             *                          for a well-formed single-root document that is
             *                          the root. undefined when there are no tokens.
             * @throws {Error} when the lexer rejects the input
             */
            _$parse: function(text){
                this.__lexer = new Lexer();
                this.__tokens = this.__lexer._$lex(text);
                return this._statements();
            },

            /** Entry point of the grammar; currently just delegates to _consume. */
            _statements: function(){
                return this._consume();
            },

            /**
             * Drains this.__tokens and builds the tree.
             * @return {VDom|undefined} see _$parse
             */
            _consume: function(){
                var
                    token,
                    type,
                    node,
                    // stack of open nodes, innermost first
                    parentNode = [];
                while(this.__tokens.length){
                    token = this.__tokens.shift();
                    type = token.type;
                    if(type == 'startTag'){
                        node = new VDom(token);
                        if(parentNode[0]){
                            parentNode[0].appendChild(node);
                        }
                        parentNode.unshift(node);
                    }else if(type == 'attribute'){
                        // attribute tokens directly follow their startTag, so `node`
                        // is still the element they belong to
                        node.setAttribute(token.name, token.value);
                    }else if(type == 'endTag'){
                        node = parentNode.shift();
                    }
                }
                return node;
            }
        });

    return HtmlParser;
});
网易云免费体验馆,0成本体验20+款云产品!
更多网易技术、产品、运营经验分享请点击。