编程手记之ANSI C篇-(四)XML解析自动机
在实现了通用连接件、哈希表和二叉分析树后,我可以用这些功能做一个XML分析工具了。XML以其标准的结构,严谨的文法,强大的描述能力,以及独立和开放性,在描述数据资源方面得到了广泛的应用。在各种语言中,XML文档(DOM)的分析工具已层出不穷,在此,从文法的角度,来完成一个XML分析自动机的C实现。
1、XML基本文法:
/*XML脚本有XML的声明和一个根结点构成*/
XMLScripts --> XMLNote + XMLEntity
/*XML声明由XML标识首部、零个或多个XML属性节组成,之间空格符隔开,最后是XML标识尾部*/
XMLNote --> '<?xml' + { BLANK + XMLAttr} + '?>'
/*XML节点由节点首部、节点文本、零个或多个子节点和节点尾部组成,没有文本、子节点的节点称为空节点*/
XMLEntity --> TagHead + [ TagText + {XMLEntity} + TagTail ]
/*节点首部由节点开始符、节点名称、零个或多个属性节组成,没有文本和子节点的节点可由终止符后缀*/
TagHead --> '<' + TagName + {BLANK + XMLAttr} + ( '/>' | '>' )
/*节点文本由字符串组成*/
TagText --> {'a' | 'b' | 'c' | ... | '0' | '1' | ...}
/*节点尾部由节点终止符和节点名称组成*/
TagTail --> '</' + TagName + '>'
/*节点名称由字符串组成*/
TagName --> {'a' | 'b' | 'c' | ... | '0' | '1' | ...}
/*属性节由属性名称、赋值符、首尾引号和属性值组成*/
XMLAttr --> AttrName + '=' + '"' + AttrValue + '"'
/*属性名称由字符串组成*/
AttrName --> {'a' | 'b' | 'c' | ... | '0' | '1' | ...}
/*属性值由字符串组成*/
AttrValue --> {'a' | 'b' | 'c' | ... | '0' | '1' | ...}
/*空格符由下列符号组成*/
BLANK --> {' ' | '\t' | '\r' | '\n'}
2、XML解析自动机的定义:
/* Status codes stored in XMLMac.retcode by the parsing routines:
 * XP_SUCCESS  - the current parsing step finished normally,
 * XP_CONTINUE - parsing is still in progress,
 * XP_ERROR    - the input could not be parsed. */
#define XP_SUCCESS 0
#define XP_CONTINUE 1
#define XP_ERROR -1
/* Operation code recorded in XMLMac.act: the parser sets paChild before
 * parsing a child node, paSibling after closing a node, and paAttr before
 * parsing an attribute set. */
typedef enum{paChild = 0,paSibling = 1,paAttr = 2}XMLParseAction;
/* State of the XML parsing automaton, shared by all parsing routines. */
typedef struct tagXMLMac{
LINKPTR tree; /* binary tree that keeps node relationships and stores node properties */
LINKPTR parent; /* parent node of the position being parsed; used for backtracking */
XMLParseAction act; /* current operation code of the automaton */
int retcode; /* status code of the automaton (XP_SUCCESS / XP_CONTINUE / XP_ERROR) */
TCHAR* token; /* pointer to the current parse position in the input string */
}XMLMac;
/* Keys of the built-in properties written on each tree node:
 * NODENAME holds the element name and NODETEXT the element text.
 * NODETYPE is defined here but not used by the routines in this section. */
#define NODENAME _T("NodeName")
#define NODETEXT _T("NodeText")
#define NODETYPE _T("NodeType")
/* Frequently used fixed symbols: the namespace property key / prefix,
 * the namespace separator, the assignment sign and the quote character. */
#define XMLNS _T("xmlns:")
#define NSS _T(':')
#define ASIGN _T('=')
#define QUATE _T('"')
/* Sentinel that ends each terminator table; it is distinct from _T('\0')
 * so that the string terminator itself can be listed as a table entry. */
#define NILL _T('\x02')
/* Characters that _IsBlankSign treats as blanks; the table ends at NILL.
 * Every literal is wrapped in _T() so the table is spelled the same way as
 * the other terminator tables in both narrow and wide builds.
 * NOTE(review): '?' is listed as a blank, presumably so that the "?>" that
 * closes a declaration is skipped like whitespace - confirm this is intended,
 * because it also drops leading '?' characters from node text. */
static TCHAR BlankSign[] = {_T(' '),_T('?'),_T('\t'),_T('\r'),_T('\n'),NILL};
/* Characters that end a node text run: the start of the next tag or the end of input. */
static TCHAR TextTerm[] = {_T('<'),_T('\0'),NILL};
/* Characters that end an attribute value: the closing quote or the end of input. */
static TCHAR AttrTerm[] = {_T('"'),/*_T('>'),*/_T('\0'),NILL};
/* Characters that end the name part of a tag header: whitespace, '/', '>' or the end of input. */
static TCHAR TagHeadTerm[] = {_T(' '),_T('/'),_T('>'),_T('\t'),_T('\r'),_T('\n'),_T('\0'),NILL};
/* Characters that end a closing tag: '>' or the end of input. */
static TCHAR TagTailTerm[] = {_T('>'),_T('\0'),NILL};
/* Kinds of markup recognised at a '<' position, and the literal prefix that
 * identifies each kind (see _TagType). */
typedef enum{ttENT = 0,ttXML = 1,ttCMT = 2, ttELE = 3, ttCDA = 4, ttDOC = 5, ttEXT = 6, ttNOT = 7}TagType;
#define TT_ENT _T("<") /* ordinary element tag */
#define TT_XML _T("<?xml") /* XML declaration */
#define TT_CMT _T("<!--") /* comment */
#define TT_ELE _T("<!ELEMENT") /* DTD element declaration */
#define TT_CDA _T("<![CDATA[") /* CDATA section (literal data fragment) */
#define TT_DOC _T("<!DOCTYPE") /* document type (DTD) declaration */
#define TT_EXT _T("<!ENTITY") /* entity declaration */
#define TT_NOT _T("<!NOTATION") /* notation declaration */
3、定义解析过程实现:
/* Test whether ch is one of the blank characters listed in BlankSign.
 * Returns 1 when ch matches an entry located before the NILL sentinel,
 * 0 otherwise. */
int _IsBlankSign(TCHAR ch)
{
    const TCHAR* entry;

    for(entry = BlankSign; *entry != NILL; entry++)
    {
        if(*entry == ch)
            return 1;
    }
    return 0;
}
/* Test whether ch ends the name part of a tag header, i.e. whether it is
 * listed in TagHeadTerm before the NILL sentinel.
 * Returns 1 on a match, 0 otherwise. */
int _IsTagHeadTerm(TCHAR ch)
{
    const TCHAR* entry;

    for(entry = TagHeadTerm; *entry != NILL; entry++)
    {
        if(*entry == ch)
            return 1;
    }
    return 0;
}
/* Test whether ch ends a closing tag, i.e. whether it is listed in
 * TagTailTerm before the NILL sentinel.
 * Returns 1 on a match, 0 otherwise. */
int _IsTagTailTerm(TCHAR ch)
{
    const TCHAR* entry;

    for(entry = TagTailTerm; *entry != NILL; entry++)
    {
        if(*entry == ch)
            return 1;
    }
    return 0;
}
/* Test whether ch ends a node text run, i.e. whether it is listed in
 * TextTerm before the NILL sentinel.
 * Returns 1 on a match, 0 otherwise. */
int _IsTextTerm(TCHAR ch)
{
    const TCHAR* entry;

    for(entry = TextTerm; *entry != NILL; entry++)
    {
        if(*entry == ch)
            return 1;
    }
    return 0;
}
/* Test whether ch ends an attribute value, i.e. whether it is listed in
 * AttrTerm before the NILL sentinel.
 * Returns 1 on a match, 0 otherwise. */
int _IsAttrTerm(TCHAR ch)
{
    const TCHAR* entry;

    for(entry = AttrTerm; *entry != NILL; entry++)
    {
        if(*entry == ch)
            return 1;
    }
    return 0;
}
/* Test whether the string key starts with the XML namespace prefix XMLNS.
 *
 * key: NUL-terminated string to inspect; it is not modified.
 * Returns 1 when key begins with XMLNS, 0 otherwise (including key == NULL).
 *
 * The prefix is compared, not copied: the earlier _tcsncpy call overwrote
 * the caller's string and its result (the destination pointer) never
 * compared equal to 0, so the function always reported "no namespace". */
int _IsNameSpace(TCHAR* key)
{
    if(key == NULL)
        return 0;

    return (_tcsncmp(key,XMLNS,_tcslen(XMLNS)) == 0) ? 1 : 0;
}
/* Classify the markup that starts at sz by its literal prefix.
 *
 * Returns the matching TagType value, or -1 when sz starts with none of the
 * known prefixes. The table is searched in order; the bare "<" entry must
 * stay last because every other prefix also begins with '<'. */
int _TagType(TCHAR* sz)
{
    static const struct
    {
        const TCHAR* prefix;
        int kind;
    } prefixes[] = {
        { TT_XML, ttXML },
        { TT_CMT, ttCMT },
        { TT_ELE, ttELE },
        { TT_CDA, ttCDA },
        { TT_DOC, ttDOC },
        { TT_EXT, ttEXT },
        { TT_NOT, ttNOT },
        { TT_ENT, ttENT }
    };
    unsigned int i;

    for(i = 0; i < sizeof(prefixes) / sizeof(prefixes[0]); i++)
    {
        if(_tcsncmp(sz,prefixes[i].prefix,_tcslen(prefixes[i].prefix)) == 0)
            return prefixes[i].kind;
    }
    return -1;
}
/* Skip leading blank characters (as defined by _IsBlankSign).
 *
 * Returns a pointer to the first non-blank character, or NULL when the end
 * of the string is reached before any non-blank character is found. */
TCHAR* _XMLSkipBlank(TCHAR* szXML)
{
    TCHAR* cur;

    for(cur = szXML; _IsBlankSign(*cur); cur++)
        ;

    return (*cur == _T('\0')) ? NULL : cur;
}
/* Skip an XML declaration such as <?xml ... ?>.
 *
 * szXML must point at the "<?xml" prefix. Scanning stops at the first '>'.
 * Returns the position just past that '>', or a pointer to the string
 * terminator when no '>' is found. */
TCHAR* _XMLSkipXML(TCHAR* szXML)
{
    TCHAR* cur = szXML + _tcslen(TT_XML);

    for(;;)
    {
        if(*cur == _T('\0'))
            return cur;
        if(*cur == _T('>'))
            return cur + 1; /* step over the closing '>' */
        cur++;
    }
}
/* Skip a comment such as <!-- ... -->.
 *
 * szXML must point at the "<!--" prefix. The comment ends at the first
 * "-->" sequence, so a '>' inside the comment body no longer ends it early.
 * Returns the position just past "-->", or a pointer to the string
 * terminator when the comment is not closed. */
TCHAR* _XMLSkipCMT(TCHAR* szXML)
{
    TCHAR* token = szXML + _tcslen(TT_CMT);

    while(*token != _T('\0'))
    {
        /* token[1] and token[2] are read only after the preceding character
         * was found to be '-', so the scan never looks past the terminator */
        if(token[0] == _T('-') && token[1] == _T('-') && token[2] == _T('>'))
            return token + 3; /* step over "-->" */
        token ++;
    }
    return token;
}
/* Skip an element declaration such as <!ELEMENT ...>.
 *
 * szXML must point at the "<!ELEMENT" prefix. Scanning stops at the first
 * '>'. Returns the position just past that '>', or a pointer to the string
 * terminator when no '>' is found. */
TCHAR* _XMLSkipELE(TCHAR* szXML)
{
    TCHAR* cur = szXML + _tcslen(TT_ELE);

    for(;;)
    {
        if(*cur == _T('\0'))
            return cur;
        if(*cur == _T('>'))
            return cur + 1; /* step over the closing '>' */
        cur++;
    }
}
/* Skip a CDATA section such as <![CDATA[ ... ]]>.
 *
 * szXML must point at the "<![CDATA[" prefix. The section ends at the first
 * "]]>" sequence; a lone ']' or "]]" inside the data does not end it.
 * Returns the position just past "]]>", or a pointer to the string
 * terminator when the section is not closed.
 *
 * The whole "]]>" is consumed: leaving the '>' behind made the caller see a
 * stray '>' and report an error after every CDATA section. */
TCHAR* _XMLSkipCDA(TCHAR* szXML)
{
    TCHAR* token = szXML + _tcslen(TT_CDA);

    while(*token != _T('\0'))
    {
        /* token[1] and token[2] are read only after the preceding character
         * was found to be ']', so the scan never looks past the terminator */
        if(token[0] == _T(']') && token[1] == _T(']') && token[2] == _T('>'))
            return token + 3; /* step over "]]>" */
        token ++;
    }
    return token;
}
/* Skip a document type declaration such as <!DOCTYPE ...>.
 *
 * szXML must point at the "<!DOCTYPE" prefix. Scanning stops at the first
 * '>'. Returns the position just past that '>', or a pointer to the string
 * terminator when no '>' is found. */
TCHAR* _XMLSkipDOC(TCHAR* szXML)
{
    TCHAR* cur = szXML + _tcslen(TT_DOC);

    for(;;)
    {
        if(*cur == _T('\0'))
            return cur;
        if(*cur == _T('>'))
            return cur + 1; /* step over the closing '>' */
        cur++;
    }
}
/* Skip an entity declaration such as <!ENTITY ...>.
 *
 * szXML must point at the "<!ENTITY" prefix. Scanning stops at the first
 * '>'. Returns the position just past that '>', or a pointer to the string
 * terminator when no '>' is found. */
TCHAR* _XMLSkipEXT(TCHAR* szXML)
{
    TCHAR* cur = szXML + _tcslen(TT_EXT);

    for(;;)
    {
        if(*cur == _T('\0'))
            return cur;
        if(*cur == _T('>'))
            return cur + 1; /* step over the closing '>' */
        cur++;
    }
}
/* Skip a notation declaration such as <!NOTATION ...>.
 *
 * szXML must point at the "<!NOTATION" prefix. Scanning stops at the first
 * '>'. Returns the position just past that '>', or a pointer to the string
 * terminator when no '>' is found. */
TCHAR* _XMLSkipNOT(TCHAR* szXML)
{
    TCHAR* cur = szXML + _tcslen(TT_NOT);

    for(;;)
    {
        if(*cur == _T('\0'))
            return cur;
        if(*cur == _T('>'))
            return cur + 1; /* step over the closing '>' */
        cur++;
    }
}
/* Skip an assignment sign and the blanks around it, as in  name = "value".
 *
 * szXML points at or before the '=' (leading blanks are allowed).
 * Returns a pointer to the first non-blank character after '=', or NULL
 * when the next non-blank character is not '=', or when the input ends
 * before or right after the '='.
 *
 * _XMLSkipBlank returns NULL at end of input, so its result is checked
 * before it is dereferenced. */
TCHAR* _XMLSkipAsign(TCHAR* szXML)
{
    TCHAR* token = _XMLSkipBlank(szXML);

    if(token == NULL || *token != ASIGN)
        return NULL;

    return _XMLSkipBlank(token + 1); /* step over '=' and trailing blanks */
}
/* Skip the '<' and the name of a tag header such as <sometag>.
 *
 * szXML must point at '<' (asserted).
 * Returns a pointer to the character that ends the tag name (a blank, '/'
 * or '>'), or NULL when the input ends first or when '>' directly follows
 * the '<' and any blanks after it. */
TCHAR* _XMLSkipTagHeader(TCHAR* szXML)
{
    TCHAR* cur;

    assert(*szXML == _T('<'));

    cur = _XMLSkipBlank(szXML + 1);
    if(cur == NULL)
        return NULL;
    if(*cur == _T('>'))
        return NULL;

    for(; !_IsTagHeadTerm(*cur); cur++)
        ;

    return (*cur == _T('\0')) ? NULL : cur;
}
/* Skip one or more consecutive tag endings and backtrack one tree level for
 * each of them, e.g. </sometag1></sometag2>... or the "/>" of an empty tag.
 *
 * On entry pm->token points at (or at blanks before) '/' or "</".
 * For every ending consumed, pm->parent is replaced by its parent item,
 * pm->act is set to paSibling and pm->token moves past the closing '>'.
 * The routine stops when:
 *  - the input is exhausted: pm->retcode = XP_SUCCESS;
 *  - an ending has no closing '>': pm->retcode = XP_ERROR;
 *  - something other than a tag ending follows: pm->token is left on it
 *    and pm->retcode is not touched. */
void _XMLSkipTagTail(XMLMac* pm)
{
    for(;;)
    {
        TCHAR* cur = _XMLSkipBlank(pm->token);

        if(cur == NULL)
        {
            pm->retcode = XP_SUCCESS;
            return;
        }

        /* anything that is neither "/..." nor "</..." ends the run of tails */
        if(!(*cur == _T('/') || (*cur == _T('<') && *(cur + 1) == _T('/'))))
        {
            pm->token = cur;
            return;
        }

        for(; !_IsTagTailTerm(*cur); cur++)
            ;
        if(*cur == _T('\0'))
        {
            pm->retcode = XP_ERROR;
            return;
        }

        /* backtrack to the parent node and continue after the '>' */
        pm->parent = GetTreeDataParentItem(pm->parent);
        pm->token = cur + 1;
        pm->act = paSibling;
    }
}
/* Split a qualified tag name such as 'xsl:entname' into its namespace part
 * and its local name. Nothing is copied: the outputs point into sz.
 *
 * sz:          start of the tag name; the name ends at the first
 *              tag-header terminator (see _IsTagHeadTerm).
 * ns, nslen:   receive the namespace part and its length; NULL and 0 when
 *              the name contains no ':' separator.
 * ent, entlen: receive the local name and its length. */
void _SplitNameSpace(TCHAR* sz,TCHAR** ns,int* nslen,TCHAR** ent,int* entlen)
{
    TCHAR* cur = sz;
    TCHAR* name;

    *ns = NULL;
    *ent = NULL;
    *nslen = 0;
    *entlen = 0;

    /* advance to the separator or to the end of the name, whichever is first */
    for(; *cur != NSS && !_IsTagHeadTerm(*cur); cur++)
        ;

    if(*cur != NSS)
    {
        /* no namespace: the whole name is the local name */
        *ent = sz;
        *entlen = (int)(cur - sz);
        return;
    }

    *ns = sz;
    *nslen = (int)(cur - sz);

    name = cur + 1; /* step over ':' */
    for(cur = name; !_IsTagHeadTerm(*cur); cur++)
        ;
    *ent = name;
    *entlen = (int)(cur - name);
}
/* Parse the attribute set of the current tag header, e.g.
 *   name1="value1" name2 = "value2" >
 *
 * On entry pm->token points at the first character of the first attribute
 * name. Each name/value pair is handed to WriteTreeDataItemProper on
 * pm->parent as (key, key length, value, value length).
 *
 * On success pm->retcode is XP_SUCCESS and pm->token points at the '>' or
 * '/' that ends the attribute set. pm->retcode is XP_ERROR when the input
 * ends inside the attribute set or when a name is not followed by '='.
 *
 * The attributes are handled in a loop rather than by one recursive call
 * per attribute, and the name scan stops at the string terminator so that
 * truncated input cannot be read past its end. */
void _XMLParseAttr(XMLMac* pm)
{
    for(;;)
    {
        TCHAR* key = pm->token;
        TCHAR* token = key;
        TCHAR* val;
        int keylen,vallen;

        /* the name runs up to a blank or '='; hitting the end of the
         * input here means the attribute is incomplete */
        while(*token != _T('\0') && !_IsBlankSign(*token) && *token != ASIGN)
            token ++;
        if(*token == _T('\0'))
        {
            pm->retcode = XP_ERROR;
            return;
        }
        keylen = (int)(token - key);

        token = _XMLSkipAsign(token);
        if(token == NULL)
        {
            pm->retcode = XP_ERROR;
            return;
        }

        if(*token == QUATE) /* skip the opening quote '"' */
            token ++;
        val = token;
        while(!_IsAttrTerm(*token))
            token ++;
        vallen = (int)(token - val);
        WriteTreeDataItemProper(pm->parent,key,keylen,val,vallen);

        if(*token == QUATE) /* skip the closing quote '"' */
            token ++;
        token = _XMLSkipBlank(token);
        if(token == NULL)
        {
            pm->retcode = XP_ERROR;
            return;
        }

        pm->token = token;

        /* '>' or '/' ends the attribute set of this tag */
        if(*token == _T('>') || *token == _T('/'))
        {
            pm->retcode = XP_SUCCESS;
            return;
        }

        /* otherwise another attribute follows */
        pm->retcode = XP_CONTINUE;
    }
}
/* Parse the text of the current node, e.g. the "sometext" in
 * <sometag>sometext</sometag>.
 *
 * Leading blanks are skipped; the text then runs up to the next '<' and is
 * written (possibly with length 0) on pm->parent under the NODETEXT key.
 * On success pm->token points at that '<' and pm->retcode is XP_SUCCESS.
 * pm->retcode is XP_ERROR when the input ends before a '<' is found. */
void _XMLParseTagText(XMLMac* pm)
{
    TCHAR* first = _XMLSkipBlank(pm->token);
    TCHAR* last;

    if(first == NULL)
    {
        pm->retcode = XP_ERROR;
        return;
    }

    for(last = first; !_IsTextTerm(*last); last++)
        ;
    if(*last == _T('\0'))
    {
        pm->retcode = XP_ERROR;
        return;
    }

    WriteTreeDataItemProper(pm->parent,NODETEXT,-1,first,(int)(last - first));
    pm->token = last;
    pm->retcode = XP_SUCCESS;
}
/* Parse the node that starts at pm->token, then its children and the
 * siblings that follow, until the input is exhausted or an error occurs.
 *
 * For every element a tree item is inserted as the last child of
 * pm->parent; its name (and namespace prefix, if any) is written under
 * NODENAME (and XMLNS), its attributes are parsed by _XMLParseAttr and its
 * text by _XMLParseTagText. XML declarations, comments, CDATA sections and
 * DTD declarations are skipped without being stored.
 *
 * Result in pm->retcode:
 *  XP_SUCCESS - the input ended with every opened node closed;
 *  XP_ERROR   - malformed input (unknown markup, missing closing tag, stray
 *               closing tag, truncated header, attribute or text).
 *
 * Changes compared with the recursive formulation:
 *  - siblings, children and skipped constructs are handled by one loop, so
 *    stack use no longer grows with the number of nodes in the document;
 *  - a closing tag met where a node is expected (for example right after a
 *    skipped comment, as in <a><!--c--></a>) closes the current node
 *    instead of being inserted as a new node with an empty name;
 *  - the unused local 'len' is gone. */
void _XMLParseTagEntity(XMLMac* pm)
{
    TCHAR* token;
    TCHAR* tag;
    TCHAR* ns;
    TCHAR* ent;
    int nslen,entlen;
    int tt;
    LINKPTR item;

    for(;;)
    {
        token = _XMLSkipBlank(pm->token);
        if(token == NULL)
        {
            if(pm->parent == NULL)
                pm->retcode = XP_SUCCESS; /* nothing more to parse */
            else
                pm->retcode = XP_ERROR; /* a closing tag is missing */
            return;
        }

        tt = _TagType(token);
        if(tt < 0)
        {
            pm->retcode = XP_ERROR; /* invalid node header */
            return;
        }

        /* constructs that are skipped rather than stored in the tree */
        switch(tt)
        {
        case ttXML:
            pm->token = _XMLSkipXML(token);
            continue;
        case ttCMT:
            pm->token = _XMLSkipCMT(token);
            continue;
        case ttELE:
            pm->token = _XMLSkipELE(token);
            continue;
        case ttCDA:
            pm->token = _XMLSkipCDA(token);
            continue;
        case ttDOC:
            pm->token = _XMLSkipDOC(token);
            continue;
        case ttEXT:
            pm->token = _XMLSkipEXT(token);
            continue;
        case ttNOT:
            pm->token = _XMLSkipNOT(token);
            continue;
        default:
            break;
        }

        /* a closing tag at this position ends the current parent node */
        if(*(token + 1) == _T('/'))
        {
            if(pm->parent == NULL)
            {
                pm->retcode = XP_ERROR; /* closing tag without an open node */
                return;
            }
            pm->token = token;
            pm->retcode = XP_CONTINUE;
            _XMLSkipTagTail(pm); /* close the node and backtrack */
            if(pm->retcode == XP_ERROR)
                return;
            continue; /* go on with the next sibling */
        }

        /* start of an element */
        tag = token + 1; /* skip '<' */
        token = _XMLSkipTagHeader(token);
        if(token == NULL)
        {
            pm->retcode = XP_ERROR; /* invalid node header */
            return;
        }

        item = InsertTreeDataItem(pm->tree,pm->parent,LINK_LAST);

        /* store the node name and, when present, its namespace prefix */
        _SplitNameSpace(tag,&ns,&nslen,&ent,&entlen);
        WriteTreeDataItemProper(item,NODENAME,-1,ent,entlen);
        if(nslen != 0)
            WriteTreeDataItemProper(item,XMLNS,-1,ns,nslen);

        /* the new node becomes the current parent */
        pm->parent = item;

        token = _XMLSkipBlank(token);
        if(token == NULL)
        {
            pm->retcode = XP_ERROR;
            return;
        }

        if(*token != _T('>'))
        {
            if(*token != _T('/')) /* the header carries attributes */
            {
                pm->act = paAttr;
                pm->token = token;
                _XMLParseAttr(pm);
                if(pm->retcode == XP_ERROR)
                    return;
                token = pm->token; /* now at '>' or '/' */
            }

            if(*token == _T('/')) /* empty node: <name ... /> */
            {
                pm->token = token;
                _XMLSkipTagTail(pm); /* close the empty node and backtrack */
                if(pm->retcode == XP_ERROR)
                    return;
                continue; /* go on with the next sibling */
            }
        }
        token ++; /* skip '>' */

        /* node text */
        pm->token = token;
        pm->retcode = XP_CONTINUE;
        _XMLParseTagText(pm);
        if(pm->retcode == XP_ERROR)
            return;

        token = pm->token;
        assert(*token == _T('<'));

        if(*(token + 1) != _T('/')) /* the node has children */
        {
            pm->act = paChild;
            pm->retcode = XP_CONTINUE;
            continue; /* parse the first child */
        }

        /* no children: close the node and backtrack */
        _XMLSkipTagTail(pm);
        if(pm->retcode == XP_ERROR)
            return;
        /* the loop continues with the next sibling */
    }
}
4、以上的实现只是沧海一粟,更多的功能有待进一步去实现,让我们共同努力吧!我的MAIL:jdhot2003@hotmail.com