这学期的编译原理课语法分析已经快讲完了。整个编译器的前端主要就包括了词法分析和语法分析两个部分。现在自己开始着手用C语言写编译器的前端。以下是程序的说明。
1)该程序是词法分析(Lexical Analysis)部分。
2)只支持无符号整数类型。不支持数组和指针等高级特性。
3)不支持函数。注释为C++式的“//”。
4)使用VC++.NET编译。
//////////////////////////////Lexical.h////////////////////////////////////
#ifndef _LEXICAL_H_
#define _LEXICAL_H_
// States of the lexer's finite state machine. The same values are stored in
// LexToken.type, so they also serve as token type codes.
typedef enum {
    START      = 0,   // initial state of every scan
    UNKNOWN    = 0,   // deliberately shares the value 0 with START
    COMMENT    = 1,
    NUMBER     = 2,
    IDENTI     = 3,
    IF         = 4,
    ELSE       = 5,
    END        = 6,
    REPEAT     = 7,
    UNTIL      = 8,
    ASSIGN     = 9,
    EQUAL      = 10,
    PLUS       = 11,
    MINUS      = 12,
    MULTI      = 13,
    DIVIDE     = 14,
    LESS       = 15,
    GREATER    = 16,
    LPAREN     = 17,
    RPAREN     = 18,
    SEMI       = 19,
    EXPRESSION = 20
} LexTokenType;
// One edge of the state machine: while in state `beg`, a character inside the
// inclusive range [Domainbeg, Domainend] moves the machine to state `end`.
typedef struct {
    LexTokenType beg;        // state the edge leaves from
    char         Domainbeg;  // lowest character accepted by the edge
    char         Domainend;  // highest character accepted by the edge
    LexTokenType end;        // state the edge leads to
} TransState;
// Describes one reserved word: its spelling and the token type it maps to.
typedef struct {
    char         strExp[10];  // keyword text, NUL-terminated (at most 9 characters)
    LexTokenType type;        // token type reported for this keyword
} TokenDes;
// A token produced by the lexer.
typedef struct {
    char         *strName;  // text of the token; may be NULL
    LexTokenType  type;     // kind of token
} LexToken;
// Scans pSource (a NUL-terminated text) and returns the next token, or NULL
// once no further token can be produced (end of the text).
// The scan position is remembered between calls, so calling the function
// repeatedly with the same buffer walks through the text token by token.
// The returned token and its strName (when not NULL) are allocated with
// malloc.
LexToken* GetNextToken(char *pSource);
#endif//_LEXICAL_H_
//////////////////////////////Lexical.c////////////////////////////////////
#include <stdio.h>
#include <malloc.h>
#include <string.h>
#include "lexical.h"
// Transition table of the finite state machine.
// Each row reads {current state, first char, last char, next state}; the
// character range is inclusive on both ends.
TransState trans[] = {
    // a number keeps absorbing digits
    { NUMBER,  '0',      '9',      NUMBER  },

    // an identifier keeps absorbing letters and digits
    { IDENTI,  'A',      'Z',      IDENTI  },
    { IDENTI,  'a',      'z',      IDENTI  },
    { IDENTI,  '0',      '9',      IDENTI  },

    // whitespace between tokens is skipped
    { START,   ' ',      ' ',      START   },
    { START,   '\t',     '\t',     START   },
    { START,   '\n',     '\n',     START   },
    { START,   '\r',     '\r',     START   },

    // first character of a number or an identifier
    { START,   '0',      '9',      NUMBER  },
    { START,   'A',      'Z',      IDENTI  },
    { START,   'a',      'z',      IDENTI  },

    // operators and punctuation
    { START,   '+',      '+',      PLUS    },
    { START,   '-',      '-',      MINUS   },
    { START,   '*',      '*',      MULTI   },
    { START,   '/',      '/',      DIVIDE  },
    { START,   '<',      '<',      LESS    },
    { START,   '>',      '>',      GREATER },
    { START,   '(',      '(',      LPAREN  },
    { START,   ')',      ')',      RPAREN  },
    { START,   ';',      ';',      SEMI    },
    { START,   '=',      '=',      ASSIGN  },

    // "//" opens a comment; it runs up to the next '\n' and accepts every
    // other character except '\0'
    { DIVIDE,  '/',      '/',      COMMENT },
    { COMMENT, '\n',     '\n',     START   },
    { COMMENT, '\n' + 1, 127,      COMMENT },
    { COMMENT, 1,        '\n' - 1, COMMENT },
    { COMMENT, -128,     -1,       COMMENT },

    // "==" following a single '='
    { ASSIGN,  '=',      '=',      EQUAL   },
};
// Reserved words: an identifier whose text equals one of these spellings is
// reported with the associated token type instead of IDENTI.
TokenDes reserve[] = {
    { "if",     IF     },
    { "else",   ELSE   },
    { "end",    END    },
    { "repeat", REPEAT },
    { "until",  UNTIL  },
};
LexToken* GetNextToken(char *pSource)
{//考虑速度,不检测file合法性
static int LineNO = 0;
static int PosNow = 0;
static int SizeStateTrans = sizeof(trans)/sizeof(TransState);
static int SizeReserve = sizeof(reserve)/sizeof(TokenDes);
static int SizeToken = sizeof(LexToken);
LexToken *ptoken = NULL;
LexTokenType CurState = START;
int PosStart = PosNow;
int i,j,tokenlen=0;
char ch;
ptoken = (LexToken*)malloc(SizeToken);//省略错误检查
while (1)
{
ch = pSource[PosNow];
if (ch == '\n')//如果碰到回车字符,行号加1
LineNO++;
for (i=0; i<SizeStateTrans; i++)
{
if ((CurState==trans[i].beg) && (ch>=trans[i].Domainbeg) && (ch<=trans[i].Domainend))//满足该状态转换
{
CurState = trans[i].end;//转换到该状态
break;//跳出for循环,准备察看下一个字符
}
}
if (i == SizeStateTrans)//未找到合适的状态转换
{
if (CurState == START)//如果开始于START状态
{
if (ch == '\0')//遇到文件尾
{
free(ptoken);
return NULL;
}
ptoken->type = UNKNOWN;//认定为:遇到未知字符
ptoken->strName = NULL;
return ptoken;
}
else//找到一个新的记号
{
if (CurState == COMMENT)//这种情况出现在代码以注释结束时
{
free(ptoken);
return NULL;
}
tokenlen = PosNow - PosStart;
ptoken->strName = (char*)malloc(tokenlen+1);
strncpy(ptoken->strName, pSource+PosStart, tokenlen);
ptoken->strName[tokenlen] = '\0';
ptoken->type = CurState;
if (ptoken->type == IDENTI)//如果该记号被认为是标识符
{
for (j=0; j<SizeReserve; j++)
{
if (!strcmp(ptoken->strName, reserve[j].strExp))//如果在关键字中找到该标识符
{
ptoken->type = reserve[j].type;
break;
}
}
}
return ptoken;
}
}
else//转换了状态
{
PosNow++;//消耗掉该字符
if (CurState == START)//如果又转换到了START状态
PosStart = PosNow;
}
}
}
///////////////////////////////////main.c////////////////////////////////////////
#include <windows.h>
#include <stdio.h>
#include <stdlib.h>
#include "Lexical.h"
int main ()
{
HANDLE file = CreateFile("source.txt",GENERIC_READ,0,NULL,OPEN_EXISTING,0,NULL);
long size = GetFileSize(file,NULL);
char *pBuffer = (char*)malloc(size+1);
long sizenow;
LexToken* ptoken;
ReadFile(file,pBuffer,size,&sizenow,NULL);
pBuffer[size] = '\0';
while (ptoken=GetNextToken(pBuffer))
{
printf("%10s, %d\n",ptoken->strName,ptoken->type);
}
return 0;
}
//////////////以下是测试文本(ANSI格式)///////////////////
//用来测试程序,这里是注释
if today is monday
and 1+1=23
2123/4==6
///////////////////以下是输出(节点名称和节点类型编号)/////////////////////////
if, 4
today, 3
is, 3
monday, 3
and, 3
1, 2
+, 11
1, 2
=, 9
23, 2
2123, 2
/, 14
4, 2
==, 10
6, 2