视频
核心
- 遍历字符流,遇到不能识别的字符而结束本次识别时,回退一个字符,让它能被继续识别
- 重点:有限状态机三段式:(1)定义状态(2)根据现态和当前字符计算次态(3)更新现态 (和数字逻辑里的时序电路FSM设计差不多)
- 识别单词的时候分为:1-标识符关键字,2-整数,3-复合运算符,4-单独字符
- 类别码从0(SYM_NULL)开始依次递增(0,1,2,3...),用enum在头文件里定义。
- 单独字符的类别码统一定义在ssym数组里(与csym数组按下标一一对应),关键字的类别码统一定义在wsym数组里(与word数组按下标一一对应)。因为它们是一一对应的,用数组比较简洁。
- 报错做得很简陋,几乎没有
- 识别到token就直接输出了,万事从简(懒)...理解思想和方法就可再改进😃
- C 库函数 int isalpha( int c ):判断字符是否是字母,int isdigit ( int c ):判断字符是否是数字。当然也可以直接用ASCII码。
- C 库函数 int strcmp(const char *str1, const char *str2) 把 str1 所指向的字符串和 str2 所指向的字符串进行比较。
- 对于细节要耐心,注释和空格回车等字符的处理,获取下一个字符的时机🥰
图里没有画注释状态,因为最后才发现把注释识别放在自动机里特别好用。
my2.c
#include "my2.h"
// Print the message registered for error code n on stdout.
//
// n is an index into err_msg. A code outside the table is reported as an
// unknown error code instead of indexing past the end of the array.
void error(int n)
{
    int count = (int)(sizeof err_msg / sizeof err_msg[0]);

    if (n < 0 || n >= count)
    {
        printf("Error %3d: unknown error code\n", n);
        return;
    }
    printf("Error %3d: %s\n", n, err_msg[n]);
}
void lexer(FILE *fp)
{
int num = 0; //当前识别中的数字
int k = 0; //当前识别中的数字的长度
char a[MAXIDLEN + 1]; //当前识别中的标识符or关键字
int a_index = 0; //当前识别中的标识符or关键字的下标
ch = fgetc(fp); //获取文件第一个字符
while (ch != EOF)
{
switch (currState)
{
case START:
if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n')
{ //不能在switch外面忽略这些字符,在这里,他们不是无效的,他们可以表示标识符的结束等
}
else if (ch == '{')
{ //注释{}
currState = COMMENT;
}
else if (isdigit(ch))
{
currState = INNUM;
num = num * 10 + ch - '0';
k++;
}
else if (isalpha(ch))
{
currState = INID;
if (a_index > MAXIDLEN)
{
error(26);
exit(1);
}
a[a_index] = ch;
a_index++;
}
else if (ch == ':')
currState = INBECOMES;
else if (ch == '>')
currState = GTR;
else if (ch == '<')
currState = LES;
else
{ //单独字符直接识别
currState = START;
int i = 1;
for (; i <= NSYM; i++)
{
if (ch == csym[i])
break;
}
if (i <= NSYM)
{
sym = ssym[i];
printf("(%d,%c)\n", sym, ch);
}
else
{
error(0);
// exit(1);
printf("the char is ---%c---\n", ch);
}
}
break;
case INNUM:
if (isdigit(ch))
{
num = num * 10 + ch - '0';
}
else
{ //token识别完毕
currState = START;
ch = ungetc(ch, fp); // 回退该字符,重新识别
sym = SYM_NUMBER;
if (k > MAXNUMLEN)
error(25);
else
{
printf("(%d,%d)\n", sym, num);
}
k = 0;
num = 0;
}
break;
case COMMENT:
if (ch == '}')
{ // 注释结束
currState = START;
}
break;
case INID:
if (isalpha(ch) || isdigit(ch))
{
if (a_index > MAXIDLEN)
{
error(26);
exit(1);
}
a[a_index] = ch;
a_index++;
}
else
{ //token识别完毕
currState = START;
ch = ungetc(ch, fp); // 回退该字符,重新识别
a[a_index] = '\0'; // 字符数组和字符串的区别就是结尾少了\0,一定要加上!
// 检查是否为关键字
int i = 1;
for (; i <= NRW; i++)
{
if (strcmp(a, word[i]) == 0)
break;
}
if (i <= NRW)
{
sym = wsym[i]; // symbol is a reserved word
}
else
{
sym = SYM_IDENTIFIER; // symbol is an identifier
}
printf("(%d,%s)\n", sym, a);
a_index = 0;
}
break;
case INBECOMES:
if (ch == '=')
{
currState = BECOMES;
}
else
{
currState = START;
ch = ungetc(ch, fp); // 回退该字符,重新识别
sym = SYM_NULL;
}
break;
case GTR:
if (ch == '=')
{
currState = GEQ;
}
else
{ //token识别完毕
currState = START;
ch = ungetc(ch, fp); // 回退该字符,重新识别
sym = SYM_GTR;
printf("(%d,>)\n", sym);
}
break;
case LES:
if (ch == '=')
{
currState = LEQ;
}
else
{ //token识别完毕
currState = START;
ch = ungetc(ch, fp); // 回退该字符,重新识别
sym = SYM_LES;
printf("(%d,<)\n", sym);
}
break;
case BECOMES: //token识别完毕
currState = START;
ch = ungetc(ch, fp); // 回退该字符,重新识别
sym = SYM_BECOMES;
printf("(%d,:=)\n", sym);
break;
case GEQ: //token识别完毕
currState = START;
ch = ungetc(ch, fp); // 回退该字符,重新识别
sym = SYM_GEQ;
printf("%d,>=\n", sym);
break;
case LEQ: //token识别完毕
currState = START;
ch = ungetc(ch, fp); // 回退该字符,重新识别
sym = SYM_LEQ;
printf("%d,<=\n", sym);
break;
}
//在最后获取下一个字符
ch = fgetc(fp);
}
printf("—————文件读取结束—————");
}
// Open source.txt, run the lexer over it and close the file again.
// Returns EXIT_FAILURE when the file cannot be opened, 0 otherwise.
int main(void)
{
    FILE *fp = fopen("source.txt", "r");

    if (!fp)
    {
        printf("文件不存在");
        return EXIT_FAILURE;
    }

    lexer(fp);
    fclose(fp);
    return 0;
}
my2.h
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
// Table sizes and lexical limits. The word/wsym and csym/ssym tables are
// declared with one extra element because index 0 is a placeholder.
#define NRW 11 // number of reserved words (entries 1..NRW of word and wsym)
#define MAXNUMLEN 14 // maximum number of digits in numbers
#define NSYM 10 // number of single-character symbols (entries 1..NSYM of csym and ssym)
#define MAXIDLEN 10 // length of identifiers; the scan buffer is MAXIDLEN + 1 chars
int ch; // last character read; int rather than char so that every byte value returned by fgetc stays distinct from EOF
int sym; // last symbol read
// Compared with the code of the previous article, this header adds the DFA
// states (enum state) and the current state (currState).
enum state
{
    START,     // between tokens; the next character selects the token kind
    INNUM,     // inside a number
    INID,      // inside an identifier or keyword
    INBECOMES, // ':' has been read
    BECOMES,   // ":=" has been read
    GTR,       // '>' has been read
    GEQ,       // ">=" has been read
    NEQ,       // not entered by the lexer in this file
    LES,       // '<' has been read
    LEQ,       // "<=" has been read
    END,       // not entered by the lexer in this file
    COMMENT    // inside a { ... } comment
};

int currState = START; // current state of the automaton
// Single-character symbols; csym[i] is the character whose category code is
// ssym[i]. Index 0 is a placeholder.
char csym[NSYM + 1] = {
    ' ', /* 0: placeholder */
    '+', /* 1 */
    '-', /* 2 */
    '*', /* 3 */
    '/', /* 4 */
    '(', /* 5 */
    ')', /* 6 */
    '=', /* 7 */
    ',', /* 8 */
    '.', /* 9 */
    ';'  /* 10 */
};
// Reserved words; word[i] is the spelling whose category code is wsym[i].
char *word[NRW + 1] = {
    "",          /* 0: place holder */
    "begin",     /* 1 */
    "call",      /* 2 */
    "const",     /* 3 */
    "do",        /* 4 */
    "end",       /* 5 */
    "if",        /* 6 */
    "odd",       /* 7 */
    "procedure", /* 8 */
    "then",      /* 9 */
    "var",       /* 10 */
    "while"      /* 11 */
};
// Token category codes, numbered consecutively from 0.
enum symtype
{
    SYM_NULL,       /* 0 */
    SYM_IDENTIFIER, /* 1 */
    SYM_NUMBER,     /* 2 */
    SYM_PLUS,       /* 3 */
    SYM_MINUS,      /* 4 */
    SYM_TIMES,      /* 5 */
    SYM_SLASH,      /* 6 */
    SYM_ODD,        /* 7 */
    SYM_EQU,        /* 8 */
    SYM_NEQ,        /* 9 */
    SYM_LES,        /* 10 */
    SYM_LEQ,        /* 11 */
    SYM_GTR,        /* 12 */
    SYM_GEQ,        /* 13 */
    SYM_LPAREN,     /* 14 */
    SYM_RPAREN,     /* 15 */
    SYM_COMMA,      /* 16 */
    SYM_SEMICOLON,  /* 17 */
    SYM_PERIOD,     /* 18 */
    SYM_BECOMES,    /* 19 */
    SYM_BEGIN,      /* 20 */
    SYM_END,        /* 21 */
    SYM_IF,         /* 22 */
    SYM_THEN,       /* 23 */
    SYM_WHILE,      /* 24 */
    SYM_DO,         /* 25 */
    SYM_CALL,       /* 26 */
    SYM_CONST,      /* 27 */
    SYM_VAR,        /* 28 */
    SYM_PROCEDURE   /* 29 */
};
// Category code of each reserved word, in the same order as the word table.
int wsym[NRW + 1] = {
    SYM_NULL,      /* 0: placeholder */
    SYM_BEGIN,     /* 1: begin */
    SYM_CALL,      /* 2: call */
    SYM_CONST,     /* 3: const */
    SYM_DO,        /* 4: do */
    SYM_END,       /* 5: end */
    SYM_IF,        /* 6: if */
    SYM_ODD,       /* 7: odd */
    SYM_PROCEDURE, /* 8: procedure */
    SYM_THEN,      /* 9: then */
    SYM_VAR,       /* 10: var */
    SYM_WHILE      /* 11: while */
};
// Category code of each single-character symbol, in the same order as the
// csym table.
int ssym[NSYM + 1] = {
    SYM_NULL,     /* 0: placeholder */
    SYM_PLUS,     /* 1: + */
    SYM_MINUS,    /* 2: - */
    SYM_TIMES,    /* 3: * */
    SYM_SLASH,    /* 4: / */
    SYM_LPAREN,   /* 5: ( */
    SYM_RPAREN,   /* 6: ) */
    SYM_EQU,      /* 7: = */
    SYM_COMMA,    /* 8: , */
    SYM_PERIOD,   /* 9: . */
    SYM_SEMICOLON /* 10: ; */
};
// Error messages, indexed by error code. Compared with the previous article,
// entries 0 and 26 were added. Codes 27..31 are empty strings.
char *err_msg[] = {
    [0] = "Fatal Error:Unknown character.\n",
    [1] = "Found ':=' when expecting '='.",
    [2] = "There must be a number to follow '='.",
    [3] = "There must be an '=' to follow the identifier.",
    [4] = "There must be an identifier to follow 'const', 'var', or 'procedure'.",
    [5] = "Missing ',' or ';'.",
    [6] = "Incorrect procedure name.",
    [7] = "Statement expected.",
    [8] = "Follow the statement is an incorrect symbol.",
    [9] = "'.' expected.",
    [10] = "';' expected.",
    [11] = "Undeclared identifier.",
    [12] = "Illegal assignment.",
    [13] = "':=' expected.",
    [14] = "There must be an identifier to follow the 'call'.",
    [15] = "A constant or variable can not be called.",
    [16] = "'then' expected.",
    [17] = "';' or 'end' expected.",
    [18] = "'do' expected.",
    [19] = "Incorrect symbol.",
    [20] = "Relative operators expected.",
    [21] = "Procedure identifier can not be in an expression.",
    [22] = "Missing ')'.",
    [23] = "The symbol can not be followed by a factor.",
    [24] = "The symbol can not be as the beginning of an expression.",
    [25] = "The number is too great.",
    [26] = "The identifier is too long",
    [27] = "",
    [28] = "",
    [29] = "",
    [30] = "",
    [31] = "",
    [32] = "There are too many levels."
};
source.txt
const a=10; {constant declaration}
const b=20;
var c; {variable declaration}
procedure p; {procedure declaration}
begin
c:=b+a
end;
begin
call p
end.
感想
断断续续折腾了两周,参考了好多文章,走了山路十八弯😡
看到过一篇把转换表做成二维数组的,字符数*状态数,太多了,就没用,本质和switch是一样的!
DFA太好用了!!!😭 值得!!!
现在也许还有bug,欢迎交流指正~