【编译原理】PL/0编译程序之词法分析 | DFA | C语言实现

视频

核心

遍历字符流，遇到不能识别的字符而结束本次识别时，回退一个字符，让它能被继续识别
重点：有限状态机三段式：（1）定义状态（2）根据现态和当前字符计算次态（3）更新现态 （和数字逻辑里的时序电路FSM设计差不多）
识别单词的时候分为：1-标识符关键字,2-整数,3-复合运算符,4-单独字符
类别码是整数（SYM_NULL为0，其余从1开始依次递增），用enum在头文件里定义了。
单独字符的类别码统一定义在ssym数组里了，关键字的类别码统一定义在wsym数组里了。因为它们是一一对应的，用数组比较简洁。
报错做的很简陋，几乎没有
识别到token就直接输出了，万事从简（懒）...理解思想和方法就可再改进😃
C 库函数 int isalpha( int c )：判断字符是否是字母，int isdigit ( int c )：判断字符是否是数字。当然也可以直接用ASCII码。
C 库函数 int strcmp(const char *str1, const char *str2) 把 str1 所指向的字符串和 str2 所指向的字符串进行比较。
对于细节要耐心，注释和空格回车等字符的处理，获取下一个字符的时机🥰 图里没有画注释了，因为最后才发现把注释识别放在自动机里特别好用。

my2.c

#include "my2.h"

/*
 * Print the diagnostic registered under error code n to stdout.
 *
 * n is an index into the err_msg table. A code that lies outside the table
 * is reported as unknown instead of being used as an array index.
 */
void error(int n)
{
    int count = (int)(sizeof err_msg / sizeof err_msg[0]);

    if (n < 0 || n >= count)
    {
        printf("Error %3d: (unknown error code)\n", n);
        return;
    }
    printf("Error %3d: %s\n", n, err_msg[n]);
}

void lexer(FILE *fp)
{
    int num = 0;          //当前识别中的数字
    int k = 0;            //当前识别中的数字的长度
    char a[MAXIDLEN + 1]; //当前识别中的标识符or关键字
    int a_index = 0;      //当前识别中的标识符or关键字的下标

    ch = fgetc(fp); //获取文件第一个字符

    while (ch != EOF)
    {
        switch (currState)
        {
        case START:
            if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n')
            { //不能在switch外面忽略这些字符，在这里，他们不是无效的，他们可以表示标识符的结束等
            }
            else if (ch == '{')
            { //注释{}
                currState = COMMENT;
            }
            else if (isdigit(ch))
            {
                currState = INNUM;
                num = num * 10 + ch - '0';
                k++;
            }
            else if (isalpha(ch))
            {
                currState = INID;
                if (a_index > MAXIDLEN)
                {
                    error(26);
                    exit(1);
                }
                a[a_index] = ch;
                a_index++;
            }
            else if (ch == ':')
                currState = INBECOMES;
            else if (ch == '>')
                currState = GTR;
            else if (ch == '<')
                currState = LES;
            else
            { //单独字符直接识别
                currState = START;
                int i = 1;
                for (; i <= NSYM; i++)
                {
                    if (ch == csym[i])
                        break;
                }
                if (i <= NSYM)
                {
                    sym = ssym[i];
                    printf("(%d,%c)\n", sym, ch);
                }
                else
                {
                    error(0);
                    // exit(1);
                    printf("the char is ---%c---\n", ch);
                }
            }
            break;
        case INNUM:
            if (isdigit(ch))
            {
                num = num * 10 + ch - '0';
            }
            else
            { //token识别完毕
                currState = START;
                ch = ungetc(ch, fp); // 回退该字符，重新识别
                sym = SYM_NUMBER;
                if (k > MAXNUMLEN)
                    error(25);
                else
                {
                    printf("(%d,%d)\n", sym, num);
                }
                k = 0;
                num = 0;
            }
            break;
        case COMMENT:
            if (ch == '}')
            { // 注释结束
                currState = START;
            }
            break;
        case INID:
            if (isalpha(ch) || isdigit(ch))
            {
                if (a_index > MAXIDLEN)
                {
                    error(26);
                    exit(1);
                }
                a[a_index] = ch;
                a_index++;
            }
            else
            { //token识别完毕
                currState = START;
                ch = ungetc(ch, fp); // 回退该字符，重新识别
                a[a_index] = '\0';   // 字符数组和字符串的区别就是结尾少了\0，一定要加上！
                // 检查是否为关键字
                int i = 1;
                for (; i <= NRW; i++)
                {
                    if (strcmp(a, word[i]) == 0)
                        break;
                }
                if (i <= NRW)
                {
                    sym = wsym[i]; // symbol is a reserved word
                }
                else
                {
                    sym = SYM_IDENTIFIER; // symbol is an identifier
                }
                printf("(%d,%s)\n", sym, a);
                a_index = 0;
            }
            break;
        case INBECOMES:
            if (ch == '=')
            {
                currState = BECOMES;
            }
            else
            {
                currState = START;
                ch = ungetc(ch, fp); // 回退该字符，重新识别
                sym = SYM_NULL;
            }
            break;
        case GTR:
            if (ch == '=')
            {
                currState = GEQ;
            }
            else
            { //token识别完毕
                currState = START;
                ch = ungetc(ch, fp); // 回退该字符，重新识别
                sym = SYM_GTR;
                printf("(%d,>)\n", sym);
            }
            break;
        case LES:
            if (ch == '=')
            {
                currState = LEQ;
            }
            else
            { //token识别完毕
                currState = START;
                ch = ungetc(ch, fp); // 回退该字符，重新识别
                sym = SYM_LES;
                printf("(%d,<)\n", sym);
            }
            break;
        case BECOMES: //token识别完毕
            currState = START;
            ch = ungetc(ch, fp); // 回退该字符，重新识别
            sym = SYM_BECOMES;
            printf("(%d,:=)\n", sym);
            break;
        case GEQ: //token识别完毕
            currState = START;
            ch = ungetc(ch, fp); // 回退该字符，重新识别
            sym = SYM_GEQ;
            printf("%d,>=\n", sym);
            break;
        case LEQ: //token识别完毕
            currState = START;
            ch = ungetc(ch, fp); // 回退该字符，重新识别
            sym = SYM_LEQ;
            printf("%d,<=\n", sym);
            break;
        }

        //在最后获取下一个字符
        ch = fgetc(fp);
    }
    printf("—————文件读取结束—————");
}

/*
 * Entry point: run the lexer over "source.txt" in the current directory.
 *
 * Returns EXIT_SUCCESS once the file has been scanned, or EXIT_FAILURE when
 * the file cannot be opened.
 */
int main(void)
{
    // Open the file to be analysed.
    FILE *fp = fopen("source.txt", "r");
    if (!fp)
    {
        printf("文件不存在");
        return EXIT_FAILURE;
    }

    // Feed the file to the lexer, then release the stream.
    lexer(fp);
    fclose(fp);
    return EXIT_SUCCESS;
}

my2.h

#include <ctype.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NRW        11     // number of reserved words (entries 1..NRW of word and wsym)
#define MAXNUMLEN  14     // maximum number of digits in numbers
#define NSYM       10     // number of single-character symbols (entries 1..NSYM of csym and ssym)
#define MAXIDLEN   10     // maximum length of identifiers

// Last character read. Declared int rather than char so that the value
// returned by fgetc() -- including EOF -- is stored and compared unchanged.
int  ch;
int  sym;        // class code of the last symbol read

// Compared with the code of the previous article, this header adds the DFA
// states (enum state) and the current state (currState).
enum state {
	START,      // between tokens; also the initial state
	INNUM,      // inside an integer literal
	INID,       // inside an identifier or keyword
	INBECOMES,  // ':' seen
	BECOMES,    // ":=" seen
	GTR,        // '>' seen
	GEQ,        // ">=" seen
	NEQ,        // declared, but not entered by lexer() in my2.c
	LES,        // '<' seen
	LEQ,        // "<=" seen
	END,        // declared, but not entered by lexer() in my2.c
	COMMENT     // inside a { ... } comment
};
int currState = START;        // current state of the DFA

// Single-character symbols. csym[i] is the character whose class code is
// ssym[i]; slot 0 is a placeholder and is not searched by lexer().
char csym[NSYM + 1] = {
	[0]  = ' ',
	[1]  = '+',
	[2]  = '-',
	[3]  = '*',
	[4]  = '/',
	[5]  = '(',
	[6]  = ')',
	[7]  = '=',
	[8]  = ',',
	[9]  = '.',
	[10] = ';'
};

// Reserved words. word[i] is the spelling of the keyword whose class code is
// wsym[i].
char* word[NRW + 1] = {
	[0]  = "", /* place holder */
	[1]  = "begin",
	[2]  = "call",
	[3]  = "const",
	[4]  = "do",
	[5]  = "end",
	[6]  = "if",
	[7]  = "odd",
	[8]  = "procedure",
	[9]  = "then",
	[10] = "var",
	[11] = "while"
};
// Token class codes; the enumerators take the consecutive values 0..29.
enum symtype {
	SYM_NULL,
	SYM_IDENTIFIER,
	SYM_NUMBER,
	SYM_PLUS,
	SYM_MINUS,
	SYM_TIMES,
	SYM_SLASH,
	SYM_ODD,
	SYM_EQU,
	SYM_NEQ,
	SYM_LES,
	SYM_LEQ,
	SYM_GTR,
	SYM_GEQ,
	SYM_LPAREN,
	SYM_RPAREN,
	SYM_COMMA,
	SYM_SEMICOLON,
	SYM_PERIOD,
	SYM_BECOMES,
	SYM_BEGIN,
	SYM_END,
	SYM_IF,
	SYM_THEN,
	SYM_WHILE,
	SYM_DO,
	SYM_CALL,
	SYM_CONST,
	SYM_VAR,
	SYM_PROCEDURE
};
// Class codes of the reserved words; wsym[i] belongs to word[i].
int wsym[NRW + 1] = {
	[0]  = SYM_NULL,
	[1]  = SYM_BEGIN,
	[2]  = SYM_CALL,
	[3]  = SYM_CONST,
	[4]  = SYM_DO,
	[5]  = SYM_END,
	[6]  = SYM_IF,
	[7]  = SYM_ODD,
	[8]  = SYM_PROCEDURE,
	[9]  = SYM_THEN,
	[10] = SYM_VAR,
	[11] = SYM_WHILE
};
// Class codes of the single-character symbols; ssym[i] belongs to csym[i].
int ssym[NSYM + 1] = {
	[0]  = SYM_NULL,
	[1]  = SYM_PLUS,
	[2]  = SYM_MINUS,
	[3]  = SYM_TIMES,
	[4]  = SYM_SLASH,
	[5]  = SYM_LPAREN,
	[6]  = SYM_RPAREN,
	[7]  = SYM_EQU,
	[8]  = SYM_COMMA,
	[9]  = SYM_PERIOD,
	[10] = SYM_SEMICOLON
};
                                         
// Error messages, indexed by error code (compared with the previous article,
// entries 0 and 26 were added ad hoc). Codes 27..31 are empty strings.
char* err_msg[] =
{
    [0]  = "Fatal Error:Unknown character.\n",
    [1]  = "Found ':=' when expecting '='.",
    [2]  = "There must be a number to follow '='.",
    [3]  = "There must be an '=' to follow the identifier.",
    [4]  = "There must be an identifier to follow 'const', 'var', or 'procedure'.",
    [5]  = "Missing ',' or ';'.",
    [6]  = "Incorrect procedure name.",
    [7]  = "Statement expected.",
    [8]  = "Follow the statement is an incorrect symbol.",
    [9]  = "'.' expected.",
    [10] = "';' expected.",
    [11] = "Undeclared identifier.",
    [12] = "Illegal assignment.",
    [13] = "':=' expected.",
    [14] = "There must be an identifier to follow the 'call'.",
    [15] = "A constant or variable can not be called.",
    [16] = "'then' expected.",
    [17] = "';' or 'end' expected.",
    [18] = "'do' expected.",
    [19] = "Incorrect symbol.",
    [20] = "Relative operators expected.",
    [21] = "Procedure identifier can not be in an expression.",
    [22] = "Missing ')'.",
    [23] = "The symbol can not be followed by a factor.",
    [24] = "The symbol can not be as the beginning of an expression.",
    [25] = "The number is too great.",
    [26] = "The identifier is too long",
    [27] = "",
    [28] = "",
    [29] = "",
    [30] = "",
    [31] = "",
    [32] = "There are too many levels."
};

source.txt

const a=10;    {常量声明}
const b=20;
var c;         {变量声明}
procedure p;   {过程声明}
     begin
          c:=b+a
     end;

begin
     call p
end.

感想

断断续续折腾了两周，参考了好多文章，走了山路十八弯😡
看到过一篇把转换表做成二维数组的，字符数*状态数，太多了，就没用，本质和switch是一样的！
DFA太好用了！！！😭 值得！！！
现在也许还有bug，欢迎交流指正~