第1关:C语言语法分析器

任务描述

本关任务:用 C/C++ 编写一个 C 语言的语法分析器程序。

相关知识

为了完成本关任务,需要掌握:

  1. DFA NFA

  2. C/C++ 编程语言基础

  3. C 语言的基本结构知识

自动机

在完成本实训前,一定要先设计相关自动机,再开始相关功能的实现。切勿,想到哪里,就编程到哪里,以至于代码一团糟,可维护性与可读性都很差。

C/C++

本实训涉及函数、结构体,标准流输入输出,字符串等操作

C语言基本结构

C 语言子集。 第一类:标识符

第二类:常数

第三类:保留字(32)

第四类:界符 /*// () { }[ ]" "'

第五类:运算符 <<=>>==+-*/^

所有语言元素集合在 c_keys.txt 文件中。

注意,C_key.txt中缺少“//注释”的情况,请也映射到编号79!

编程要求

请仔细阅读该部分

输入

样例输入放在prog.txt文件中

样例1输入

int main(){
	printf("HelloWorld");
	return 0;      
}  

输出

输出要满足以下要求

计数: <符号名,符号标号>  

注意,冒号后边有一个空格

样例1输出

1: <int,17>  
2: <main,81>  
3: <(,44>  
4: <),45>  
5: <{,59>  
6: <printf,81>  
7: <(,44>  
8: <",78>  
9: <HelloWorld,81>  
10: <",78>  
11: <),45>  
12: <;,53>  
13: <return,20>  
14: <0,80>  
15: <;,53>  
16: <},63>  

注意,输出不能有多余的空格,回车等符号。请注意样例输出最后一行后是没有回车的!输出的符号都是英文的半角符号。

ERROR

本实训不考虑错误处理,保证输入的所有代码块是合法的 C 语言代码。


开始任务吧,祝成功!

测试集1

测试输入

int main(){
	printf("HelloWorld");
	return 0;
}

预期输出

1: <int,17>
2: <main,81>
3: <(,44>
4: <),45>
5: <{,59>
6: <printf,81>
7: <(,44>
8: <",78>
9: <HelloWorld,81>
10: <",78>
11: <),45>
12: <;,53>
13: <return,20>
14: <0,80>
15: <;,53>
16: <},63>

测试集2

测试输入

int main(){
	int i = 0;// 注释 test 	
	for (i = 0; i != 10; ++i){
    	printf("%d",i);
    } 	
    return 0; 
}

预期输出

1: <int,17> 
2: <main,81> 
3: <(,44> 
4: <),45> 
5: <{,59> 
6: <int,17> 
7: <i,81> 
8: <=,72> 
9: <0,80> 
10: <;,53> 
11: <// 注释 test,79> 
12: <for,14> 
13: <(,44> 
14: <i,81> 
15: <=,72> 
16: <0,80> 
17: <;,53> 
18: <i,81> 
19: <!=,38> 
20: <10,80> 
21: <;,53> 
22: <++,66> 
23: <i,81> 
24: <),45> 
25: <{,59> 
26: <printf,81> 
27: <(,44> 
28: <",78> 
29: <%d,81> 
30: <",78> 
31: <,,48> 
32: <i,81> 
33: <),45> 
34: <;,53> 
35: <},63> 
36: <return,20> 
37: <0,80> 
38: <;,53> 
39: <},63>

测试集3

测试输入

int main(){
    char c = "h";/* 注释 12313test */
    if (c=="h")
        printf("%c",c);
    else print("k");
    return 0;
}

预期输出

1: <int,17>
2: <main,81>
3: <(,44>
4: <),45>
5: <{,59>
6: <char,4>
7: <c,81>
8: <=,72>
9: <",78>
10: <h,81>
11: <",78>
12: <;,53>
13: </* 注释 12313test */,79>
14: <if,16>
15: <(,44>
16: <c,81>
17: <==,73>
18: <",78>
19: <h,81>
20: <",78>
21: <),45>
22: <printf,81>
23: <(,44>
24: <",78>
25: <%c,81>
26: <",78>
27: <,,48>
28: <c,81>
29: <),45>
30: <;,53>
31: <else,10>
32: <print,81>
33: <(,44>
34: <",78>
35: <k,81>
36: <",78>
37: <),45>
38: <;,53>
39: <return,20>
40: <0,80>
41: <;,53>
42: <},63>

测试集4

测试输入

bool gofor(char& ch, string& pos, const string& prog){
	++pos;
	if (pos >= prog.size())	{
		return false;
	}else{
		ch = prog[pos];
		return true;
	}
}

预期输出

1: <bool,81>
2: <gofor,81>
3: <(,44>
4: <char,4>
5: <&,41>
6: <ch,81>
7: <,,48>
8: <string,81>
9: <&,41>
10: <pos,81>
11: <,,48>
12: <const,5>
13: <string,81>
14: <&,41>
15: <prog,81>
16: <),45>
17: <{,59>
18: <++,66>
19: <pos,81>
20: <;,53>
21: <if,16>
22: <(,44>
23: <pos,81>
24: <>=,75>
25: <prog,81>
26: <.,49>
27: <size,81>
28: <(,44>
29: <),45>
30: <),45>
31: <{,59>
32: <return,20>
33: <false,81>
34: <;,53>
35: <},63>
36: <else,10>
37: <{,59>
38: <ch,81>
39: <=,72>
40: <prog,81>
41: <[,55>
42: <pos,81>
43: <],56>
44: <;,53>
45: <return,20>
46: <true,81>
47: <;,53>
48: <},63>
49: <},63>

代码文件

LexAnalysis.h

// C语言词法分析器
#include <cstdio>
#include <cstring>
#include <iostream>
#include <map>
#include <string>
#include <fstream>
#include <sstream>
#include <vector>
using namespace std;
/* 不要修改这个标准输入函数 */
void read_prog(string& prog)
{
	char c;
	while(scanf("%c",&c)!=EOF){
		prog += c;
	}
}
/* 可以添加其他函数 */

void Analysis()
{
	string prog;
	read_prog(prog);
	/* 请开始 */
    /********* Begin *********/
    
    
    /********* End *********/
	
}

C_LexAnalysis_mainProcess.cpp

#include "LexAnalysis.h"
int main()
{
	Analysis(); 
 	return 0;
}

c_keys.txt

auto    1
break    2
case    3
char    4
const    5
continue    6
default    7
do    8
double    9
else    10
enum    11
extern    12
float    13
for    14
goto    15
if    16
int    17
long    18
register    19
return    20
short    21
signed    22
sizeof    23
static    24
struct    25
switch    26
typedef    27
union    28
unsigned    29
void    30
volatile    31
while    32
-    33
--    34
-=    35
->    36
!    37
!=    38
%    39
%=    40
&    41
&&    42
&=    43
(    44
)    45
*    46
*=    47
,    48
.    49
/    50
/=    51
:    52
;    53
?    54
[    55
]    56
^    57
^=    58
{    59
|    60
||    61
|=    62
}    63
~    64
+    65
++    66
+=    67
<    68
<<    69
<<=    70
<=    71
=    72
==    73
>    74
>=    75
>>    76
>>=    77
"    78
/*注释*/    79
常数    80
标识符    81

Code

C_LexAnalysis_mainProcess.cpp

int main()

#include "LexAnalysis.h"
int main()
{
    Analysis();
    return 0;
}

LexAnalysis.h

void Analysis()

void Analysis()
{
    //string prog;
    read_prog();

    /* 骚年们 请开始你们的表演 */
    /********* Begin *********/
    LexAnalysis();
    printList(listOfAllToken);
    /********* End *********/

}

void read_prog()

/* 不要修改这个标准输入函数 */
void read_prog(){
    char c;
    while(scanf("%c",&c)!=EOF){
        prog += c;
    }
}

void LexAnalysis()

void LexAnalysis() {
    int pos;//位置
    int cur_cnt = 1;//计数

    for (pos = 0; pos < prog.length(); pos++) {
        while (prog[pos] != ' ' && pos < prog.length()) {
            switch (startCharType(prog[pos])) {
                case 1:
                    pos = numToken(pos, cur_cnt);//数字开头
                    cur_cnt++;
                    break;
                case 2:
                    pos = alphaToken(pos, cur_cnt);//字母开头
                    cur_cnt++;
                    break;
                case 3:
                    pos = delimiterToken(pos, cur_cnt);//界符开头
                    cur_cnt++;
                    break;
                case 4:
                    pos = operatorToken(pos, cur_cnt);//运算符开头
                    cur_cnt++;
                    break;
                case 5:
                    pos++;
                    break;
                default:
                    pos++;
                    break;
            }
        }
    }
}
int startCharType()
/**通过单词的首个字符,分析输入词的类型*/
int startCharType(char ch){
    int type ;
    if (isDigit(ch)){
        type = 1;
    }else{
        if (isAlpha(ch)){
            type = 2;
        }else{
            if (isDelimiter(ch)){
                type = 3;
            }else{
                if (isOperator(ch)){
                    type = 4;
                }else {
                    if (ch == '\n'){
                        type = 6;
                    }else {
                        type = 5;
                    }
                }
            }
        }
    }
    return type;
}
bool isDigit()
/**是否数字 → 80 常数 */
bool isDigit(char ch) {
    if (ch >= '0' && ch <= '9') {
        return true;
    }
    return false;
}
bool isAlpha()
/**是否字母 → 1-32 保留字 & 81 标识符*/
bool isAlpha(char ch) {
    if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch == '_'){
        return true;
    }
    return false;
}
bool isDelimiter()
/**是否界符 → 44-45、48-49、52-53、55-56、59、63 界符*/
bool isDelimiter(char ch) {
    char delimiter[11]={
            '(', ')',//44 45
            ',', '.',//48 49
            ':', ';',//52 53
            '[', ']',//55 56
            '{', '}',//59 63
            '\"'//78
    };

    for (char i : delimiter) {
        if (ch == i){
            return true;
        }
    }
    return false;
}
bool isOperator()
/**是否运算符 → 33-43、46-47、50-51、54、57-58、60-62、64-77 运算符*/
bool isOperator(char ch){
    char operators[14]={
            '-', '+','*','/', '%','=',
            '>', '<',
            '!', '?',
            '&','^', '|','~'
    };

    for (char i : operators) {
        if (ch == i){
            return true;
        }
    }
    return false;
}
int numToken()
/**数字开头*/
int numToken(int pos, int cur_cnt){
    token num_token;
    string num_token_value;

    num_token_value += prog[pos++];

    //数字类型
    while(isDigit(prog[pos]) || prog[pos] == '.'){
        num_token_value += prog[pos++];
    }

    //生成数字类型token结点并插入
    num_token.setToken(num_token_value, cur_cnt,80);
    insertIntoList(listOfAllToken, num_token);

    //返回分析进度最新位置
    return pos;
}
int alphaToken()
/**字母开头*/
int alphaToken(int pos, int cur_cnt){
    token alpha_token;
    string alpha_token_value;
    alpha_token_value += prog[pos++];

    //后面字符是字母或数字
    while(isAlpha(prog[pos]) || isDigit(prog[pos])){
        alpha_token_value += prog[pos++];
    }

    //查表,若不是保留字则是标识符
    if(isKeyword(alpha_token_value)){
        alpha_token.setToken(alpha_token_value, cur_cnt, labKeyword(alpha_token_value)+1);
    }else{
        alpha_token.setToken(alpha_token_value, cur_cnt, 81);
    }

    insertIntoList(listOfAllToken, alpha_token);

    return pos;
}
bool isKeyword()
bool isKeyword(const string& token) {

    string KeyWord[32] = {
            "auto", "break", "case", "char", "const", "continue",
            "default", "do", "double", "else", "enum", "extern",
            "float", "for", "goto", "if", "int", "long",
            "register", "return", "short", "signed", "sizeof", "static",
            "struct", "switch", "typedef", "union", "unsigned", "void",
            "volatile", "while"
    };

    for (const string& a : KeyWord){
        if (token == a){
            return true;
        }
    }

    return false;
}
int labKeyword()
int labKeyword(const string& token) {

    string KeyWord[32] = {
            "auto", "break", "case", "char", "const", "continue",
            "default", "do", "double", "else", "enum", "extern",
            "float", "for", "goto", "if", "int", "long",
            "register", "return", "short", "signed", "sizeof", "static",
            "struct", "switch", "typedef", "union", "unsigned", "void",
            "volatile", "while"
    };

    for (int lab=0;lab<32;lab++){
        if (token == KeyWord[lab]){
            return lab;
        }
    }
    return 0;
}
int delimiterToken()
/**界符开头*/
int delimiterToken(int pos, int cur_cnt){
    token delimiter_token;
    string delimiter_token_value;

    delimiter_token_value += prog[pos++];

    //生成界符类型token结点并插入
    delimiter_token.setToken( delimiter_token_value, cur_cnt,labDelimiter(delimiter_token_value));
    insertIntoList(listOfAllToken, delimiter_token);

    return pos;
}
int labDelimiter()
int labDelimiter(const string& token) {
    if (token == "("){
        return 44;
    }else if (token == ")"){
        return 45;
    }else if (token == ","){
        return 48;
    }else if (token == "."){
        return 49;
    }else if (token == ":"){
        return 52;
    }else if (token == ";"){
        return 53;
    }else if (token == "["){
        return 55;
    }else if (token == "]"){
        return 56;
    }else if (token == "{"){
        return 59;
    }else if (token == "}"){
        return 63;
    }else if (token == "\""){
        return 78;
    }
    return 0;
}
int operatorToken()
int operatorToken(int pos, int cur_cnt){
    token operator_token;
    string operator_token_value;

    if (prog[pos] == '-'){
        if(prog[pos + 1] == '-' ){
            operator_token_value = "--";
            pos++;
            operator_token.setToken(operator_token_value, cur_cnt ,34);
        }else if(prog[pos + 1] == '=' ){
            operator_token_value = "-=";
            pos++;
            operator_token.setToken(operator_token_value, cur_cnt ,35);
        }else if(prog[pos + 1] == '>' ){
            operator_token_value = "->";
            pos++;
            operator_token.setToken(operator_token_value, cur_cnt ,36);
        }else{
            operator_token_value = "-";
            operator_token.setToken(operator_token_value, cur_cnt ,33);
        }
    }else if (prog[pos] == '!'){
        if(prog[pos + 1] == '=' ){
            operator_token_value = "!=";
            pos++;
            operator_token.setToken(operator_token_value, cur_cnt ,38);
        }else{
            operator_token_value = "!";
            operator_token.setToken(operator_token_value, cur_cnt ,37);
        }
    }else if (prog[pos] == '%'){
        /**PS.需要考虑%d等情况*/
        if(prog[pos + 1] == '=' ){
            operator_token_value = "%=";
            pos++;
            operator_token.setToken(operator_token_value, cur_cnt ,40);
        }else if(isAlpha(prog[pos + 1])){
            operator_token_value += "%";
            pos++;

            operator_token_value += prog[pos++];

            //后面字符是字母
            while(isAlpha(prog[pos]) ){
                operator_token_value += prog[pos++];
            }

            operator_token.setToken(operator_token_value, cur_cnt ,81);
            pos--;
        }else{
            operator_token_value = "%";
            operator_token.setToken(operator_token_value, cur_cnt ,39);
        }
    }else if (prog[pos] == '&'){
        if(prog[pos + 1] == '&' ){
            operator_token_value = "&&";
            pos++;
            operator_token.setToken(operator_token_value, cur_cnt ,42);
        }else if(prog[pos + 1] == '=' ){
            operator_token_value = "&=";
            pos++;
            operator_token.setToken(operator_token_value, cur_cnt ,43);
        }else{
            operator_token_value = "&";
            operator_token.setToken(operator_token_value, cur_cnt ,41);
        }
    }else if (prog[pos] == '*'){
        if(prog[pos + 1] == '=' ){
            operator_token_value = "*=";
            pos++;
            operator_token.setToken(operator_token_value, cur_cnt ,47);
        }else{
            operator_token_value = "*";
            operator_token.setToken(operator_token_value, cur_cnt ,46);
        }
    }else if (prog[pos] == '/'){
        //// 注释 test
        /**PS.需要考虑单行/多行注释等情况*/
        if(prog[pos + 1] == '=' ){
            operator_token_value = "/=";
            pos++;
            operator_token.setToken(operator_token_value, cur_cnt ,51);
        }else if(prog[pos + 1] == '/' ){
            operator_token_value = "//";
            pos += 2;
            while (prog[pos] != '\n') {
                operator_token_value += prog[pos];
                pos++;
            }
            operator_token.setToken(operator_token_value, cur_cnt ,79);
        }else if(prog[pos + 1] == '*' ){
            operator_token_value = "/*";
            pos+=2;
            while (!(prog[pos] == '*' && prog[pos + 1] == '/')) {
                //判断注释符是否合拢
                if (prog[pos] == '\0') {
                    cout << "annotation error!" << endl;
                    exit(0);
                }
                operator_token_value += prog[pos];
                ++pos;
            }
            operator_token_value += "*/";
            pos += 2;
            operator_token.setToken(operator_token_value, cur_cnt ,79);
        }else{
            operator_token_value = "/";
            operator_token.setToken(operator_token_value, cur_cnt ,50);
        }
        /**79 注释*/
/*
        if (str[pos] != '\v' && str[pos] != '\t') {
            tmp += str[pos];
        }
    }
}*/
    }else if (prog[pos] == '?'){
        operator_token_value = "?";
        operator_token.setToken(operator_token_value, cur_cnt ,54);
    }else if (prog[pos] == '^'){
        if(prog[pos + 1] == '=' ){
            operator_token_value = "^=";
            pos++;
            operator_token.setToken(operator_token_value, cur_cnt ,58);
        }else{
            operator_token_value = "^";
            operator_token.setToken(operator_token_value, cur_cnt ,57);
        }
    }else if (prog[pos] == '|'){
        if(prog[pos + 1] == '|' ){
            operator_token_value = "||";
            pos++;
            operator_token.setToken(operator_token_value, cur_cnt ,61);
        }else if(prog[pos + 1] == '=' ){
            operator_token_value = "|=";
            pos++;
            operator_token.setToken(operator_token_value, cur_cnt ,62);
        }else{
            operator_token_value = "|";
            operator_token.setToken(operator_token_value, cur_cnt ,60);
        }
    }else if (prog[pos] == '~'){
        operator_token_value = "~";
        operator_token.setToken(operator_token_value, cur_cnt ,64);
    }else if (prog[pos] == '+'){
        if(prog[pos + 1] == '+' ){
            operator_token_value = "++";
            pos++;
            operator_token.setToken(operator_token_value, cur_cnt ,66);
        }else if(prog[pos + 1] == '=' ){
            operator_token_value = "+=";
            pos++;
            operator_token.setToken(operator_token_value, cur_cnt ,67);
        }else{
            operator_token_value = "+";
            operator_token.setToken(operator_token_value, cur_cnt ,65);
        }
    }else if (prog[pos] == '<'){
        if(prog[pos + 1] == '<' ){
            pos++;
            if(prog[pos + 2] == '=' ){
                operator_token_value = "<<=";
                pos++;
                operator_token.setToken(operator_token_value, cur_cnt ,70);
            }else {
                operator_token_value = "<<";
                pos++;
                operator_token.setToken(operator_token_value, cur_cnt ,69);
            }
        }else if(prog[pos + 1] == '=' ){
            operator_token_value = "<=";
            pos++;
            operator_token.setToken(operator_token_value, cur_cnt ,71);
        }else{
            operator_token_value = "<";
            operator_token.setToken(operator_token_value, cur_cnt ,68);
        }
    }else if (prog[pos] == '='){
        if(prog[pos + 1] == '=' ){
            operator_token_value = "==";
            pos++;
            operator_token.setToken(operator_token_value, cur_cnt ,73);
        }else{
            operator_token_value = "=";
            operator_token.setToken(operator_token_value, cur_cnt ,72);
        }
    }else if (prog[pos] == '>'){
        if(prog[pos + 1] == '>' ){
            pos++;
            if(prog[pos + 2] == '=' ){
                operator_token_value = ">>=";
                pos++;
                operator_token.setToken(operator_token_value, cur_cnt ,77);
            }else {
                operator_token_value = ">>";
                pos++;
                operator_token.setToken(operator_token_value, cur_cnt ,76);
            }
        }else if(prog[pos + 1] == '=' ){
            operator_token_value = ">=";
            pos++;
            operator_token.setToken(operator_token_value, cur_cnt ,75);
        }else{
            operator_token_value = ">";
            operator_token.setToken(operator_token_value, cur_cnt ,74);
        }
    }
    pos++;
    //生成操作符类型token结点并插入
    insertIntoList(listOfAllToken, operator_token);

    //返回分析进度最新位置
    return pos;
}

void printList(tokenList list)

/**链表输出操作*/
void printList(tokenList list){
    tokenList *p = &list;//指向 list 的指针

    //开始输出
    while (p){
        //输出token,以 计数: <符号名,符号标号> 的形式
        cout << p->Token.cnt << ": <" << p->Token.str << "," << p->Token.lab << ">" << endl;
        p = p->next;
    }
}
class tokenList {}
/**存储所有 Token 的链表结构*/
class tokenList {
public:
    token Token;
    tokenList *next{};
};
class token {}
/**存储 token 信息的结构体*/
class token {
public:
    int cnt=0;//计数/行数
    string str;//符号名
    int lab=0;//符号标号

    void setToken(string newStr, int curCnt, int newLab){
        this->str = std::move(newStr);//符号名
        this->cnt = curCnt;//计数/行数
        this->lab = newLab;//符号标号
    }
};