编译原理实验一：源程序的预处理及词法分析程序的设计与实现（python）

实验目的

设计并实现一个包含预处理功能的词法分析程序，加深对编译中词法分析过程的理解。

实验要求

1、实现预处理功能

源程序中可能包含有对程序执行无意义的符号，要求将其剔除。
首先编制一个源程序的输入过程，从键盘、文件或文本框输入若干行语句，依次存入输入缓冲区（字符型数据）；然后编制一个预处理子程序，去掉输入串中的回车符、换行符和跳格符等编辑性文字；把多个空白符合并为一个；去掉注释。

2、实现词法分析功能

输入：所给文法的源程序字符串。
输出：二元组（syn,token或sum）构成的序列。其中，
syn为单词种别码。
Token为存放的单词自身字符串。
Sum为整型常量。
具体实现时，可以将单词的二元组用结构进行处理。

3、待分析的C语言子集的词法(可以自行扩充，也可以按照C语言的词法定义)

1）关键字
main if then while do static int double struct break else long switch case typedef char return const float short continue for void default sizeof do
所有的关键字都是小写。
2）运算符和界符
+ - * / : := < <> <= > >= = ; ( ) #
3）其他标记ID和NUM
通过以下正规式定义其他标记：
ID→letter(letter|digit)*
NUM→digit digit*
letter→a|…|z|A|…|Z
digit→0|…|9…
4）空格由空白、制表符和换行符组成
空格一般用来分隔ID、NUM、专用符号和关键字，词法分析阶段通常被忽略。

4、各种单词符号对应的种别码

题目提供的种别码有部分错误，实际要求是可自行定义，这里提供一份参考:

keyword = {'main':1,'if':2,'then':3,'while':4,'do':5,
           'static':6,'int':7,'double':8,'struct':9,
           'break':10,'else':11,'long':12,'switch':13,
           'case':14,'typedef':15,'char':16,'return':17,
           'const':18,'float':19,'short':20,'continue':21,
           'for':22,'void':23,'ID':25,'NUM':26,'default':39,
           'sizeof':24,'stdio.h':40,'include':44,'scanf':48,'printf':49}
operator = {'+':27,'-':28,'*':29,'/':30,':':31,':=':32, '<':33,
            '<>':34,'<=':35,'>':36,'>=':37,'=':38,';':41,'(':42,
            ')':43,'#':0,'{':46,'}':47}
1
2
3
4
5
6
7
8
9
10

5、词法分析程序的主要算法思想

算法的基本任务是从字符串表示的源程序中识别出具有独立意义的单词符号，其基本思想是根据扫描到的单词符号的第一个字符的种类，拼出相应的单词符号。

本程序是从test.txt中读取需要分析的数据

代码：

import re
def pre(file):
    data = file.read()
    out =[]
    data = re.sub(re.compile('/\*{1,2}[\s\S]*?\*/'),"",data)
    data = re.sub(re.compile('//[\s\S]*?\n'), "", data)
    data = data.split("\n")
    for i in data:
        i = i.strip(' ').replace('\t', '')
        i = ' '.join(str(i).split())
        if(i!=''):
            out.append(i)
    return out
keyword = {'main':1,'if':2,'then':3,'while':4,'do':5,
           'static':6,'int':7,'double':8,'struct':9,
           'break':10,'else':11,'long':12,'switch':13,
           'case':14,'typedef':15,'char':16,'return':17,
           'const':18,'float':19,'short':20,'continue':21,
           'for':22,'void':23,'ID':25,'NUM':26,'default':39,
           'sizeof':24,'stdio.h':40,'include':44,'scanf':48,'printf':49}
operator = {'+':27,'-':28,'*':29,'/':30,':':31,':=':32, '<':33,
            '<>':34,'<=':35,'>':36,'>=':37,'=':38,';':41,'(':42,
            ')':43,'#':0,'{':46,'}':47}
with open('test.txt', 'r') as file:
    data = pre(file)
    #print(data)
    for i in range(len(data)):
        pattern1 = re.compile('[a-zA-Z.0-9]+')
        line = re.findall(pattern1,data[i])
        for j in line:
            if j in keyword:
                print(j+' -> '+str(keyword[j]))
            elif str(j).isdigit():
                print("'"+str(j)+"' -> 26")
            else:
                j = str(j).strip('.')
                print("'"+j+"' -> 25")
        line2 = re.sub(pattern1," ",data[i])
        line2=line2.split(" ")
        for j in line2:
            if j in operator:
                print(j+' -> '+str(operator[j]))
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42

测试文本：

#include
struct student
{
    int id;
    long int counts;
    /*

asfagsaf

    */
    /* data */
};
student stu[2000000];
int main(){
    for(long int i=0;i<2000000;i++){
        stu[i].id=i;
        stu[i].counts=0;
    }
    long int n,m;
    int a;
    scanf("%d",&n);
    for(long int i=0;i<n;i++){
        scanf("%ld",&a);
        stu[a].counts++;
    }
    scanf("%ld",&m);
    for(long int i=0;i<m;i++){
        scanf("%d",&a);
        if(stu[a].counts==0){
            printf("NO\n");
        }
        else{
            printf("YES\n");
        }
    }
    return 0;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37

输出：

include -> 44
stdio.h -> 40
# -> 0
< -> 33
> -> 36
struct -> 9
'student' -> 25
{ -> 46
int -> 7
'id' -> 25
; -> 41
long -> 12
int -> 7
'counts' -> 25
; -> 41
'student' -> 25
'stu' -> 25
'2000000' -> 26
int -> 7
main -> 1
for -> 22
long -> 12
int -> 7
'i' -> 25
'0' -> 26
'i' -> 25
'2000000' -> 26
'i' -> 25
( -> 42
= -> 38
; -> 41
< -> 33
; -> 41
'stu' -> 25
'i' -> 25
'id' -> 25
'i' -> 25
= -> 38
; -> 41
'stu' -> 25
'i' -> 25
'counts' -> 25
'0' -> 26
= -> 38
; -> 41
} -> 47
long -> 12
int -> 7
'n' -> 25
'm' -> 25
; -> 41
int -> 7
'a' -> 25
; -> 41
scanf -> 48
'd' -> 25
'n' -> 25
for -> 22
long -> 12
int -> 7
'i' -> 25
'0' -> 26
'i' -> 25
'n' -> 25
'i' -> 25
( -> 42
= -> 38
; -> 41
< -> 33
; -> 41
scanf -> 48
'ld' -> 25
'a' -> 25
'stu' -> 25
'a' -> 25
'counts' -> 25
} -> 47
scanf -> 48
'ld' -> 25
'm' -> 25
for -> 22
long -> 12
int -> 7
'i' -> 25
'0' -> 26
'i' -> 25
'm' -> 25
'i' -> 25
( -> 42
= -> 38
; -> 41
< -> 33
; -> 41
scanf -> 48
'd' -> 25
'a' -> 25
if -> 2
'stu' -> 25
'a' -> 25
'counts' -> 25
'0' -> 26
( -> 42
printf -> 49
'NO' -> 25
'n' -> 25
} -> 47
else -> 11
{ -> 46
printf -> 49
'YES' -> 25
'n' -> 25
} -> 47
} -> 47
return -> 17
'0' -> 26
; -> 41
} -> 47
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117

在这里插入图片描述

注意要点：

预处理文件后，从数据中提取单词和符号的过程使用了正则表达式，为[a-zA-Z.0-9]+，这里会出现一个小bug，算是逻辑漏洞，就是面向比如stu[i].counts=0;这样的语句，并没有根据.把语句前后分开，而是把.合并到了.counts里。解决方法是在输出的时候把.去掉。
预处理文件用到的正则表达式：/\*{1,2}[\s\S]*?\*/，目的是去除多行注释，//[\s\S]*?\n目的是去除单行注释。
目前的输出似乎没有按照关键词出现顺序，而是按照先关键词，后符号的顺序输出，这点应该需要完善一下

更正：

后期对中间过程做了一些补充和完善，让分析的输出顺序按照先后顺序进行。

import re
def pre(file):
    data = file.read()
    out =[]
    data = re.sub(re.compile('/\*{1,2}[\s\S]*?\*/'),"",data)
    data = re.sub(re.compile('//[\s\S]*?\n'), "", data)
    data = data.split("\n")
    for i in data:
        i = i.strip(' ').replace('\t', '')
        i = ' '.join(str(i).split())
        if(i!=''):
            out.append(i)
    return out
keyword = {'main':1,'if':2,'then':3,'while':4,'do':5,
           'static':6,'int':7,'double':8,'struct':9,
           'break':10,'else':11,'long':12,'switch':13,
           'case':14,'typedef':15,'char':16,'return':17,
           'const':18,'float':19,'short':20,'continue':21,
           'for':22,'void':23,'ID':25,'NUM':26,'default':39,
           'sizeof':24,'stdio.h':40,'include':44,'scanf':48,'printf':49}
operator = {'+':27,'-':28,'*':29,'/':30,':':31,':=':32, '<':33,
            '<>':34,'<=':35,'>':36,'>=':37,'=':38,';':41,'(':42,
            ')':43,'#':0,'{':46,'}':47}
with open('test2.txt', 'r') as file:
    data = pre(file)
    print("\n文本预处理后的内容：\n")
    print(data)
    print()
    for i in range(len(data)):
        line = re.split(r"([a-zA-Z.0-9]+)",data[i])
        for j in line:
            if j == '' or j == ' ':
                continue
            if j == '\\' and line[line.index(j) + 1] == 'n':
                line[line.index(j) + 1] = ''
                continue
            if j in keyword:
                print('('+str(keyword[j])+','+j+')')
            elif str(j).isdigit():
                print("(26,'"+str(j)+"')")
            elif j in operator:
                print('(' + str(operator[j])+','+j+')')
            elif j.isalpha():
                j = str(j).strip('.')
                print("(25,'"+j+"')")
            else:
                temp = str(j)
                for t in temp:
                    if t in operator:
                        print('(' + str(operator[t])+','+t+')')
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50

主要操作是更改了对于正则表达式的运用，原先是利用表达式提取，现在是利用表达式分割字符串，所以效果更好。
输出：
样例1：
在这里插入图片描述样例2：

相关阅读:
【Java基础】23.接口
 mongoDB数据库
 矩阵键盘的扫描原理与基础应用
 【和小白一起练习CTF】攻防世界：web基础练习题(1)
react的高阶组件怎么用？
SpringCloud OpenFeign token中转
 【场景化解决方案】项目管理结合酷应用，让企业项目管理更便捷高效
 5个物联网商业案例及其带给我们的启示
 [操作系统笔记]处理机调度
 【数据库】Mybatis底层原理
原文地址：https://blog.csdn.net/qq_51594676/article/details/127983533