1
+ import enum
2
+ import sys
3
+
4
class LexingError(enum.Enum):
    """Error categories that can be passed to the lexer's abort routine."""

    # The current character does not start any recognised token.
    UNKNOWN_TOKEN = 1
6
+
7
class TokenType(enum.Enum):
    """Every kind of token the lexer can produce.

    The numeric ranges are significant: keyword members use values 101-199
    and keyword detection relies on that range, so new keywords must stay
    inside it. Operators use the 2XX range.
    """

    # Structural tokens and literals.
    EOF = -1
    NEWLINE = 0
    NUMBER = 1
    IDENT = 2
    STRING = 3

    # Keywords (values must remain 1XX).
    LABEL = 101
    GOTO = 102
    PRINT = 103
    INPUT = 104
    LET = 105
    IF = 106
    THEN = 107
    ENDIF = 108
    WHILE = 109
    REPEAT = 110
    ENDWHILE = 111
    FUNC = 112
    RETURN = 113

    # Operators (2XX).
    EQ = 201
    PLUS = 202
    MINUS = 203
    ASTERISK = 204
    SLASH = 205
    EQEQ = 206
    NOTEQ = 207
    LT = 208
    LTEQ = 209
    GT = 210
    GTEQ = 211
    LOGNOT = 212
40
+
41
+
42
class Token:
    """A single lexical token: the matched source text and its kind."""

    def __init__(self, tokenText, tokenKind):
        """Store the token's text and kind.

        tokenText -- the characters that make up the token.
        tokenKind -- the TokenType member this token is classified as.
        """
        self.text = tokenText
        self.kind = tokenKind

    def __repr__(self):
        """Return a debugging representation showing text and kind."""
        return f"Token({self.text!r}, {self.kind!r})"

    @staticmethod
    def checkIfKeyword(tokenText):
        """Return the keyword TokenType named tokenText, or None.

        A member counts as a keyword only when its value lies in the
        1XX range (100 <= value < 200); any other name, including the
        names of non-keyword members such as "PLUS", yields None.
        """
        # Direct name lookup instead of scanning every enum member.
        kind = TokenType.__members__.get(tokenText)
        if kind is not None and 100 <= kind.value < 200:
            return kind
        return None
54
+
55
class Lexer:
    """Converts source text into Token objects, one per getToken() call.

    Newlines are significant (they produce NEWLINE tokens); spaces, tabs,
    carriage returns and '#' comments are skipped. Once the input is
    exhausted the current character becomes '\\0' and getToken() returns
    EOF tokens.
    """

    def __init__(self, _input):
        """Prepare to lex _input and load its first character."""
        self.source = _input + '\n'  # Trailing newline simplifies lexing the last statement.
        self.curChar = ''            # Current character in the input.
        self.curPos = -1             # Index of curChar within self.source.
        self.nextChar()

    def nextChar(self):
        """Advance to the next character; curChar becomes '\\0' past the end."""
        self.curPos += 1
        if self.curPos >= len(self.source):
            self.curChar = '\0'  # Null character marks end of input.
        else:
            self.curChar = self.source[self.curPos]

    def peek(self):
        """Return the character after curChar without advancing ('\\0' at end)."""
        if self.curPos + 1 >= len(self.source):
            return '\0'
        return self.source[self.curPos + 1]

    def abort(self, error):
        """Terminate the program via sys.exit with a lexing error message.

        error -- either a LexingError member or a ready-made message string.
                 String messages are reported verbatim; previously they were
                 discarded and shown as "Unknown error. 0x0".
        """
        if isinstance(error, str):
            message = error
        elif error == LexingError.UNKNOWN_TOKEN:
            message = f"Unknown token error (0x1) at position {self.curPos}: '{self.curChar}'."
        else:
            message = f"Unknown error. 0x{0:x}"

        sys.exit("LEXING ERROR:\n" + message)

    def skipWhitespace(self):
        """Skip spaces, tabs and carriage returns (newlines end statements, so they stay)."""
        while self.curChar in (' ', '\t', '\r'):
            self.nextChar()

    def skipComment(self):
        """Skip a '#' comment up to, but not including, the terminating newline."""
        if self.curChar == '#':
            # The source always ends with '\n', so this loop terminates.
            while self.curChar != '\n':
                self.nextChar()

    def _operatorToken(self, singleKind, doubleKind):
        """Lex an operator that becomes a two-character token when followed by '='."""
        if self.peek() == '=':
            firstChar = self.curChar
            self.nextChar()
            return Token(firstChar + self.curChar, doubleKind)
        return Token(self.curChar, singleKind)

    def _stringToken(self):
        """Lex a double-quoted string; curChar ends on the closing quote."""
        self.nextChar()  # Step past the opening quote.
        startPos = self.curPos

        while self.curChar != '"':
            # Don't allow special characters in the string. No escape
            # characters, newlines, tabs, or %: the text will be handed to
            # C's printf.
            if self.curChar in ('\r', '\n', '\t', '\\', '%'):
                self.abort("Illegal character in string.")
            self.nextChar()

        return Token(self.source[startPos:self.curPos], TokenType.STRING)

    def _numberToken(self):
        """Lex digits with an optional fractional part; curChar ends on the last digit."""
        startPos = self.curPos
        while self.peek().isdigit():
            self.nextChar()
        if self.peek() == '.':  # Decimal point.
            self.nextChar()

            # Must have at least one digit after the decimal point.
            if not self.peek().isdigit():
                self.abort("Illegal character in number.")
            while self.peek().isdigit():
                self.nextChar()

        return Token(self.source[startPos:self.curPos + 1], TokenType.NUMBER)

    def _wordToken(self):
        """Lex an alphanumeric word and classify it as a keyword or an identifier."""
        startPos = self.curPos
        while self.peek().isalnum():
            self.nextChar()

        tokText = self.source[startPos:self.curPos + 1]
        keyword = Token.checkIfKeyword(tokText)
        if keyword is None:
            return Token(tokText, TokenType.IDENT)
        return Token(tokText, keyword)

    def getToken(self):
        """Return the next Token, aborting if the input starts no valid token."""
        self.skipWhitespace()
        self.skipComment()
        token = None
        char = self.curChar

        if char == '+':
            token = Token(char, TokenType.PLUS)
        elif char == '-':
            token = Token(char, TokenType.MINUS)
        elif char == '*':
            token = Token(char, TokenType.ASTERISK)
        elif char == '/':
            token = Token(char, TokenType.SLASH)
        elif char == '\n':
            token = Token(char, TokenType.NEWLINE)
        elif char == '\0':
            token = Token(char, TokenType.EOF)
        elif char == '=':
            token = self._operatorToken(TokenType.EQ, TokenType.EQEQ)
        elif char == '>':
            token = self._operatorToken(TokenType.GT, TokenType.GTEQ)
        elif char == '<':
            token = self._operatorToken(TokenType.LT, TokenType.LTEQ)
        elif char == '!':
            token = self._operatorToken(TokenType.LOGNOT, TokenType.NOTEQ)
        elif char == '"':
            token = self._stringToken()
        elif char.isdigit():
            token = self._numberToken()
        elif char.isalpha():
            token = self._wordToken()
        else:
            # Unknown token!
            self.abort(LexingError.UNKNOWN_TOKEN)

        # Every branch leaves curChar on the token's last character.
        self.nextChar()
        return token
0 commit comments