1 | unit Tokenizer;
|
---|
2 |
|
---|
3 | interface
|
---|
4 |
|
---|
5 | uses
|
---|
6 | Classes, SysUtils;
|
---|
7 |
|
---|
8 | type
|
---|
9 | { TTokenizerPos }
|
---|
10 |
|
---|
11 | TTokenizerPos = record
|
---|
12 | Index: Integer;
|
---|
13 | Pos: TPoint;
|
---|
14 | procedure Increment;
|
---|
15 | procedure IncrementLine;
|
---|
16 | end;
|
---|
17 |
|
---|
18 | TTokenKind = (tkUnknown, tkIdentifier, tkSpecialSymbol, tkNumber, tkString,
|
---|
19 | tkKeyword);
|
---|
20 |
|
---|
21 | TToken = record
|
---|
22 | Kind: TTokenKind;
|
---|
23 | Text: string;
|
---|
24 | function Create(Kind: TTokenKind; Text: string): TToken;
|
---|
25 | end;
|
---|
26 |
|
---|
27 | TErrorEvent = procedure (Pos: TPoint; Text: string) of object;
|
---|
28 |
|
---|
29 | TTokenizerState = (tsNone, tsIdentifier, tsString, tsStringEnd, tsNumber,
|
---|
30 | tsSpecialSymbol, tsLineComment);
|
---|
31 |
|
---|
32 | { TTokenizer }
|
---|
33 |
|
---|
34 | TTokenizer = class
|
---|
35 | private
|
---|
36 | FOnError: TErrorEvent;
|
---|
37 | State: TTokenizerState;
|
---|
38 | public
|
---|
39 | Pos: TTokenizerPos;
|
---|
40 | Source: string;
|
---|
41 | function IsAlpha(C: Char): Boolean;
|
---|
42 | function IsNumeric(C: Char): Boolean;
|
---|
43 | function IsAlphaNumeric(C: Char): Boolean;
|
---|
44 | function IsWhiteSpace(C: Char): Boolean;
|
---|
45 | function IsSpecialSymbol(C: Char): Boolean;
|
---|
46 | function IsSpecialSymbol2(Text: string): Boolean;
|
---|
47 | function IsIdentifier(Text: string): Boolean;
|
---|
48 | function IsOperator(Text: string): Boolean;
|
---|
49 | function IsKeyword(Text: string): Boolean;
|
---|
50 | procedure Init;
|
---|
51 | function GetNext: TToken;
|
---|
52 | function CheckNext(Text: string; Kind: TTokenKind): Boolean;
|
---|
53 | function CheckNextAndRead(Text: string; Kind: TTokenKind): Boolean;
|
---|
54 | function CheckNextKind(Kind: TTokenKind): Boolean;
|
---|
55 | function CheckNextKindAndRead(Kind: TTokenKind): Boolean;
|
---|
56 | procedure Expect(Text: string; Kind: TTokenKind);
|
---|
57 | procedure Error(Text: string);
|
---|
58 | property OnError: TErrorEvent read FOnError write FOnError;
|
---|
59 | end;
|
---|
60 |
|
---|
61 |
|
---|
62 | implementation
|
---|
63 |
|
---|
64 | resourcestring
|
---|
65 | SUnknownToken = 'Unknown token %s';
|
---|
66 | SUnsupportedTokenizerState = 'Unsupported tokenizer state.';
|
---|
67 | SExpectedButFound = 'Expected %s but %s found.';
|
---|
68 |
|
---|
69 | { TToken }
|
---|
70 |
|
---|
71 | function TToken.Create(Kind: TTokenKind; Text: string): TToken;
|
---|
72 | begin
|
---|
73 | Result.Kind := Kind;
|
---|
74 | Result.Text := Text;
|
---|
75 | end;
|
---|
76 |
|
---|
77 | { TTokenizerPos }
|
---|
78 |
|
---|
79 | procedure TTokenizerPos.Increment;
|
---|
80 | begin
|
---|
81 | Inc(Index);
|
---|
82 | Inc(Pos.X);
|
---|
83 | end;
|
---|
84 |
|
---|
85 | procedure TTokenizerPos.IncrementLine;
|
---|
86 | begin
|
---|
87 | Inc(Pos.Y);
|
---|
88 | Pos.X := 1;
|
---|
89 | Inc(Index);
|
---|
90 | end;
|
---|
91 |
|
---|
92 | { TTokenizer }
|
---|
93 |
|
---|
94 | function TTokenizer.IsAlpha(C: Char): Boolean;
|
---|
95 | begin
|
---|
96 | Result := (C in ['A'..'Z']) or (C in ['a'..'z'])
|
---|
97 | end;
|
---|
98 |
|
---|
99 | function TTokenizer.IsNumeric(C: Char): Boolean;
|
---|
100 | begin
|
---|
101 | Result := C in ['0'..'9'];
|
---|
102 | end;
|
---|
103 |
|
---|
104 | function TTokenizer.IsAlphaNumeric(C: Char): Boolean;
|
---|
105 | begin
|
---|
106 | Result := IsAlpha(C) or IsNumeric(C);
|
---|
107 | end;
|
---|
108 |
|
---|
109 | function TTokenizer.IsWhiteSpace(C: Char): Boolean;
|
---|
110 | begin
|
---|
111 | Result := (C = ' ') or (C = #9);
|
---|
112 | end;
|
---|
113 |
|
---|
114 | function TTokenizer.IsSpecialSymbol(C: Char): Boolean;
|
---|
115 | begin
|
---|
116 | Result := (C = ';') or (C = '.') or (C = '(') or (C = ')') or (C = '=') or
|
---|
117 | (C = ':') or (C = '+') or (C = '-') or (C = ',') or (C = '/') or
|
---|
118 | (C = '<') or (C = '>') or (C = '*');
|
---|
119 | end;
|
---|
120 |
|
---|
121 | function TTokenizer.IsSpecialSymbol2(Text: string): Boolean;
|
---|
122 | begin
|
---|
123 | Result := (Text = ':=') or (Text = '//') or (Text = '<>') or (Text = '<=') or
|
---|
124 | (Text = '>=');
|
---|
125 | end;
|
---|
126 |
|
---|
127 | function TTokenizer.IsIdentifier(Text: string): Boolean;
|
---|
128 | var
|
---|
129 | I: Integer;
|
---|
130 | begin
|
---|
131 | Result := True;
|
---|
132 | for I := 1 to Length(Text) do begin
|
---|
133 | if (I = 1) then begin
|
---|
134 | if not IsAlpha(Text[I]) then begin
|
---|
135 | Result := False;
|
---|
136 | Break;
|
---|
137 | end;
|
---|
138 | end else begin
|
---|
139 | if not IsAlphaNumeric(Text[I]) then begin
|
---|
140 | Result := False;
|
---|
141 | Break;
|
---|
142 | end;
|
---|
143 | end;
|
---|
144 | end;
|
---|
145 | end;
|
---|
146 |
|
---|
147 | function TTokenizer.IsOperator(Text: string): Boolean;
|
---|
148 | begin
|
---|
149 | Result := (Text = '+') or (Text = '-') or (Text = '=') or (Text = '<>') or
|
---|
150 | (Text = '*') or (Text = '/') or (Text = 'div') or (Text = '<=') or
|
---|
151 | (Text = '>=') or (Text = 'mod') or (Text = 'shl') or (Text = 'shr') or
|
---|
152 | (Text = 'and') or (Text = 'or') or (Text = 'xor') or (Text = 'not') or
|
---|
153 | (Text = '>') or (Text = '<');
|
---|
154 | end;
|
---|
155 |
|
---|
156 | function TTokenizer.IsKeyword(Text: string): Boolean;
|
---|
157 | begin
|
---|
158 | Result := (Text = 'begin') or (Text = 'end') or (Text = 'program') or
|
---|
159 | (Text = 'var') or (Text = 'const') or (Text = 'if') or (Text = 'then') or
|
---|
160 | (Text = 'else') or (Text = 'while') or (Text = 'do') or (Text = 'for') or
|
---|
161 | (Text = 'to') or (Text = 'repeat') or (Text = 'until') or (Text = 'break') or
|
---|
162 | (Text = 'continue') or (Text = 'function') or (Text = 'procedure');
|
---|
163 | end;
|
---|
164 |
|
---|
165 | procedure TTokenizer.Init;
|
---|
166 | begin
|
---|
167 | Pos.Index := 1;
|
---|
168 | Pos.Pos := Point(1, 1);
|
---|
169 | end;
|
---|
170 |
|
---|
171 | function TTokenizer.GetNext: TToken;
|
---|
172 | var
|
---|
173 | C: Char;
|
---|
174 | begin
|
---|
175 | State := tsNone;
|
---|
176 | Result.Text := '';
|
---|
177 | Result.Kind := tkUnknown;
|
---|
178 | while Pos.Index < Length(Source) do begin
|
---|
179 | C := Source[Pos.Index];
|
---|
180 | if State = tsNone then begin
|
---|
181 | if IsAlpha(C) then begin
|
---|
182 | State := tsIdentifier;
|
---|
183 | Result.Kind := tkIdentifier;
|
---|
184 | Result.Text := C;
|
---|
185 | Pos.Increment;
|
---|
186 | end else
|
---|
187 | if IsWhiteSpace(C) then begin
|
---|
188 | Pos.Increment;
|
---|
189 | end else
|
---|
190 | if C = #10 then begin
|
---|
191 | Pos.IncrementLine;
|
---|
192 | end else
|
---|
193 | if IsSpecialSymbol(C) then begin
|
---|
194 | Result.Kind := tkSpecialSymbol;
|
---|
195 | Result.Text := C;
|
---|
196 | Pos.Increment;
|
---|
197 | State := tsSpecialSymbol;
|
---|
198 | end else
|
---|
199 | if C = '''' then begin
|
---|
200 | Pos.Increment;
|
---|
201 | Result.Kind := tkString;
|
---|
202 | State := tsString;
|
---|
203 | end else
|
---|
204 | if IsNumeric(C) then begin
|
---|
205 | State := tsNumber;
|
---|
206 | Result.Kind := tkNumber;
|
---|
207 | Result.Text := C;
|
---|
208 | Pos.Increment;
|
---|
209 | end else begin
|
---|
210 | Error(Format(SUnknownToken, [C]));
|
---|
211 | Break;
|
---|
212 | end;
|
---|
213 | end else
|
---|
214 | if State = tsIdentifier then begin
|
---|
215 | if IsAlphanumeric(C) then begin
|
---|
216 | Result.Text := Result.Text + C;
|
---|
217 | Pos.Increment;
|
---|
218 | end else begin
|
---|
219 | if IsKeyword(Result.Text) then
|
---|
220 | Result.Kind := tkKeyword;
|
---|
221 | Break;
|
---|
222 | end;
|
---|
223 | end else
|
---|
224 | if State = tsNumber then begin
|
---|
225 | if IsNumeric(C) then begin
|
---|
226 | Result.Text := Result.Text + C;
|
---|
227 | Pos.Increment;
|
---|
228 | end else begin
|
---|
229 | Break;
|
---|
230 | end;
|
---|
231 | end else
|
---|
232 | if State = tsString then begin
|
---|
233 | if C = '''' then begin
|
---|
234 | State := tsStringEnd;
|
---|
235 | Pos.Increment;
|
---|
236 | end else begin
|
---|
237 | Result.Text := Result.Text + C;
|
---|
238 | Pos.Increment;
|
---|
239 | end;
|
---|
240 | end else
|
---|
241 | if State = tsStringEnd then begin
|
---|
242 | if C = '''' then begin
|
---|
243 | State := tsString;
|
---|
244 | Result.Text := Result.Text + C;
|
---|
245 | Pos.Increment;
|
---|
246 | end else begin
|
---|
247 | Break;
|
---|
248 | end;
|
---|
249 | end else
|
---|
250 | if State = tsLineComment then begin
|
---|
251 | if C = #10 then begin
|
---|
252 | State := tsNone;
|
---|
253 | end else Pos.Increment;
|
---|
254 | end else
|
---|
255 | if State = tsSpecialSymbol then begin
|
---|
256 | if IsSpecialSymbol2(Result.Text + C) then begin
|
---|
257 | Result.Text := Result.Text + C;
|
---|
258 | Pos.Increment;
|
---|
259 | if Result.Text = '//' then begin
|
---|
260 | Result.Text := '';
|
---|
261 | State := tsLineComment;
|
---|
262 | end else Break;
|
---|
263 | end else begin
|
---|
264 | Break;
|
---|
265 | end;
|
---|
266 | end else
|
---|
267 | raise Exception.Create(SUnsupportedTokenizerState);
|
---|
268 | end;
|
---|
269 | end;
|
---|
270 |
|
---|
271 | function TTokenizer.CheckNext(Text: string; Kind: TTokenKind): Boolean;
|
---|
272 | var
|
---|
273 | LastPos: TTokenizerPos;
|
---|
274 | Token: TToken;
|
---|
275 | begin
|
---|
276 | LastPos := Pos;
|
---|
277 | Token := GetNext;
|
---|
278 | Result := (Token.Text = Text) and (Token.Kind = Kind);
|
---|
279 | Pos := LastPos;
|
---|
280 | end;
|
---|
281 |
|
---|
282 | function TTokenizer.CheckNextAndRead(Text: string; Kind: TTokenKind): Boolean;
|
---|
283 | var
|
---|
284 | LastPos: TTokenizerPos;
|
---|
285 | Token: TToken;
|
---|
286 | begin
|
---|
287 | LastPos := Pos;
|
---|
288 | Token := GetNext;
|
---|
289 | Result := (Token.Text = Text) and (Token.Kind = Kind);
|
---|
290 | if not Result then Pos := LastPos;
|
---|
291 | end;
|
---|
292 |
|
---|
293 | function TTokenizer.CheckNextKind(Kind: TTokenKind): Boolean;
|
---|
294 | var
|
---|
295 | LastPos: TTokenizerPos;
|
---|
296 | Token: TToken;
|
---|
297 | begin
|
---|
298 | LastPos := Pos;
|
---|
299 | Token := GetNext;
|
---|
300 | Result := Token.Kind = Kind;
|
---|
301 | Pos := LastPos;
|
---|
302 | end;
|
---|
303 |
|
---|
304 | function TTokenizer.CheckNextKindAndRead(Kind: TTokenKind): Boolean;
|
---|
305 | var
|
---|
306 | LastPos: TTokenizerPos;
|
---|
307 | Token: TToken;
|
---|
308 | begin
|
---|
309 | LastPos := Pos;
|
---|
310 | Token := GetNext;
|
---|
311 | Result := Token.Kind = Kind;
|
---|
312 | if not Result then Pos := LastPos;
|
---|
313 | end;
|
---|
314 |
|
---|
315 | procedure TTokenizer.Expect(Text: string; Kind: TTokenKind);
|
---|
316 | var
|
---|
317 | Token: TToken;
|
---|
318 | begin
|
---|
319 | Token := GetNext;
|
---|
320 | if (Token.Text <> Text) or (Token.Kind <> Kind) then
|
---|
321 | Error(Format(SExpectedButFound, [Text, Token.Text]));
|
---|
322 | end;
|
---|
323 |
|
---|
324 | procedure TTokenizer.Error(Text: string);
|
---|
325 | begin
|
---|
326 | if Assigned(OnError) then
|
---|
327 | OnError(Pos.Pos, Text);
|
---|
328 | end;
|
---|
329 |
|
---|
330 | end.
|
---|
331 |
|
---|