| 1 | unit Tokenizer;
|
|---|
| 2 |
|
|---|
| 3 | interface
|
|---|
| 4 |
|
|---|
| 5 | uses
|
|---|
| 6 | Classes, SysUtils;
|
|---|
| 7 |
|
|---|
| 8 | type
|
|---|
| 9 | { TTokenizerPos }
|
|---|
| 10 |
|
|---|
| 11 | TTokenizerPos = record
|
|---|
| 12 | Index: Integer;
|
|---|
| 13 | Pos: TPoint;
|
|---|
| 14 | procedure Increment;
|
|---|
| 15 | procedure IncrementLine;
|
|---|
| 16 | end;
|
|---|
| 17 |
|
|---|
| 18 | TTokenKind = (tkUnknown, tkIdentifier, tkSpecialSymbol, tkNumber, tkString,
|
|---|
| 19 | tkKeyword);
|
|---|
| 20 |
|
|---|
| 21 | TToken = record
|
|---|
| 22 | Kind: TTokenKind;
|
|---|
| 23 | Text: string;
|
|---|
| 24 | function Create(Kind: TTokenKind; Text: string): TToken;
|
|---|
| 25 | end;
|
|---|
| 26 |
|
|---|
| 27 | TErrorEvent = procedure (Pos: TPoint; Text: string) of object;
|
|---|
| 28 |
|
|---|
| 29 | TTokenizerState = (tsNone, tsIdentifier, tsString, tsStringEnd, tsNumber,
|
|---|
| 30 | tsSpecialSymbol, tsLineComment);
|
|---|
| 31 |
|
|---|
| 32 | { TTokenizer }
|
|---|
| 33 |
|
|---|
| 34 | TTokenizer = class
|
|---|
| 35 | private
|
|---|
| 36 | FOnError: TErrorEvent;
|
|---|
| 37 | State: TTokenizerState;
|
|---|
| 38 | public
|
|---|
| 39 | Pos: TTokenizerPos;
|
|---|
| 40 | Source: string;
|
|---|
| 41 | function IsAlpha(C: Char): Boolean;
|
|---|
| 42 | function IsNumeric(C: Char): Boolean;
|
|---|
| 43 | function IsAlphaNumeric(C: Char): Boolean;
|
|---|
| 44 | function IsWhiteSpace(C: Char): Boolean;
|
|---|
| 45 | function IsSpecialSymbol(C: Char): Boolean;
|
|---|
| 46 | function IsSpecialSymbol2(Text: string): Boolean;
|
|---|
| 47 | function IsIdentifier(Text: string): Boolean;
|
|---|
| 48 | function IsOperator(Text: string): Boolean;
|
|---|
| 49 | function IsKeyword(Text: string): Boolean;
|
|---|
| 50 | procedure Init;
|
|---|
| 51 | function GetNext: TToken;
|
|---|
| 52 | function CheckNext(Text: string; Kind: TTokenKind): Boolean;
|
|---|
| 53 | function CheckNextAndRead(Text: string; Kind: TTokenKind): Boolean;
|
|---|
| 54 | function CheckNextKind(Kind: TTokenKind): Boolean;
|
|---|
| 55 | function CheckNextKindAndRead(Kind: TTokenKind): Boolean;
|
|---|
| 56 | procedure Expect(Text: string; Kind: TTokenKind);
|
|---|
| 57 | procedure Error(Text: string);
|
|---|
| 58 | property OnError: TErrorEvent read FOnError write FOnError;
|
|---|
| 59 | end;
|
|---|
| 60 |
|
|---|
| 61 |
|
|---|
| 62 | implementation
|
|---|
| 63 |
|
|---|
| 64 | resourcestring
|
|---|
| 65 | SUnknownToken = 'Unknown token %s';
|
|---|
| 66 | SUnsupportedTokenizerState = 'Unsupported tokenizer state.';
|
|---|
| 67 | SExpectedButFound = 'Expected %s but %s found.';
|
|---|
| 68 |
|
|---|
| 69 | { TToken }
|
|---|
| 70 |
|
|---|
| 71 | function TToken.Create(Kind: TTokenKind; Text: string): TToken;
|
|---|
| 72 | begin
|
|---|
| 73 | Result.Kind := Kind;
|
|---|
| 74 | Result.Text := Text;
|
|---|
| 75 | end;
|
|---|
| 76 |
|
|---|
| 77 | { TTokenizerPos }
|
|---|
| 78 |
|
|---|
| 79 | procedure TTokenizerPos.Increment;
|
|---|
| 80 | begin
|
|---|
| 81 | Inc(Index);
|
|---|
| 82 | Inc(Pos.X);
|
|---|
| 83 | end;
|
|---|
| 84 |
|
|---|
| 85 | procedure TTokenizerPos.IncrementLine;
|
|---|
| 86 | begin
|
|---|
| 87 | Inc(Pos.Y);
|
|---|
| 88 | Pos.X := 1;
|
|---|
| 89 | Inc(Index);
|
|---|
| 90 | end;
|
|---|
| 91 |
|
|---|
| 92 | { TTokenizer }
|
|---|
| 93 |
|
|---|
| 94 | function TTokenizer.IsAlpha(C: Char): Boolean;
|
|---|
| 95 | begin
|
|---|
| 96 | Result := (C in ['A'..'Z']) or (C in ['a'..'z'])
|
|---|
| 97 | end;
|
|---|
| 98 |
|
|---|
| 99 | function TTokenizer.IsNumeric(C: Char): Boolean;
|
|---|
| 100 | begin
|
|---|
| 101 | Result := C in ['0'..'9'];
|
|---|
| 102 | end;
|
|---|
| 103 |
|
|---|
| 104 | function TTokenizer.IsAlphaNumeric(C: Char): Boolean;
|
|---|
| 105 | begin
|
|---|
| 106 | Result := IsAlpha(C) or IsNumeric(C);
|
|---|
| 107 | end;
|
|---|
| 108 |
|
|---|
| 109 | function TTokenizer.IsWhiteSpace(C: Char): Boolean;
|
|---|
| 110 | begin
|
|---|
| 111 | Result := (C = ' ') or (C = #9);
|
|---|
| 112 | end;
|
|---|
| 113 |
|
|---|
| 114 | function TTokenizer.IsSpecialSymbol(C: Char): Boolean;
|
|---|
| 115 | begin
|
|---|
| 116 | Result := (C = ';') or (C = '.') or (C = '(') or (C = ')') or (C = '=') or
|
|---|
| 117 | (C = ':') or (C = '+') or (C = '-') or (C = ',') or (C = '/') or
|
|---|
| 118 | (C = '<') or (C = '>') or (C = '*');
|
|---|
| 119 | end;
|
|---|
| 120 |
|
|---|
| 121 | function TTokenizer.IsSpecialSymbol2(Text: string): Boolean;
|
|---|
| 122 | begin
|
|---|
| 123 | Result := (Text = ':=') or (Text = '//') or (Text = '<>') or (Text = '<=') or
|
|---|
| 124 | (Text = '>=');
|
|---|
| 125 | end;
|
|---|
| 126 |
|
|---|
| 127 | function TTokenizer.IsIdentifier(Text: string): Boolean;
|
|---|
| 128 | var
|
|---|
| 129 | I: Integer;
|
|---|
| 130 | begin
|
|---|
| 131 | Result := True;
|
|---|
| 132 | for I := 1 to Length(Text) do begin
|
|---|
| 133 | if (I = 1) then begin
|
|---|
| 134 | if not IsAlpha(Text[I]) then begin
|
|---|
| 135 | Result := False;
|
|---|
| 136 | Break;
|
|---|
| 137 | end;
|
|---|
| 138 | end else begin
|
|---|
| 139 | if not IsAlphaNumeric(Text[I]) then begin
|
|---|
| 140 | Result := False;
|
|---|
| 141 | Break;
|
|---|
| 142 | end;
|
|---|
| 143 | end;
|
|---|
| 144 | end;
|
|---|
| 145 | end;
|
|---|
| 146 |
|
|---|
| 147 | function TTokenizer.IsOperator(Text: string): Boolean;
|
|---|
| 148 | begin
|
|---|
| 149 | Result := (Text = '+') or (Text = '-') or (Text = '=') or (Text = '<>') or
|
|---|
| 150 | (Text = '*') or (Text = '/') or (Text = 'div') or (Text = '<=') or
|
|---|
| 151 | (Text = '>=') or (Text = 'mod') or (Text = 'shl') or (Text = 'shr') or
|
|---|
| 152 | (Text = 'and') or (Text = 'or') or (Text = 'xor') or (Text = 'not') or
|
|---|
| 153 | (Text = '>') or (Text = '<');
|
|---|
| 154 | end;
|
|---|
| 155 |
|
|---|
| 156 | function TTokenizer.IsKeyword(Text: string): Boolean;
|
|---|
| 157 | begin
|
|---|
| 158 | Result := (Text = 'begin') or (Text = 'end') or (Text = 'program') or
|
|---|
| 159 | (Text = 'var') or (Text = 'const') or (Text = 'if') or (Text = 'then') or
|
|---|
| 160 | (Text = 'else') or (Text = 'while') or (Text = 'do') or (Text = 'for') or
|
|---|
| 161 | (Text = 'to') or (Text = 'repeat') or (Text = 'until') or (Text = 'break') or
|
|---|
| 162 | (Text = 'continue') or (Text = 'function') or (Text = 'procedure');
|
|---|
| 163 | end;
|
|---|
| 164 |
|
|---|
| 165 | procedure TTokenizer.Init;
|
|---|
| 166 | begin
|
|---|
| 167 | Pos.Index := 1;
|
|---|
| 168 | Pos.Pos := Point(1, 1);
|
|---|
| 169 | end;
|
|---|
| 170 |
|
|---|
| 171 | function TTokenizer.GetNext: TToken;
|
|---|
| 172 | var
|
|---|
| 173 | C: Char;
|
|---|
| 174 | begin
|
|---|
| 175 | State := tsNone;
|
|---|
| 176 | Result.Text := '';
|
|---|
| 177 | Result.Kind := tkUnknown;
|
|---|
| 178 | while Pos.Index < Length(Source) do begin
|
|---|
| 179 | C := Source[Pos.Index];
|
|---|
| 180 | if State = tsNone then begin
|
|---|
| 181 | if IsAlpha(C) then begin
|
|---|
| 182 | State := tsIdentifier;
|
|---|
| 183 | Result.Kind := tkIdentifier;
|
|---|
| 184 | Result.Text := C;
|
|---|
| 185 | Pos.Increment;
|
|---|
| 186 | end else
|
|---|
| 187 | if IsWhiteSpace(C) then begin
|
|---|
| 188 | Pos.Increment;
|
|---|
| 189 | end else
|
|---|
| 190 | if C = #10 then begin
|
|---|
| 191 | Pos.IncrementLine;
|
|---|
| 192 | end else
|
|---|
| 193 | if IsSpecialSymbol(C) then begin
|
|---|
| 194 | Result.Kind := tkSpecialSymbol;
|
|---|
| 195 | Result.Text := C;
|
|---|
| 196 | Pos.Increment;
|
|---|
| 197 | State := tsSpecialSymbol;
|
|---|
| 198 | end else
|
|---|
| 199 | if C = '''' then begin
|
|---|
| 200 | Pos.Increment;
|
|---|
| 201 | Result.Kind := tkString;
|
|---|
| 202 | State := tsString;
|
|---|
| 203 | end else
|
|---|
| 204 | if IsNumeric(C) then begin
|
|---|
| 205 | State := tsNumber;
|
|---|
| 206 | Result.Kind := tkNumber;
|
|---|
| 207 | Result.Text := C;
|
|---|
| 208 | Pos.Increment;
|
|---|
| 209 | end else begin
|
|---|
| 210 | Error(Format(SUnknownToken, [C]));
|
|---|
| 211 | Break;
|
|---|
| 212 | end;
|
|---|
| 213 | end else
|
|---|
| 214 | if State = tsIdentifier then begin
|
|---|
| 215 | if IsAlphanumeric(C) then begin
|
|---|
| 216 | Result.Text := Result.Text + C;
|
|---|
| 217 | Pos.Increment;
|
|---|
| 218 | end else begin
|
|---|
| 219 | if IsKeyword(Result.Text) then
|
|---|
| 220 | Result.Kind := tkKeyword;
|
|---|
| 221 | Break;
|
|---|
| 222 | end;
|
|---|
| 223 | end else
|
|---|
| 224 | if State = tsNumber then begin
|
|---|
| 225 | if IsNumeric(C) then begin
|
|---|
| 226 | Result.Text := Result.Text + C;
|
|---|
| 227 | Pos.Increment;
|
|---|
| 228 | end else begin
|
|---|
| 229 | Break;
|
|---|
| 230 | end;
|
|---|
| 231 | end else
|
|---|
| 232 | if State = tsString then begin
|
|---|
| 233 | if C = '''' then begin
|
|---|
| 234 | State := tsStringEnd;
|
|---|
| 235 | Pos.Increment;
|
|---|
| 236 | end else begin
|
|---|
| 237 | Result.Text := Result.Text + C;
|
|---|
| 238 | Pos.Increment;
|
|---|
| 239 | end;
|
|---|
| 240 | end else
|
|---|
| 241 | if State = tsStringEnd then begin
|
|---|
| 242 | if C = '''' then begin
|
|---|
| 243 | State := tsString;
|
|---|
| 244 | Result.Text := Result.Text + C;
|
|---|
| 245 | Pos.Increment;
|
|---|
| 246 | end else begin
|
|---|
| 247 | Break;
|
|---|
| 248 | end;
|
|---|
| 249 | end else
|
|---|
| 250 | if State = tsLineComment then begin
|
|---|
| 251 | if C = #10 then begin
|
|---|
| 252 | State := tsNone;
|
|---|
| 253 | end else Pos.Increment;
|
|---|
| 254 | end else
|
|---|
| 255 | if State = tsSpecialSymbol then begin
|
|---|
| 256 | if IsSpecialSymbol2(Result.Text + C) then begin
|
|---|
| 257 | Result.Text := Result.Text + C;
|
|---|
| 258 | Pos.Increment;
|
|---|
| 259 | if Result.Text = '//' then begin
|
|---|
| 260 | Result.Text := '';
|
|---|
| 261 | State := tsLineComment;
|
|---|
| 262 | end else Break;
|
|---|
| 263 | end else begin
|
|---|
| 264 | Break;
|
|---|
| 265 | end;
|
|---|
| 266 | end else
|
|---|
| 267 | raise Exception.Create(SUnsupportedTokenizerState);
|
|---|
| 268 | end;
|
|---|
| 269 | end;
|
|---|
| 270 |
|
|---|
| 271 | function TTokenizer.CheckNext(Text: string; Kind: TTokenKind): Boolean;
|
|---|
| 272 | var
|
|---|
| 273 | LastPos: TTokenizerPos;
|
|---|
| 274 | Token: TToken;
|
|---|
| 275 | begin
|
|---|
| 276 | LastPos := Pos;
|
|---|
| 277 | Token := GetNext;
|
|---|
| 278 | Result := (Token.Text = Text) and (Token.Kind = Kind);
|
|---|
| 279 | Pos := LastPos;
|
|---|
| 280 | end;
|
|---|
| 281 |
|
|---|
| 282 | function TTokenizer.CheckNextAndRead(Text: string; Kind: TTokenKind): Boolean;
|
|---|
| 283 | var
|
|---|
| 284 | LastPos: TTokenizerPos;
|
|---|
| 285 | Token: TToken;
|
|---|
| 286 | begin
|
|---|
| 287 | LastPos := Pos;
|
|---|
| 288 | Token := GetNext;
|
|---|
| 289 | Result := (Token.Text = Text) and (Token.Kind = Kind);
|
|---|
| 290 | if not Result then Pos := LastPos;
|
|---|
| 291 | end;
|
|---|
| 292 |
|
|---|
| 293 | function TTokenizer.CheckNextKind(Kind: TTokenKind): Boolean;
|
|---|
| 294 | var
|
|---|
| 295 | LastPos: TTokenizerPos;
|
|---|
| 296 | Token: TToken;
|
|---|
| 297 | begin
|
|---|
| 298 | LastPos := Pos;
|
|---|
| 299 | Token := GetNext;
|
|---|
| 300 | Result := Token.Kind = Kind;
|
|---|
| 301 | Pos := LastPos;
|
|---|
| 302 | end;
|
|---|
| 303 |
|
|---|
| 304 | function TTokenizer.CheckNextKindAndRead(Kind: TTokenKind): Boolean;
|
|---|
| 305 | var
|
|---|
| 306 | LastPos: TTokenizerPos;
|
|---|
| 307 | Token: TToken;
|
|---|
| 308 | begin
|
|---|
| 309 | LastPos := Pos;
|
|---|
| 310 | Token := GetNext;
|
|---|
| 311 | Result := Token.Kind = Kind;
|
|---|
| 312 | if not Result then Pos := LastPos;
|
|---|
| 313 | end;
|
|---|
| 314 |
|
|---|
| 315 | procedure TTokenizer.Expect(Text: string; Kind: TTokenKind);
|
|---|
| 316 | var
|
|---|
| 317 | Token: TToken;
|
|---|
| 318 | begin
|
|---|
| 319 | Token := GetNext;
|
|---|
| 320 | if (Token.Text <> Text) or (Token.Kind <> Kind) then
|
|---|
| 321 | Error(Format(SExpectedButFound, [Text, Token.Text]));
|
|---|
| 322 | end;
|
|---|
| 323 |
|
|---|
| 324 | procedure TTokenizer.Error(Text: string);
|
|---|
| 325 | begin
|
|---|
| 326 | if Assigned(OnError) then
|
|---|
| 327 | OnError(Pos.Pos, Text);
|
|---|
| 328 | end;
|
|---|
| 329 |
|
|---|
| 330 | end.
|
|---|
| 331 |
|
|---|