00001
00002
00003
00004
00005
00006
00007
00008
00009 #include "comma/parser/Lexer.h"
00010 #include <iostream>
00011 #include <cstring>
00012
00013 using namespace comma;
00014
00015 Lexer::Lexer(TextProvider &txtProvider, Diagnostic &diag)
00016 : txtProvider(txtProvider),
00017 diagnostic(diag),
00018 currentIter(txtProvider.begin()),
00019 errorDetected(false)
00020 { }
00021
00022 const char *Lexer::Token::getString() const
00023 {
00024 return Lexer::tokenString(*this);
00025 }
00026
00027 const char *Lexer::tokenString(const Code code)
00028 {
00029 const char *result;
00030
00031 switch (code) {
00032 default:
00033 result = 0;
00034 break;
00035
00036 #define KEYWORD(NAME, STRING) case TKN_ ## NAME: result = STRING; break;
00037 #define GLYPH(NAME, STRING) case TKN_ ## NAME: result = STRING; break;
00038 #include "comma/parser/Tokens.def"
00039 #undef KEYWORD
00040 #undef GLYPH
00041 }
00042
00043 return result;
00044 }
00045
00046 const char *Lexer::tokenString(const Token &token)
00047 {
00048 const char *result;
00049 Code code = token.getCode();
00050
00051 switch (code) {
00052 default:
00053 result = tokenString(code);
00054 break;
00055
00056 case TKN_IDENTIFIER:
00057 case TKN_INTEGER:
00058 case TKN_STRING:
00059 result = token.getRep();
00060 break;
00061 }
00062
00063 return result;
00064 }
00065
00066 bool Lexer::isDecimalDigit(unsigned c)
00067 {
00068 return ('0' <= c && c <= '9');
00069 }
00070
00071 bool Lexer::isInitialIdentifierChar(unsigned c)
00072 {
00073 if (('a' <= c && c <= 'z') ||
00074 ('A' <= c && c <= 'Z') ||
00075 (c == '%') || (c == '_'))
00076 return true;
00077
00078 return false;
00079 }
00080
00081 bool Lexer::isInnerIdentifierChar(unsigned c)
00082 {
00083 return isInitialIdentifierChar(c) || isDecimalDigit(c) || c == '?';
00084 }
00085
00086 bool Lexer::isWhitespace(unsigned c)
00087 {
00088 return (c == ' ') || (c == '\t') || (c == '\n');
00089 }
00090
00091 Location Lexer::currentLocation() const
00092 {
00093 return txtProvider.getLocation(currentIter);
00094 }
00095
00096
00097
00098 unsigned Lexer::readStream()
00099 {
00100 unsigned c = *currentIter;
00101 ++currentIter;
00102
00103
00104
00105 switch (c) {
00106
00107 case '\r':
00108 if (*currentIter == '\n')
00109 ++currentIter;
00110 case '\n':
00111 return '\n';
00112 }
00113
00114 return c;
00115 }
00116
00117 unsigned Lexer::peekStream()
00118 {
00119 unsigned c = *currentIter;
00120
00121 if (c == '\r')
00122 return '\n';
00123
00124 return c;
00125 }
00126
00127 void Lexer::ungetStream()
00128 {
00129 --currentIter;
00130 }
00131
00132 void Lexer::ignoreStream()
00133 {
00134 readStream();
00135 }
00136
00137 bool Lexer::eatComment()
00138 {
00139 unsigned c = peekStream();
00140
00141 if (c == '-') {
00142 ignoreStream();
00143 if (peekStream() == '-') {
00144
00145
00146 for (;;) {
00147 c = readStream();
00148 if (c == '\n' || c == 0)
00149 return true;
00150 }
00151 }
00152 else {
00153 ungetStream();
00154 return false;
00155 }
00156 }
00157 return false;
00158 }
00159
00160 bool Lexer::eatWhitespace()
00161 {
00162 unsigned c = peekStream();
00163
00164 if (isWhitespace(c)) {
00165 do {
00166 ignoreStream();
00167 } while (isWhitespace(c = peekStream()));
00168 return true;
00169 }
00170 return false;
00171 }
00172
00173 void Lexer::emitToken(Code code,
00174 const TextIterator &start, const TextIterator &end)
00175 {
00176 Location loc = txtProvider.getLocation(start);
00177 const char *string = &start;
00178 unsigned length = &end - &start;
00179 *targetToken = Token(code, loc, string, length);
00180 }
00181
00182 void Lexer::emitToken(Code code, Location loc)
00183 {
00184 *targetToken = Token(code, loc, 0, 0);
00185 }
00186
00187 void Lexer::emitStringToken(const TextIterator &start, const TextIterator &end)
00188 {
00189 emitToken(TKN_STRING, start, end);
00190 }
00191
00192 void Lexer::emitIntegerToken(const TextIterator &start, const TextIterator &end)
00193 {
00194 emitToken(TKN_INTEGER, start, end);
00195 }
00196
00197 void Lexer::emitIdentifierToken(const TextIterator &start, const TextIterator &end)
00198 {
00199 emitToken(TKN_IDENTIFIER, start, end);
00200 }
00201
00202 Lexer::Code Lexer::getTokenCode(TextIterator &start, TextIterator &end) const
00203 {
00204 Code code = UNUSED_ID;
00205 const char *str = &start;
00206 unsigned length = &end - &start;
00207
00208 switch (length) {
00209 case 1:
00210 if (strncmp(str, "%", length) == 0)
00211 code = TKN_PERCENT;
00212 break;
00213
00214 case 2:
00215 if (strncmp(str, "is", length) == 0)
00216 code = TKN_IS;
00217 else if (strncmp(str, "if", length) == 0)
00218 code = TKN_IS;
00219 break;
00220
00221 case 3:
00222 if (strncmp(str, "end", length) == 0)
00223 code = TKN_END;
00224 else if (strncmp(str, "add", length) == 0)
00225 code = TKN_ADD;
00226 break;
00227
00228 case 4:
00229 if (strncmp(str, "else", length) == 0)
00230 code = TKN_ELSE;
00231 else if (strncmp(str, "then", length) == 0)
00232 code = TKN_THEN;
00233 else if (strncmp(str, "with", length) == 0)
00234 code = TKN_WITH;
00235 break;
00236
00237 case 5:
00238 if (strncmp(str, "begin", length) == 0)
00239 code = TKN_BEGIN;
00240 else if (strncmp(str, "elsif", length) == 0)
00241 code = TKN_ELSIF;
00242 else if (strncmp(str, "while", length) == 0)
00243 code = TKN_WHILE;
00244 break;
00245
00246 case 6:
00247 if (strncmp(str, "domain", length) == 0)
00248 code = TKN_DOMAIN;
00249 else if (strncmp(str, "module", length) == 0)
00250 code = TKN_MODULE;
00251 else if (strncmp(str, "repeat", length) == 0)
00252 code = TKN_REPEAT;
00253 else if (strncmp(str, "return", length) == 0)
00254 code = TKN_RETURN;
00255 break;
00256
00257 case 8:
00258 if (strncmp(str, "function", length) == 0)
00259 code = TKN_FUNCTION;
00260 break;
00261
00262 case 9:
00263 if (strncmp(str, "signature", length) == 0)
00264 code = TKN_SIGNATURE;
00265 break;
00266 }
00267 return code;
00268 }
00269
00270 bool Lexer::scanWord()
00271 {
00272 TextIterator start = currentIter;
00273 unsigned c = peekStream();
00274
00275 if (isInitialIdentifierChar(c)) {
00276 Code code;
00277
00278 do {
00279 ignoreStream();
00280 } while (isInnerIdentifierChar(c = peekStream()));
00281
00282 code = getTokenCode(start, currentIter);
00283
00284 if (code == UNUSED_ID)
00285 emitIdentifierToken(start, currentIter);
00286 else
00287 emitToken(code, txtProvider.getLocation(start));
00288 return true;
00289 }
00290 return false;
00291 }
00292
00293 bool Lexer::scanGlyph()
00294 {
00295 Location loc = currentLocation();
00296 unsigned c = readStream();
00297 Code code = UNUSED_ID;
00298
00299 switch (c) {
00300 case '(':
00301 code = TKN_LPAREN;
00302 break;
00303
00304 case ')':
00305 code = TKN_RPAREN;
00306 break;
00307
00308 case ';':
00309 code = TKN_SEMI;
00310 break;
00311
00312 case '.':
00313 code = TKN_DOT;
00314 break;
00315
00316 case ':':
00317 switch (peekStream()) {
00318 case '=':
00319 ignoreStream();
00320 code = TKN_ASSIGN;
00321 break;
00322
00323 case ':':
00324 ignoreStream();
00325 code = TKN_DCOLON;
00326 break;
00327
00328 default:
00329 code = TKN_COLON;
00330 }
00331 break;
00332
00333 case ',':
00334 code = TKN_COMMA;
00335 break;
00336
00337 case '=':
00338 switch (peekStream()) {
00339 default:
00340 code = TKN_EQUAL;
00341 break;
00342
00343 case '>':
00344 ignoreStream();
00345 code = TKN_RDARROW;
00346 break;
00347 }
00348 break;
00349
00350 case '[':
00351 code = TKN_LBRACK;
00352 break;
00353
00354 case ']':
00355 code = TKN_RBRACK;
00356 break;
00357
00358 case '{':
00359 code = TKN_LBRACE;
00360 break;
00361
00362 case '}':
00363 code = TKN_RBRACE;
00364 break;
00365
00366 case '+':
00367 code = TKN_PLUS;
00368 break;
00369
00370 case '-':
00371 code = TKN_MINUS;
00372 break;
00373
00374 case '*':
00375 code = TKN_STAR;
00376 break;
00377
00378 case '~':
00379 switch (peekStream()) {
00380 case '=':
00381 ignoreStream();
00382 code = TKN_NEQUAL;
00383 break;
00384
00385 default:
00386 code = TKN_TILDE;
00387 }
00388 break;
00389 }
00390
00391 if (code == UNUSED_ID) {
00392 ungetStream();
00393 return false;
00394 }
00395
00396 emitToken(code, loc);
00397 return true;
00398 }
00399
00400 bool Lexer::scanEscape()
00401 {
00402 Location loc = currentLocation();
00403 unsigned c;
00404
00405 switch (c = readStream()) {
00406 case '\\': break;
00407 case '"' : break;
00408 case '\'': break;
00409 case 't' : break;
00410 case 'n' : break;
00411 case 'r' : break;
00412 case 'b' : break;
00413
00414 case 0:
00415
00416
00417 errorDetected = true;
00418 ungetStream();
00419 return false;
00420
00421 default:
00422
00423 report(loc, diag::ILLEGAL_ESCAPE) << (char)c;
00424 errorDetected = true;
00425 return false;
00426 }
00427 return true;
00428 }
00429
00430 bool Lexer::scanString()
00431 {
00432 TextIterator start = currentIter;
00433 Location loc = currentLocation();
00434 unsigned c;
00435
00436 if (peekStream() == '"') {
00437 ignoreStream();
00438
00439 for (;;) {
00440 switch (c = readStream()) {
00441 case '\\':
00442
00443
00444 scanEscape();
00445 break;
00446
00447 case 0:
00448
00449
00450 report(loc, diag::UNTERMINATED_STRING);
00451 errorDetected = true;
00452 emitStringToken(start, currentIter);
00453 return true;
00454
00455 case '\n':
00456
00457 report(loc, diag::NEWLINE_IN_STRING_LIT);
00458 errorDetected = true;
00459 emitStringToken(start, currentIter);
00460 return true;
00461
00462 case '"':
00463
00464 emitStringToken(start, currentIter);
00465 return true;
00466 }
00467 }
00468 }
00469 return false;
00470 }
00471
00472 bool Lexer::scanNumeric()
00473 {
00474 Location loc = currentLocation();
00475 TextIterator start = currentIter;
00476 unsigned c = peekStream();
00477
00478 if (isDecimalDigit(c)) {
00479 ignoreStream();
00480
00481
00482
00483
00484 if (c == '0' && isDecimalDigit(peekStream())) {
00485 report(loc, diag::LEADING_ZERO_IN_INTEGER_LIT);
00486 errorDetected = true;
00487
00488 while (peekStream() == '0') ignoreStream();
00489
00490
00491
00492 if (!isDecimalDigit(peekStream())) {
00493 TextIterator end = start;
00494 emitIntegerToken(start, ++end);
00495 return true;
00496 }
00497 else c = readStream();
00498 }
00499
00500 for (;;) {
00501 c = readStream();
00502
00503 if (isDecimalDigit(c) || c == '_')
00504 continue;
00505 else {
00506 ungetStream();
00507 break;
00508 }
00509 }
00510 emitIntegerToken(start, currentIter);
00511 return true;
00512 }
00513 return false;
00514 }
00515
00516 void Lexer::scan(Token &tkn)
00517 {
00518 targetToken = &tkn;
00519
00520 for (;;) {
00521 eatWhitespace();
00522 while (eatComment()) eatWhitespace();
00523
00524 if (peekStream() == 0) {
00525 emitToken(TKN_EOT, Location());
00526 return;
00527 }
00528
00529 if (scanWord()) return;
00530 if (scanGlyph()) return;
00531 if (scanString()) return;
00532 if (scanNumeric()) return;
00533
00534
00535
00536 report(diag::INVALID_CHARACTER) << (char)readStream();
00537 errorDetected = true;
00538 continue;
00539 }
00540 }