1 #include <stdio.h> 2 #include <stdlib.h> 3 #include <string.h> 4 #if (defined(_WIN32) || defined(_MSDOS) || defined(__IBMC__)) 5 #include <io.h> 6 #else 7 #include <unistd.h> 8 #endif 9 #include "cpp.h" 10 /* 11 * lexical FSM encoding 12 * when in state state, and one of the characters 13 * in ch arrives, enter nextstate. 14 * States >= S_SELF are either final, or at least require special action. 15 * In 'fsm' there is a line for each state X charset X nextstate. 16 * List chars that overwrite previous entries later (e.g. C_ALPH 17 * can be overridden by '_' by a later entry; and C_XX is the 18 * the universal set, and should always be first. 19 * States above S_SELF are represented in the big table as negative values. 20 * S_SELF and S_SELFB encode the resulting token type in the upper bits. 21 * These actions differ in that S_SELF doesn't have a lookahead char, 22 * S_SELFB does. 23 * 24 * The encoding is blown out into a big table for time-efficiency. 25 * Entries have 26 * nextstate: 6 bits; ?\ marker: 1 bit; tokentype: 9 bits. 27 */ 28 29 #define MAXSTATE 32 30 #define ACT(tok,act) ((tok<<7)+act) 31 #define QBSBIT 0100 32 #define GETACT(st) ((st>>7)&0x1ff) 33 34 /* character classes */ 35 #define C_WS 1 36 #define C_ALPH 2 37 #define C_NUM 3 38 #define C_EOF 4 39 #define C_XX 5 40 41 enum state 42 { 43 START = 0, NUM1, NUM2, NUM3, ID1, ST1, ST2, ST3, COM1, COM2, COM3, COM4, 44 CC1, CC2, WS1, PLUS1, MINUS1, STAR1, SLASH1, PCT1, SHARP1, 45 CIRC1, GT1, GT2, LT1, LT2, OR1, AND1, ASG1, NOT1, DOTS1, 46 S_SELF = MAXSTATE, S_SELFB, S_EOF, S_NL, S_EOFSTR, 47 S_STNL, S_COMNL, S_EOFCOM, S_COMMENT, S_EOB, S_WS, S_NAME 48 }; 49 50 int tottok; 51 int tokkind[256]; 52 struct fsm 53 { 54 int state; /* if in this state */ 55 uchar ch[4]; /* and see one of these characters */ 56 int nextstate; /* enter this state if +ve */ 57 }; 58 59 /*const*/ struct fsm fsm[] = { 60 /* start state */ 61 {START, {C_XX}, ACT(UNCLASS, S_SELF)}, 62 {START, {' ', '\t', '\v'}, WS1}, 63 {START, {C_NUM}, NUM1}, 64 {START, {'.'}, NUM3}, 65 {START, {C_ALPH}, ID1}, 66 {START, {'L'}, ST1}, 67 {START, {'"'}, ST2}, 68 {START, {'\''}, CC1}, 69 {START, {'/'}, COM1}, 70 {START, {EOFC}, S_EOF}, 71 {START, {'\n'}, S_NL}, 72 {START, {'-'}, MINUS1}, 73 {START, {'+'}, PLUS1}, 74 {START, {'<'}, LT1}, 75 {START, {'>'}, GT1}, 76 {START, {'='}, ASG1}, 77 {START, {'!'}, NOT1}, 78 {START, {'&'}, AND1}, 79 {START, {'|'}, OR1}, 80 {START, {'#'}, SHARP1}, 81 {START, {'%'}, PCT1}, 82 {START, {'['}, ACT(SBRA, S_SELF)}, 83 {START, {']'}, ACT(SKET, S_SELF)}, 84 {START, {'('}, ACT(LP, S_SELF)}, 85 {START, {')'}, ACT(RP, S_SELF)}, 86 {START, {'*'}, STAR1}, 87 {START, {','}, ACT(COMMA, S_SELF)}, 88 {START, {'?'}, ACT(QUEST, S_SELF)}, 89 {START, {':'}, ACT(COLON, S_SELF)}, 90 {START, {';'}, ACT(SEMIC, S_SELF)}, 91 {START, {'{'}, ACT(CBRA, S_SELF)}, 92 {START, {'}'}, ACT(CKET, S_SELF)}, 93 {START, {'~'}, ACT(TILDE, S_SELF)}, 94 {START, {'^'}, CIRC1}, 95 96 /* saw a digit */ 97 {NUM1, {C_XX}, ACT(NUMBER, S_SELFB)}, 98 {NUM1, {C_NUM, C_ALPH, '.'}, NUM1}, 99 {NUM1, {'E', 'e'}, NUM2}, 100 {NUM1, {'_'}, ACT(NUMBER, S_SELFB)}, 101 102 /* saw possible start of exponent, digits-e */ 103 {NUM2, {C_XX}, ACT(NUMBER, S_SELFB)}, 104 {NUM2, {'+', '-'}, NUM1}, 105 {NUM2, {C_NUM, C_ALPH}, NUM1}, 106 {NUM2, {'_'}, ACT(NUMBER, S_SELFB)}, 107 108 /* saw a '.', which could be a number or an operator */ 109 {NUM3, {C_XX}, ACT(DOT, S_SELFB)}, 110 {NUM3, {'.'}, DOTS1}, 111 {NUM3, {C_NUM}, NUM1}, 112 113 {DOTS1, {C_XX}, ACT(UNCLASS, S_SELFB)}, 114 {DOTS1, {C_NUM}, NUM1}, 115 {DOTS1, {'.'}, ACT(ELLIPS, S_SELF)}, 116 117 /* saw a letter or _ */ 118 {ID1, {C_XX}, ACT(NAME, S_NAME)}, 119 {ID1, {C_ALPH, C_NUM}, ID1}, 120 121 /* saw L (start of wide string?) */ 122 {ST1, {C_XX}, ACT(NAME, S_NAME)}, 123 {ST1, {C_ALPH, C_NUM}, ID1}, 124 {ST1, {'"'}, ST2}, 125 {ST1, {'\''}, CC1}, 126 127 /* saw " beginning string */ 128 {ST2, {C_XX}, ST2}, 129 {ST2, {'"'}, ACT(STRING, S_SELF)}, 130 {ST2, {'\\'}, ST3}, 131 {ST2, {'\n'}, S_STNL}, 132 {ST2, {EOFC}, S_EOFSTR}, 133 134 /* saw \ in string */ 135 {ST3, {C_XX}, ST2}, 136 {ST3, {'\n'}, S_STNL}, 137 {ST3, {EOFC}, S_EOFSTR}, 138 139 /* saw ' beginning character const */ 140 {CC1, {C_XX}, CC1}, 141 {CC1, {'\''}, ACT(CCON, S_SELF)}, 142 {CC1, {'\\'}, CC2}, 143 {CC1, {'\n'}, S_STNL}, 144 {CC1, {EOFC}, S_EOFSTR}, 145 146 /* saw \ in ccon */ 147 {CC2, {C_XX}, CC1}, 148 {CC2, {'\n'}, S_STNL}, 149 {CC2, {EOFC}, S_EOFSTR}, 150 151 /* saw /, perhaps start of comment */ 152 {COM1, {C_XX}, ACT(SLASH, S_SELFB)}, 153 {COM1, {'='}, ACT(ASSLASH, S_SELF)}, 154 {COM1, {'*'}, COM2}, 155 {COM1, {'/'}, COM4}, 156 157 /* saw / followed by *, start of comment */ 158 {COM2, {C_XX}, COM2}, 159 {COM2, {'\n'}, S_COMNL}, 160 {COM2, {'*'}, COM3}, 161 {COM2, {EOFC}, S_EOFCOM}, 162 163 /* saw the * possibly ending a comment */ 164 {COM3, {C_XX}, COM2}, 165 {COM3, {'\n'}, S_COMNL}, 166 {COM3, {'*'}, COM3}, 167 {COM3, {'/'}, S_COMMENT}, 168 169 /* // comment */ 170 {COM4, {C_XX}, COM4}, 171 {COM4, {'\n'}, S_NL}, 172 {COM4, {EOFC}, S_EOFCOM}, 173 174 /* saw white space, eat it up */ 175 {WS1, {C_XX}, S_WS}, 176 {WS1, {'\t', '\v', ' '}, WS1}, 177 178 /* saw -, check --, -=, -> */ 179 {MINUS1, {C_XX}, ACT(MINUS, S_SELFB)}, 180 {MINUS1, {'-'}, ACT(MMINUS, S_SELF)}, 181 {MINUS1, {'='}, ACT(ASMINUS, S_SELF)}, 182 {MINUS1, {'>'}, ACT(ARROW, S_SELF)}, 183 184 /* saw +, check ++, += */ 185 {PLUS1, {C_XX}, ACT(PLUS, S_SELFB)}, 186 {PLUS1, {'+'}, ACT(PPLUS, S_SELF)}, 187 {PLUS1, {'='}, ACT(ASPLUS, S_SELF)}, 188 189 /* saw <, check <<, <<=, <= */ 190 {LT1, {C_XX}, ACT(LT, S_SELFB)}, 191 {LT1, {'<'}, LT2}, 192 {LT1, {'='}, ACT(LEQ, S_SELF)}, 193 {LT2, {C_XX}, ACT(LSH, S_SELFB)}, 194 {LT2, {'='}, ACT(ASLSH, S_SELF)}, 195 196 /* saw >, check >>, >>=, >= */ 197 {GT1, {C_XX}, ACT(GT, S_SELFB)}, 198 {GT1, {'>'}, GT2}, 199 {GT1, {'='}, ACT(GEQ, S_SELF)}, 200 {GT2, {C_XX}, ACT(RSH, S_SELFB)}, 201 {GT2, {'='}, ACT(ASRSH, S_SELF)}, 202 203 /* = */ 204 {ASG1, {C_XX}, ACT(ASGN, S_SELFB)}, 205 {ASG1, {'='}, ACT(EQ, S_SELF)}, 206 207 /* ! */ 208 {NOT1, {C_XX}, ACT(NOT, S_SELFB)}, 209 {NOT1, {'='}, ACT(NEQ, S_SELF)}, 210 211 /* & */ 212 {AND1, {C_XX}, ACT(AND, S_SELFB)}, 213 {AND1, {'&'}, ACT(LAND, S_SELF)}, 214 {AND1, {'='}, ACT(ASAND, S_SELF)}, 215 216 /* | */ 217 {OR1, {C_XX}, ACT(OR, S_SELFB)}, 218 {OR1, {'|'}, ACT(LOR, S_SELF)}, 219 {OR1, {'='}, ACT(ASOR, S_SELF)}, 220 221 /* # */ 222 {SHARP1, {C_XX}, ACT(SHARP, S_SELFB)}, 223 {SHARP1, {'#'}, ACT(DSHARP, S_SELF)}, 224 225 /* % */ 226 {PCT1, {C_XX}, ACT(PCT, S_SELFB)}, 227 {PCT1, {'='}, ACT(ASPCT, S_SELF)}, 228 229 /* * */ 230 {STAR1, {C_XX}, ACT(STAR, S_SELFB)}, 231 {STAR1, {'='}, ACT(ASSTAR, S_SELF)}, 232 233 /* ^ */ 234 {CIRC1, {C_XX}, ACT(CIRC, S_SELFB)}, 235 {CIRC1, {'='}, ACT(ASCIRC, S_SELF)}, 236 237 {-1, "", 0} 238 }; 239 240 /* first index is char, second is state */ 241 /* increase #states to power of 2 to encourage use of shift */ 242 short bigfsm[256][MAXSTATE]; 243 244 void 245 expandlex(void) 246 { 247 /* const */ struct fsm *fp; 248 int i, j, nstate; 249 250 for (fp = fsm; fp->state >= 0; fp++) 251 { 252 for (i = 0; fp->ch[i]; i++) 253 { 254 nstate = fp->nextstate; 255 if (nstate >= S_SELF) 256 nstate = ~nstate; 257 switch (fp->ch[i]) 258 { 259 260 case C_XX: /* random characters */ 261 for (j = 0; j < 256; j++) 262 bigfsm[j][fp->state] = (short) nstate; 263 continue; 264 case C_ALPH: 265 for (j = 0; j < 256; j++) 266 #ifdef S390 267 if( isalpha( j ) || (j == '_') ) 268 #else 269 if (('a' <= j && j <= 'z') || ('A' <= j && j <= 'Z') 270 || j == '_') 271 #endif 272 bigfsm[j][fp->state] = (short) nstate; 273 continue; 274 case C_NUM: 275 for (j = '0'; j <= '9'; j++) 276 bigfsm[j][fp->state] = (short) nstate; 277 continue; 278 default: 279 bigfsm[fp->ch[i]][fp->state] = (short) nstate; 280 } 281 } 282 } 283 284 /* 285 * install special cases for ? (trigraphs), \ (splicing), runes, and 286 * EOB 287 */ 288 for (i = 0; i < MAXSTATE; i++) 289 { 290 for (j = 0; j < 0xFF; j++) 291 if (j == '?' || j == '\\' || j == '\n' || j == '\r') 292 { 293 if (bigfsm[j][i] > 0) 294 bigfsm[j][i] = ~bigfsm[j][i]; 295 bigfsm[j][i] &= ~QBSBIT; 296 } 297 bigfsm[EOB][i] = ~S_EOB; 298 if (bigfsm[EOFC][i] >= 0) 299 bigfsm[EOFC][i] = ~S_EOF; 300 } 301 } 302 303 void 304 fixlex(void) 305 { 306 /* do C++ comments? */ 307 if ((Cplusplus == 0) || (Cflag != 0)) 308 bigfsm['/'][COM1] = bigfsm['x'][COM1]; 309 } 310 311 /* 312 * fill in a row of tokens from input, terminated by NL or END 313 * First token is put at trp->lp. 314 * Reset is non-zero when the input buffer can be "rewound." 315 * The value is a flag indicating that possible macros have 316 * been seen in the row. 317 */ 318 int 319 gettokens(Tokenrow * trp, int reset) 320 { 321 register int c, state, oldstate; 322 register uchar *ip; 323 register Token *tp, *maxp; 324 int runelen; 325 Source *s = cursource; 326 int nmac = 0; 327 328 tp = trp->lp; 329 ip = s->inp; 330 if (reset) 331 { 332 s->lineinc = 0; 333 if (ip >= s->inl) 334 { /* nothing in buffer */ 335 s->inl = s->inb; 336 fillbuf(s); 337 ip = s->inp = s->inb; 338 } 339 else 340 if (ip >= s->inb + (3 * INS / 4)) 341 { 342 memmove(s->inb, ip, 4 + s->inl - ip); 343 s->inl = s->inb + (s->inl - ip); 344 ip = s->inp = s->inb; 345 } 346 } 347 maxp = &trp->bp[trp->max]; 348 runelen = 1; 349 for (;;) 350 { 351 continue2: 352 if (tp >= maxp) 353 { 354 trp->lp = tp; 355 tp = growtokenrow(trp); 356 maxp = &trp->bp[trp->max]; 357 } 358 tp->type = UNCLASS; 359 tp->t = ip; 360 tp->wslen = 0; 361 tp->flag = 0; 362 state = START; 363 for (;;) 364 { 365 oldstate = state; 366 367 c = *ip; 368 369 if ((state = bigfsm[c][state]) >= 0) 370 { 371 ip += runelen; 372 runelen = 1; 373 continue; 374 } 375 state = ~state; 376 reswitch: 377 switch (state & 0177) 378 { 379 case S_SELF: 380 ip += runelen; 381 runelen = 1; 382 case S_SELFB: 383 tp->type = (unsigned char) GETACT(state); 384 tp->len = ip - tp->t; 385 tp++; 386 goto continue2; 387 388 case S_NAME: /* like S_SELFB but with nmac check */ 389 tp->type = NAME; 390 tp->len = ip - tp->t; 391 nmac |= quicklook(tp->t[0], tp->len > 1 ? tp->t[1] : 0); 392 tp++; 393 goto continue2; 394 395 case S_WS: 396 tp->wslen = ip - tp->t; 397 tp->t = ip; 398 state = START; 399 continue; 400 401 default: 402 if ((state & QBSBIT) == 0) 403 { 404 ip += runelen; 405 runelen = 1; 406 continue; 407 } 408 state &= ~QBSBIT; 409 s->inp = ip; 410 411 if (c == '\n') 412 { 413 while (s->inp + 1 >= s->inl && fillbuf(s) != EOF); 414 415 if (s->inp[1] == '\r') 416 { 417 memmove(s->inp + 1, s->inp + 2, s->inl - s->inp + 2); 418 s->inl -= 1; 419 } 420 421 goto reswitch; 422 } 423 424 if (c == '\r') 425 { 426 while (s->inp + 1 >= s->inl && fillbuf(s) != EOF); 427 428 if (s->inp[1] == '\n') 429 { 430 memmove(s->inp, s->inp + 1, s->inl - s->inp + 1); 431 s->inl -= 1; 432 } 433 else 434 *s->inp = '\n'; 435 436 state = oldstate; 437 continue; 438 } 439 440 if (c == '?') 441 { /* check trigraph */ 442 if (trigraph(s)) 443 { 444 state = oldstate; 445 continue; 446 } 447 goto reswitch; 448 } 449 if (c == '\\') 450 { /* line-folding */ 451 if (foldline(s)) 452 { 453 s->lineinc++; 454 state = oldstate; 455 continue; 456 } 457 goto reswitch; 458 } 459 error(WARNING, "Lexical botch in cpp"); 460 ip += runelen; 461 runelen = 1; 462 continue; 463 464 case S_EOB: 465 s->inp = ip; 466 fillbuf(cursource); 467 state = oldstate; 468 continue; 469 470 case S_EOF: 471 tp->type = END; 472 tp->len = 0; 473 s->inp = ip; 474 if (tp != trp->bp && (tp - 1)->type != NL && cursource->fd != -1) 475 error(WARNING, "No newline at end of file"); 476 trp->lp = tp + 1; 477 return nmac; 478 479 case S_STNL: 480 error(ERROR, "Unterminated string or char const"); 481 case S_NL: 482 tp->t = ip; 483 tp->type = NL; 484 tp->len = 1; 485 tp->wslen = 0; 486 s->lineinc++; 487 s->inp = ip + 1; 488 trp->lp = tp + 1; 489 return nmac; 490 491 case S_EOFSTR: 492 error(FATAL, "EOF in string or char constant"); 493 break; 494 495 case S_COMNL: 496 s->lineinc++; 497 state = COM2; 498 ip += runelen; 499 runelen = 1; 500 continue; 501 502 case S_EOFCOM: 503 error(WARNING, "EOF inside comment"); 504 --ip; 505 case S_COMMENT: 506 if (!Cflag) 507 { 508 tp->t = ++ip; 509 tp->t[-1] = ' '; 510 tp->wslen = 1; 511 state = START; 512 continue; 513 } 514 else 515 { 516 runelen = 1; 517 s->lineinc = 0;; 518 tp->type = COMMENT; 519 tp->flag |= XTWS; 520 } 521 } 522 break; 523 } 524 ip += runelen; 525 runelen = 1; 526 tp->len = ip - tp->t; 527 tp++; 528 } 529 } 530 531 /* have seen ?; handle the trigraph it starts (if any) else 0 */ 532 int 533 trigraph(Source * s) 534 { 535 uchar c; 536 537 while (s->inp + 2 >= s->inl && fillbuf(s) != EOF); 538 ; 539 if (s->inp[1] != '?') 540 return 0; 541 c = 0; 542 switch (s->inp[2]) 543 { 544 case '=': 545 c = '#'; 546 break; 547 case '(': 548 c = '['; 549 break; 550 case '/': 551 c = '\\'; 552 break; 553 case ')': 554 c = ']'; 555 break; 556 case '\'': 557 c = '^'; 558 break; 559 case '<': 560 c = '{'; 561 break; 562 case '!': 563 c = '|'; 564 break; 565 case '>': 566 c = '}'; 567 break; 568 case '-': 569 c = '~'; 570 break; 571 } 572 if (c) 573 { 574 *s->inp = c; 575 memmove(s->inp + 1, s->inp + 3, s->inl - s->inp + 2); 576 s->inl -= 2; 577 } 578 return c; 579 } 580 581 int 582 foldline(Source * s) 583 { 584 int n = 1; 585 586 /* skip pending wihite spaces */ 587 while ((s->inp[n] == ' ') || (s->inp[n] == '\t')) 588 { 589 n++; 590 if ((s->inp + n >= s->inl) && (fillbuf(s) == EOF)) 591 break; 592 } 593 594 /* refill buffer */ 595 while (s->inp + (n + 1) >= s->inl && fillbuf(s) != EOF); 596 597 /* skip DOS line ends */ 598 if (((s->inp[n] == '\r') && (s->inp[n+1] == '\n')) || 599 ((s->inp[n] == '\n') && (s->inp[n+1] == '\r'))) 600 n++; 601 602 if ((s->inp[n] == '\n') || (s->inp[n] == '\r')) 603 { 604 memmove(s->inp, s->inp + n + 1, s->inl - s->inp + n + 2); 605 s->inl -= n + 1; 606 return 1; 607 } 608 return 0; 609 } 610 611 int 612 fillbuf(Source * s) 613 { 614 int n; 615 616 if (s->fd < 0 || (n = read(s->fd, (char *) s->inl, INS / 8)) <= 0) 617 n = 0; 618 s->inl += n; 619 s->inl[0] = s->inl[1] = s->inl[2] = s->inl[3] = EOB; 620 if (n == 0) 621 { 622 s->inl[0] = s->inl[1] = s->inl[2] = s->inl[3] = EOFC; 623 return EOF; 624 } 625 return 0; 626 } 627 628 /* 629 * Push down to new source of characters. 630 * If fd>0 and str==NULL, then from a file `name'; 631 * if fd==-1 and str, then from the string. 632 */ 633 Source * 634 setsource(char *name, int path, int fd, char *str, int wrap) 635 { 636 Source *s = new(Source); 637 int len; 638 639 s->line = 1; 640 s->lineinc = 0; 641 s->fd = fd; 642 s->filename = name; 643 s->next = cursource; 644 s->ifdepth = 0; 645 s->pathdepth = path; 646 s->wrap = wrap; 647 648 cursource = s; 649 650 if (s->wrap) 651 genwrap(0); 652 653 /* slop at right for EOB */ 654 if (str) 655 { 656 len = strlen(str); 657 s->inb = domalloc(len + 4); 658 s->inp = s->inb; 659 strncpy((char *) s->inp, str, len); 660 } 661 else 662 { 663 s->inb = domalloc(INS + 4); 664 s->inp = s->inb; 665 len = 0; 666 } 667 s->inl = s->inp + len; 668 s->inl[0] = s->inl[1] = EOB; 669 670 return s; 671 } 672 673 void 674 unsetsource(void) 675 { 676 Source *s = cursource; 677 678 if (s->wrap) 679 genwrap(1); 680 681 if (s->fd >= 0) 682 { 683 close(s->fd); 684 dofree(s->inb); 685 } 686 cursource = s->next; 687 dofree(s); 688 } 689