1 /************************************************************** 2 * 3 * Licensed to the Apache Software Foundation (ASF) under one 4 * or more contributor license agreements. See the NOTICE file 5 * distributed with this work for additional information 6 * regarding copyright ownership. The ASF licenses this file 7 * to you under the Apache License, Version 2.0 (the 8 * "License"); you may not use this file except in compliance 9 * with the License. You may obtain a copy of the License at 10 * 11 * http://www.apache.org/licenses/LICENSE-2.0 12 * 13 * Unless required by applicable law or agreed to in writing, 14 * software distributed under the License is distributed on an 15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 * KIND, either express or implied. See the License for the 17 * specific language governing permissions and limitations 18 * under the License. 19 * 20 *************************************************************/ 21 22 #include <stdio.h> 23 #include <stdlib.h> 24 #include <string.h> 25 #if (defined(_WIN32) || defined(_MSDOS) || defined(__IBMC__)) 26 #include <io.h> 27 #else 28 #include <unistd.h> 29 #endif 30 #include "cpp.h" 31 /* 32 * lexical FSM encoding 33 * when in state state, and one of the characters 34 * in ch arrives, enter nextstate. 35 * States >= S_SELF are either final, or at least require special action. 36 * In 'fsm' there is a line for each state X charset X nextstate. 37 * List chars that overwrite previous entries later (e.g. C_ALPH 38 * can be overridden by '_' by a later entry; and C_XX is the 39 * the universal set, and should always be first. 40 * States above S_SELF are represented in the big table as negative values. 41 * S_SELF and S_SELFB encode the resulting token type in the upper bits. 42 * These actions differ in that S_SELF doesn't have a lookahead char, 43 * S_SELFB does. 44 * 45 * The encoding is blown out into a big table for time-efficiency. 46 * Entries have 47 * nextstate: 6 bits; ?\ marker: 1 bit; tokentype: 9 bits. 48 */ 49 50 #define MAXSTATE 32 51 #define ACT(tok,act) ((tok<<7)+act) 52 #define QBSBIT 0100 53 #define GETACT(st) ((st>>7)&0x1ff) 54 55 /* character classes */ 56 #define C_WS 1 57 #define C_ALPH 2 58 #define C_NUM 3 59 #define C_EOF 4 60 #define C_XX 5 61 62 enum state 63 { 64 START = 0, NUM1, NUM2, NUM3, ID1, ST1, ST2, ST3, COM1, COM2, COM3, COM4, 65 CC1, CC2, WS1, PLUS1, MINUS1, STAR1, SLASH1, PCT1, SHARP1, 66 CIRC1, GT1, GT2, LT1, LT2, OR1, AND1, ASG1, NOT1, DOTS1, 67 S_SELF = MAXSTATE, S_SELFB, S_EOF, S_NL, S_EOFSTR, 68 S_STNL, S_COMNL, S_EOFCOM, S_COMMENT, S_EOB, S_WS, S_NAME 69 }; 70 71 int tottok; 72 int tokkind[256]; 73 struct fsm 74 { 75 int state; /* if in this state */ 76 uchar ch[4]; /* and see one of these characters */ 77 int nextstate; /* enter this state if +ve */ 78 }; 79 80 /*const*/ struct fsm fsm[] = { 81 /* start state */ 82 {START, {C_XX}, ACT(UNCLASS, S_SELF)}, 83 {START, {' ', '\t', '\v'}, WS1}, 84 {START, {C_NUM}, NUM1}, 85 {START, {'.'}, NUM3}, 86 {START, {C_ALPH}, ID1}, 87 {START, {'L'}, ST1}, 88 {START, {'"'}, ST2}, 89 {START, {'\''}, CC1}, 90 {START, {'/'}, COM1}, 91 {START, {EOFC}, S_EOF}, 92 {START, {'\n'}, S_NL}, 93 {START, {'-'}, MINUS1}, 94 {START, {'+'}, PLUS1}, 95 {START, {'<'}, LT1}, 96 {START, {'>'}, GT1}, 97 {START, {'='}, ASG1}, 98 {START, {'!'}, NOT1}, 99 {START, {'&'}, AND1}, 100 {START, {'|'}, OR1}, 101 {START, {'#'}, SHARP1}, 102 {START, {'%'}, PCT1}, 103 {START, {'['}, ACT(SBRA, S_SELF)}, 104 {START, {']'}, ACT(SKET, S_SELF)}, 105 {START, {'('}, ACT(LP, S_SELF)}, 106 {START, {')'}, ACT(RP, S_SELF)}, 107 {START, {'*'}, STAR1}, 108 {START, {','}, ACT(COMMA, S_SELF)}, 109 {START, {'?'}, ACT(QUEST, S_SELF)}, 110 {START, {':'}, ACT(COLON, S_SELF)}, 111 {START, {';'}, ACT(SEMIC, S_SELF)}, 112 {START, {'{'}, ACT(CBRA, S_SELF)}, 113 {START, {'}'}, ACT(CKET, S_SELF)}, 114 {START, {'~'}, ACT(TILDE, S_SELF)}, 115 {START, {'^'}, CIRC1}, 116 117 /* saw a digit */ 118 {NUM1, {C_XX}, ACT(NUMBER, S_SELFB)}, 119 {NUM1, {C_NUM, C_ALPH, '.'}, NUM1}, 120 {NUM1, {'E', 'e'}, NUM2}, 121 {NUM1, {'_'}, ACT(NUMBER, S_SELFB)}, 122 123 /* saw possible start of exponent, digits-e */ 124 {NUM2, {C_XX}, ACT(NUMBER, S_SELFB)}, 125 {NUM2, {'+', '-'}, NUM1}, 126 {NUM2, {C_NUM, C_ALPH}, NUM1}, 127 {NUM2, {'_'}, ACT(NUMBER, S_SELFB)}, 128 129 /* saw a '.', which could be a number or an operator */ 130 {NUM3, {C_XX}, ACT(DOT, S_SELFB)}, 131 {NUM3, {'.'}, DOTS1}, 132 {NUM3, {C_NUM}, NUM1}, 133 134 {DOTS1, {C_XX}, ACT(UNCLASS, S_SELFB)}, 135 {DOTS1, {C_NUM}, NUM1}, 136 {DOTS1, {'.'}, ACT(ELLIPS, S_SELF)}, 137 138 /* saw a letter or _ */ 139 {ID1, {C_XX}, ACT(NAME, S_NAME)}, 140 {ID1, {C_ALPH, C_NUM}, ID1}, 141 142 /* saw L (start of wide string?) */ 143 {ST1, {C_XX}, ACT(NAME, S_NAME)}, 144 {ST1, {C_ALPH, C_NUM}, ID1}, 145 {ST1, {'"'}, ST2}, 146 {ST1, {'\''}, CC1}, 147 148 /* saw " beginning string */ 149 {ST2, {C_XX}, ST2}, 150 {ST2, {'"'}, ACT(STRING, S_SELF)}, 151 {ST2, {'\\'}, ST3}, 152 {ST2, {'\n'}, S_STNL}, 153 {ST2, {EOFC}, S_EOFSTR}, 154 155 /* saw \ in string */ 156 {ST3, {C_XX}, ST2}, 157 {ST3, {'\n'}, S_STNL}, 158 {ST3, {EOFC}, S_EOFSTR}, 159 160 /* saw ' beginning character const */ 161 {CC1, {C_XX}, CC1}, 162 {CC1, {'\''}, ACT(CCON, S_SELF)}, 163 {CC1, {'\\'}, CC2}, 164 {CC1, {'\n'}, S_STNL}, 165 {CC1, {EOFC}, S_EOFSTR}, 166 167 /* saw \ in ccon */ 168 {CC2, {C_XX}, CC1}, 169 {CC2, {'\n'}, S_STNL}, 170 {CC2, {EOFC}, S_EOFSTR}, 171 172 /* saw /, perhaps start of comment */ 173 {COM1, {C_XX}, ACT(SLASH, S_SELFB)}, 174 {COM1, {'='}, ACT(ASSLASH, S_SELF)}, 175 {COM1, {'*'}, COM2}, 176 {COM1, {'/'}, COM4}, 177 178 /* saw / followed by *, start of comment */ 179 {COM2, {C_XX}, COM2}, 180 {COM2, {'\n'}, S_COMNL}, 181 {COM2, {'*'}, COM3}, 182 {COM2, {EOFC}, S_EOFCOM}, 183 184 /* saw the * possibly ending a comment */ 185 {COM3, {C_XX}, COM2}, 186 {COM3, {'\n'}, S_COMNL}, 187 {COM3, {'*'}, COM3}, 188 {COM3, {'/'}, S_COMMENT}, 189 190 /* // comment */ 191 {COM4, {C_XX}, COM4}, 192 {COM4, {'\n'}, S_NL}, 193 {COM4, {EOFC}, S_EOFCOM}, 194 195 /* saw white space, eat it up */ 196 {WS1, {C_XX}, S_WS}, 197 {WS1, {'\t', '\v', ' '}, WS1}, 198 199 /* saw -, check --, -=, -> */ 200 {MINUS1, {C_XX}, ACT(MINUS, S_SELFB)}, 201 {MINUS1, {'-'}, ACT(MMINUS, S_SELF)}, 202 {MINUS1, {'='}, ACT(ASMINUS, S_SELF)}, 203 {MINUS1, {'>'}, ACT(ARROW, S_SELF)}, 204 205 /* saw +, check ++, += */ 206 {PLUS1, {C_XX}, ACT(PLUS, S_SELFB)}, 207 {PLUS1, {'+'}, ACT(PPLUS, S_SELF)}, 208 {PLUS1, {'='}, ACT(ASPLUS, S_SELF)}, 209 210 /* saw <, check <<, <<=, <= */ 211 {LT1, {C_XX}, ACT(LT, S_SELFB)}, 212 {LT1, {'<'}, LT2}, 213 {LT1, {'='}, ACT(LEQ, S_SELF)}, 214 {LT2, {C_XX}, ACT(LSH, S_SELFB)}, 215 {LT2, {'='}, ACT(ASLSH, S_SELF)}, 216 217 /* saw >, check >>, >>=, >= */ 218 {GT1, {C_XX}, ACT(GT, S_SELFB)}, 219 {GT1, {'>'}, GT2}, 220 {GT1, {'='}, ACT(GEQ, S_SELF)}, 221 {GT2, {C_XX}, ACT(RSH, S_SELFB)}, 222 {GT2, {'='}, ACT(ASRSH, S_SELF)}, 223 224 /* = */ 225 {ASG1, {C_XX}, ACT(ASGN, S_SELFB)}, 226 {ASG1, {'='}, ACT(EQ, S_SELF)}, 227 228 /* ! */ 229 {NOT1, {C_XX}, ACT(NOT, S_SELFB)}, 230 {NOT1, {'='}, ACT(NEQ, S_SELF)}, 231 232 /* & */ 233 {AND1, {C_XX}, ACT(AND, S_SELFB)}, 234 {AND1, {'&'}, ACT(LAND, S_SELF)}, 235 {AND1, {'='}, ACT(ASAND, S_SELF)}, 236 237 /* | */ 238 {OR1, {C_XX}, ACT(OR, S_SELFB)}, 239 {OR1, {'|'}, ACT(LOR, S_SELF)}, 240 {OR1, {'='}, ACT(ASOR, S_SELF)}, 241 242 /* # */ 243 {SHARP1, {C_XX}, ACT(SHARP, S_SELFB)}, 244 {SHARP1, {'#'}, ACT(DSHARP, S_SELF)}, 245 246 /* % */ 247 {PCT1, {C_XX}, ACT(PCT, S_SELFB)}, 248 {PCT1, {'='}, ACT(ASPCT, S_SELF)}, 249 250 /* * */ 251 {STAR1, {C_XX}, ACT(STAR, S_SELFB)}, 252 {STAR1, {'='}, ACT(ASSTAR, S_SELF)}, 253 254 /* ^ */ 255 {CIRC1, {C_XX}, ACT(CIRC, S_SELFB)}, 256 {CIRC1, {'='}, ACT(ASCIRC, S_SELF)}, 257 258 {-1, "", 0} 259 }; 260 261 /* first index is char, second is state */ 262 /* increase #states to power of 2 to encourage use of shift */ 263 short bigfsm[256][MAXSTATE]; 264 265 void 266 expandlex(void) 267 { 268 /* const */ struct fsm *fp; 269 int i, j, nstate; 270 271 for (fp = fsm; fp->state >= 0; fp++) 272 { 273 for (i = 0; fp->ch[i]; i++) 274 { 275 nstate = fp->nextstate; 276 if (nstate >= S_SELF) 277 nstate = ~nstate; 278 switch (fp->ch[i]) 279 { 280 281 case C_XX: /* random characters */ 282 for (j = 0; j < 256; j++) 283 bigfsm[j][fp->state] = (short) nstate; 284 continue; 285 case C_ALPH: 286 for (j = 0; j < 256; j++) 287 #ifdef S390 288 if( isalpha( j ) || (j == '_') ) 289 #else 290 if (('a' <= j && j <= 'z') || ('A' <= j && j <= 'Z') 291 || j == '_') 292 #endif 293 bigfsm[j][fp->state] = (short) nstate; 294 continue; 295 case C_NUM: 296 for (j = '0'; j <= '9'; j++) 297 bigfsm[j][fp->state] = (short) nstate; 298 continue; 299 default: 300 bigfsm[fp->ch[i]][fp->state] = (short) nstate; 301 } 302 } 303 } 304 305 /* 306 * install special cases for ? (trigraphs), \ (splicing), runes, and 307 * EOB 308 */ 309 for (i = 0; i < MAXSTATE; i++) 310 { 311 for (j = 0; j < 0xFF; j++) 312 if (j == '?' || j == '\\' || j == '\n' || j == '\r') 313 { 314 if (bigfsm[j][i] > 0) 315 bigfsm[j][i] = ~bigfsm[j][i]; 316 bigfsm[j][i] &= ~QBSBIT; 317 } 318 bigfsm[EOB][i] = ~S_EOB; 319 if (bigfsm[EOFC][i] >= 0) 320 bigfsm[EOFC][i] = ~S_EOF; 321 } 322 } 323 324 void 325 fixlex(void) 326 { 327 /* do C++ comments? */ 328 if ((Cplusplus == 0) || (Cflag != 0)) 329 bigfsm['/'][COM1] = bigfsm['x'][COM1]; 330 } 331 332 /* 333 * fill in a row of tokens from input, terminated by NL or END 334 * First token is put at trp->lp. 335 * Reset is non-zero when the input buffer can be "rewound." 336 * The value is a flag indicating that possible macros have 337 * been seen in the row. 338 */ 339 int 340 gettokens(Tokenrow * trp, int reset) 341 { 342 register int c, state, oldstate; 343 register uchar *ip; 344 register Token *tp, *maxp; 345 int runelen; 346 Source *s = cursource; 347 int nmac = 0; 348 349 tp = trp->lp; 350 ip = s->inp; 351 if (reset) 352 { 353 s->lineinc = 0; 354 if (ip >= s->inl) 355 { /* nothing in buffer */ 356 s->inl = s->inb; 357 fillbuf(s); 358 ip = s->inp = s->inb; 359 } 360 else 361 if (ip >= s->inb + (3 * INS / 4)) 362 { 363 memmove(s->inb, ip, 4 + s->inl - ip); 364 s->inl = s->inb + (s->inl - ip); 365 ip = s->inp = s->inb; 366 } 367 } 368 maxp = &trp->bp[trp->max]; 369 runelen = 1; 370 for (;;) 371 { 372 continue2: 373 if (tp >= maxp) 374 { 375 trp->lp = tp; 376 tp = growtokenrow(trp); 377 maxp = &trp->bp[trp->max]; 378 } 379 tp->type = UNCLASS; 380 tp->t = ip; 381 tp->wslen = 0; 382 tp->flag = 0; 383 state = START; 384 for (;;) 385 { 386 oldstate = state; 387 388 c = *ip; 389 390 if ((state = bigfsm[c][state]) >= 0) 391 { 392 ip += runelen; 393 runelen = 1; 394 continue; 395 } 396 state = ~state; 397 reswitch: 398 switch (state & 0177) 399 { 400 case S_SELF: 401 ip += runelen; 402 runelen = 1; 403 case S_SELFB: 404 tp->type = (unsigned char) GETACT(state); 405 tp->len = ip - tp->t; 406 tp++; 407 goto continue2; 408 409 case S_NAME: /* like S_SELFB but with nmac check */ 410 tp->type = NAME; 411 tp->len = ip - tp->t; 412 nmac |= quicklook(tp->t[0], tp->len > 1 ? tp->t[1] : 0); 413 tp++; 414 goto continue2; 415 416 case S_WS: 417 tp->wslen = ip - tp->t; 418 tp->t = ip; 419 state = START; 420 continue; 421 422 default: 423 if ((state & QBSBIT) == 0) 424 { 425 ip += runelen; 426 runelen = 1; 427 continue; 428 } 429 state &= ~QBSBIT; 430 s->inp = ip; 431 432 if (c == '\n') 433 { 434 while (s->inp + 1 >= s->inl && fillbuf(s) != EOF); 435 436 if (s->inp[1] == '\r') 437 { 438 memmove(s->inp + 1, s->inp + 2, s->inl - s->inp + 2); 439 s->inl -= 1; 440 } 441 442 goto reswitch; 443 } 444 445 if (c == '\r') 446 { 447 while (s->inp + 1 >= s->inl && fillbuf(s) != EOF); 448 449 if (s->inp[1] == '\n') 450 { 451 memmove(s->inp, s->inp + 1, s->inl - s->inp + 1); 452 s->inl -= 1; 453 } 454 else 455 *s->inp = '\n'; 456 457 state = oldstate; 458 continue; 459 } 460 461 if (c == '?') 462 { /* check trigraph */ 463 if (trigraph(s)) 464 { 465 state = oldstate; 466 continue; 467 } 468 goto reswitch; 469 } 470 if (c == '\\') 471 { /* line-folding */ 472 if (foldline(s)) 473 { 474 s->lineinc++; 475 state = oldstate; 476 continue; 477 } 478 goto reswitch; 479 } 480 error(WARNING, "Lexical botch in cpp"); 481 ip += runelen; 482 runelen = 1; 483 continue; 484 485 case S_EOB: 486 s->inp = ip; 487 fillbuf(cursource); 488 state = oldstate; 489 continue; 490 491 case S_EOF: 492 tp->type = END; 493 tp->len = 0; 494 s->inp = ip; 495 if (tp != trp->bp && (tp - 1)->type != NL && cursource->fd != -1) 496 error(WARNING, "No newline at end of file"); 497 trp->lp = tp + 1; 498 return nmac; 499 500 case S_STNL: 501 error(ERROR, "Unterminated string or char const"); 502 case S_NL: 503 tp->t = ip; 504 tp->type = NL; 505 tp->len = 1; 506 tp->wslen = 0; 507 s->lineinc++; 508 s->inp = ip + 1; 509 trp->lp = tp + 1; 510 return nmac; 511 512 case S_EOFSTR: 513 error(FATAL, "EOF in string or char constant"); 514 break; 515 516 case S_COMNL: 517 s->lineinc++; 518 state = COM2; 519 ip += runelen; 520 runelen = 1; 521 continue; 522 523 case S_EOFCOM: 524 error(WARNING, "EOF inside comment"); 525 --ip; 526 case S_COMMENT: 527 if (!Cflag) 528 { 529 tp->t = ++ip; 530 tp->t[-1] = ' '; 531 tp->wslen = 1; 532 state = START; 533 continue; 534 } 535 else 536 { 537 runelen = 1; 538 s->lineinc = 0;; 539 tp->type = COMMENT; 540 tp->flag |= XTWS; 541 } 542 } 543 break; 544 } 545 ip += runelen; 546 runelen = 1; 547 tp->len = ip - tp->t; 548 tp++; 549 } 550 } 551 552 /* have seen ?; handle the trigraph it starts (if any) else 0 */ 553 int 554 trigraph(Source * s) 555 { 556 uchar c; 557 558 while (s->inp + 2 >= s->inl && fillbuf(s) != EOF); 559 ; 560 if (s->inp[1] != '?') 561 return 0; 562 c = 0; 563 switch (s->inp[2]) 564 { 565 case '=': 566 c = '#'; 567 break; 568 case '(': 569 c = '['; 570 break; 571 case '/': 572 c = '\\'; 573 break; 574 case ')': 575 c = ']'; 576 break; 577 case '\'': 578 c = '^'; 579 break; 580 case '<': 581 c = '{'; 582 break; 583 case '!': 584 c = '|'; 585 break; 586 case '>': 587 c = '}'; 588 break; 589 case '-': 590 c = '~'; 591 break; 592 } 593 if (c) 594 { 595 *s->inp = c; 596 memmove(s->inp + 1, s->inp + 3, s->inl - s->inp + 2); 597 s->inl -= 2; 598 } 599 return c; 600 } 601 602 int 603 foldline(Source * s) 604 { 605 int n = 1; 606 607 /* skip pending wihite spaces */ 608 while ((s->inp[n] == ' ') || (s->inp[n] == '\t')) 609 { 610 n++; 611 if ((s->inp + n >= s->inl) && (fillbuf(s) == EOF)) 612 break; 613 } 614 615 /* refill buffer */ 616 while (s->inp + (n + 1) >= s->inl && fillbuf(s) != EOF); 617 618 /* skip DOS line ends */ 619 if (((s->inp[n] == '\r') && (s->inp[n+1] == '\n')) || 620 ((s->inp[n] == '\n') && (s->inp[n+1] == '\r'))) 621 n++; 622 623 if ((s->inp[n] == '\n') || (s->inp[n] == '\r')) 624 { 625 memmove(s->inp, s->inp + n + 1, s->inl - s->inp + n + 2); 626 s->inl -= n + 1; 627 return 1; 628 } 629 return 0; 630 } 631 632 int 633 fillbuf(Source * s) 634 { 635 int n; 636 637 if (s->fd < 0 || (n = read(s->fd, (char *) s->inl, INS / 8)) <= 0) 638 n = 0; 639 s->inl += n; 640 s->inl[0] = s->inl[1] = s->inl[2] = s->inl[3] = EOB; 641 if (n == 0) 642 { 643 s->inl[0] = s->inl[1] = s->inl[2] = s->inl[3] = EOFC; 644 return EOF; 645 } 646 return 0; 647 } 648 649 /* 650 * Push down to new source of characters. 651 * If fd>0 and str==NULL, then from a file `name'; 652 * if fd==-1 and str, then from the string. 653 */ 654 Source * 655 setsource(char *name, int path, int fd, char *str, int wrap) 656 { 657 Source *s = new(Source); 658 int len; 659 660 s->line = 1; 661 s->lineinc = 0; 662 s->fd = fd; 663 s->filename = name; 664 s->next = cursource; 665 s->ifdepth = 0; 666 s->pathdepth = path; 667 s->wrap = wrap; 668 669 cursource = s; 670 671 if (s->wrap) 672 genwrap(0); 673 674 /* slop at right for EOB */ 675 if (str) 676 { 677 len = strlen(str); 678 s->inb = domalloc(len + 4); 679 s->inp = s->inb; 680 strncpy((char *) s->inp, str, len); 681 } 682 else 683 { 684 s->inb = domalloc(INS + 4); 685 s->inp = s->inb; 686 len = 0; 687 } 688 s->inl = s->inp + len; 689 s->inl[0] = s->inl[1] = EOB; 690 691 return s; 692 } 693 694 void 695 unsetsource(void) 696 { 697 Source *s = cursource; 698 699 if (s->wrap) 700 genwrap(1); 701 702 if (s->fd >= 0) 703 { 704 close(s->fd); 705 dofree(s->inb); 706 } 707 cursource = s->next; 708 dofree(s); 709 } 710