xref: /trunk/main/soltools/cpp/_lex.c (revision cdf0e10c)
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4 #if (defined(_WIN32) || defined(_MSDOS) || defined(__IBMC__))
5 #include <io.h>
6 #else
7 #include <unistd.h>
8 #endif
9 #include "cpp.h"
10 /*
11  * lexical FSM encoding
12  *   when in state state, and one of the characters
13  *   in ch arrives, enter nextstate.
14  *   States >= S_SELF are either final, or at least require special action.
15  *   In 'fsm' there is a line for each state X charset X nextstate.
 *   List chars that overwrite previous entries later (e.g. C_ALPH
 *   can be overridden by '_' by a later entry); C_XX is the
 *   universal set, and should always be first.
19  *   States above S_SELF are represented in the big table as negative values.
20  *   S_SELF and S_SELFB encode the resulting token type in the upper bits.
21  *   These actions differ in that S_SELF doesn't have a lookahead char,
22  *   S_SELFB does.
23  *
24  *   The encoding is blown out into a big table for time-efficiency.
25  *   Entries have
26  *      nextstate: 6 bits; ?\ marker: 1 bit; tokentype: 9 bits.
27  */
28 
/*
 * Encoding helpers for the transition table.
 *
 *   MAXSTATE  number of state columns in bigfsm; also the value of the
 *             first action code (S_SELF).
 *   ACT       pack a token type (bits 7 and up) together with an action
 *             code (low 7 bits) into one table value.
 *   QBSBIT    marker bit (octal 0100, i.e. bit 6) tested by gettokens()
 *             to detect characters that need special handling.
 *   GETACT    recover the 9-bit token type from a packed value.
 *
 * Arguments are fully parenthesised so that expression arguments
 * (e.g. ACT(a | b, x) or GETACT(p & q)) expand with the intended
 * precedence.
 */
#define	MAXSTATE		32
#define	ACT(tok,act)	(((tok) << 7) + (act))
#define	QBSBIT			0100
#define	GETACT(st)		(((st) >> 7) & 0x1ff)
33 
/*
 * Character classes.  These small values are used in place of real
 * characters in the 'ch' list of an fsm rule; expandlex() expands C_XX,
 * C_ALPH and C_NUM into the matching character ranges.  C_WS and C_EOF
 * are not referenced by the code visible in this file.
 */
enum
{
    C_WS = 1,
    C_ALPH = 2,                         /* letters and '_' */
    C_NUM = 3,                          /* digits '0'..'9' */
    C_EOF = 4,
    C_XX = 5                            /* every character (0..255) */
};
40 
41 enum state
42 {
43     START = 0, NUM1, NUM2, NUM3, ID1, ST1, ST2, ST3, COM1, COM2, COM3, COM4,
44     CC1, CC2, WS1, PLUS1, MINUS1, STAR1, SLASH1, PCT1, SHARP1,
45     CIRC1, GT1, GT2, LT1, LT2, OR1, AND1, ASG1, NOT1, DOTS1,
46     S_SELF = MAXSTATE, S_SELFB, S_EOF, S_NL, S_EOFSTR,
47     S_STNL, S_COMNL, S_EOFCOM, S_COMMENT, S_EOB, S_WS, S_NAME
48 };
49 
/*
 * File-scope objects with external linkage.  Neither is read or written
 * by the code visible in this file (presumably leftovers or used by
 * another translation unit -- TODO confirm).
 */
int tottok;
int tokkind[256];
/*
 * One rule of the compact lexer description: "in 'state', on any of the
 * characters in 'ch', go to 'nextstate'".  expandlex() walks 'ch' up to
 * the first zero byte, so a rule can list at most three characters or
 * character classes followed by an implicit terminator.
 */
struct fsm
{
    int state;                          /* if in this state */
    uchar ch[4];                        /* and see one of these characters */
    int nextstate;                      /* enter this state if +ve; values >= S_SELF are actions (possibly packed with ACT) */
};
58 
 /*
  * Compact description of the lexer automaton.  expandlex() replays these
  * rules in order into bigfsm, so within one state a later rule overrides
  * an earlier one: every state starts with its C_XX (any character)
  * default and then lists the exceptions.  The list is terminated by a
  * rule whose state is negative.
  */
 /*const*/ struct fsm fsm[] = {
    /* start state */
		 {START, {C_XX}, ACT(UNCLASS, S_SELF)},
		 {START, {' ', '\t', '\v'}, WS1},
		 {START, {C_NUM}, NUM1},
		 {START, {'.'}, NUM3},
		 {START, {C_ALPH}, ID1},
		 {START, {'L'}, ST1},
		 {START, {'"'}, ST2},
		 {START, {'\''}, CC1},
		 {START, {'/'}, COM1},
		 {START, {EOFC}, S_EOF},
		 {START, {'\n'}, S_NL},
		 {START, {'-'}, MINUS1},
		 {START, {'+'}, PLUS1},
		 {START, {'<'}, LT1},
		 {START, {'>'}, GT1},
		 {START, {'='}, ASG1},
		 {START, {'!'}, NOT1},
		 {START, {'&'}, AND1},
		 {START, {'|'}, OR1},
		 {START, {'#'}, SHARP1},
		 {START, {'%'}, PCT1},
		 {START, {'['}, ACT(SBRA, S_SELF)},
		 {START, {']'}, ACT(SKET, S_SELF)},
		 {START, {'('}, ACT(LP, S_SELF)},
		 {START, {')'}, ACT(RP, S_SELF)},
		 {START, {'*'}, STAR1},
		 {START, {','}, ACT(COMMA, S_SELF)},
		 {START, {'?'}, ACT(QUEST, S_SELF)},
		 {START, {':'}, ACT(COLON, S_SELF)},
		 {START, {';'}, ACT(SEMIC, S_SELF)},
		 {START, {'{'}, ACT(CBRA, S_SELF)},
		 {START, {'}'}, ACT(CKET, S_SELF)},
		 {START, {'~'}, ACT(TILDE, S_SELF)},
		 {START, {'^'}, CIRC1},

    /* saw a digit; '_' is excluded again after C_ALPH included it */
		 {NUM1, {C_XX}, ACT(NUMBER, S_SELFB)},
		 {NUM1, {C_NUM, C_ALPH, '.'}, NUM1},
		 {NUM1, {'E', 'e'}, NUM2},
		 {NUM1, {'_'}, ACT(NUMBER, S_SELFB)},

    /* saw possible start of exponent, digits-e */
		 {NUM2, {C_XX}, ACT(NUMBER, S_SELFB)},
		 {NUM2, {'+', '-'}, NUM1},
		 {NUM2, {C_NUM, C_ALPH}, NUM1},
		 {NUM2, {'_'}, ACT(NUMBER, S_SELFB)},

    /* saw a '.', which could be a number or an operator */
		 {NUM3, {C_XX}, ACT(DOT, S_SELFB)},
		 {NUM3, {'.'}, DOTS1},
		 {NUM3, {C_NUM}, NUM1},

    /* saw "..": a third '.' makes an ellipsis */
		 {DOTS1, {C_XX}, ACT(UNCLASS, S_SELFB)},
		 {DOTS1, {C_NUM}, NUM1},
		 {DOTS1, {'.'}, ACT(ELLIPS, S_SELF)},

    /* saw a letter or _ */
		 {ID1, {C_XX}, ACT(NAME, S_NAME)},
		 {ID1, {C_ALPH, C_NUM}, ID1},

    /* saw L (start of wide string?) */
		 {ST1, {C_XX}, ACT(NAME, S_NAME)},
		 {ST1, {C_ALPH, C_NUM}, ID1},
		 {ST1, {'"'}, ST2},
		 {ST1, {'\''}, CC1},

    /* saw " beginning string */
		 {ST2, {C_XX}, ST2},
		 {ST2, {'"'}, ACT(STRING, S_SELF)},
		 {ST2, {'\\'}, ST3},
		 {ST2, {'\n'}, S_STNL},
		 {ST2, {EOFC}, S_EOFSTR},

    /* saw \ in string */
		 {ST3, {C_XX}, ST2},
		 {ST3, {'\n'}, S_STNL},
		 {ST3, {EOFC}, S_EOFSTR},

    /* saw ' beginning character const */
		 {CC1, {C_XX}, CC1},
		 {CC1, {'\''}, ACT(CCON, S_SELF)},
		 {CC1, {'\\'}, CC2},
		 {CC1, {'\n'}, S_STNL},
		 {CC1, {EOFC}, S_EOFSTR},

    /* saw \ in ccon */
		 {CC2, {C_XX}, CC1},
		 {CC2, {'\n'}, S_STNL},
		 {CC2, {EOFC}, S_EOFSTR},

    /* saw /, perhaps start of comment */
		 {COM1, {C_XX}, ACT(SLASH, S_SELFB)},
		 {COM1, {'='}, ACT(ASSLASH, S_SELF)},
		 {COM1, {'*'}, COM2},
		 {COM1, {'/'}, COM4},

    /* saw / followed by *, start of comment */
		 {COM2, {C_XX}, COM2},
		 {COM2, {'\n'}, S_COMNL},
		 {COM2, {'*'}, COM3},
		 {COM2, {EOFC}, S_EOFCOM},

    /* saw the * possibly ending a comment */
		 {COM3, {C_XX}, COM2},
		 {COM3, {'\n'}, S_COMNL},
		 {COM3, {'*'}, COM3},
		 {COM3, {'/'}, S_COMMENT},

    /* // comment; fixlex() may disable the COM1 -> COM4 transition */
		 {COM4, {C_XX}, COM4},
		 {COM4, {'\n'}, S_NL},
		 {COM4, {EOFC}, S_EOFCOM},

    /* saw white space, eat it up */
		 {WS1, {C_XX}, S_WS},
		 {WS1, {'\t', '\v', ' '}, WS1},

    /* saw -, check --, -=, -> */
		 {MINUS1, {C_XX}, ACT(MINUS, S_SELFB)},
		 {MINUS1, {'-'}, ACT(MMINUS, S_SELF)},
		 {MINUS1, {'='}, ACT(ASMINUS, S_SELF)},
		 {MINUS1, {'>'}, ACT(ARROW, S_SELF)},

    /* saw +, check ++, += */
		 {PLUS1, {C_XX}, ACT(PLUS, S_SELFB)},
		 {PLUS1, {'+'}, ACT(PPLUS, S_SELF)},
		 {PLUS1, {'='}, ACT(ASPLUS, S_SELF)},

    /* saw <, check <<, <<=, <= */
		 {LT1, {C_XX}, ACT(LT, S_SELFB)},
		 {LT1, {'<'}, LT2},
		 {LT1, {'='}, ACT(LEQ, S_SELF)},
		 {LT2, {C_XX}, ACT(LSH, S_SELFB)},
		 {LT2, {'='}, ACT(ASLSH, S_SELF)},

    /* saw >, check >>, >>=, >= */
		 {GT1, {C_XX}, ACT(GT, S_SELFB)},
		 {GT1, {'>'}, GT2},
		 {GT1, {'='}, ACT(GEQ, S_SELF)},
		 {GT2, {C_XX}, ACT(RSH, S_SELFB)},
		 {GT2, {'='}, ACT(ASRSH, S_SELF)},

    /* = */
		 {ASG1, {C_XX}, ACT(ASGN, S_SELFB)},
		 {ASG1, {'='}, ACT(EQ, S_SELF)},

    /* ! */
		 {NOT1, {C_XX}, ACT(NOT, S_SELFB)},
		 {NOT1, {'='}, ACT(NEQ, S_SELF)},

    /* & */
		 {AND1, {C_XX}, ACT(AND, S_SELFB)},
		 {AND1, {'&'}, ACT(LAND, S_SELF)},
		 {AND1, {'='}, ACT(ASAND, S_SELF)},

    /* | */
		 {OR1, {C_XX}, ACT(OR, S_SELFB)},
		 {OR1, {'|'}, ACT(LOR, S_SELF)},
		 {OR1, {'='}, ACT(ASOR, S_SELF)},

    /* # */
		 {SHARP1, {C_XX}, ACT(SHARP, S_SELFB)},
		 {SHARP1, {'#'}, ACT(DSHARP, S_SELF)},

    /* % */
		 {PCT1, {C_XX}, ACT(PCT, S_SELFB)},
		 {PCT1, {'='}, ACT(ASPCT, S_SELF)},

    /* * */
		 {STAR1, {C_XX}, ACT(STAR, S_SELFB)},
		 {STAR1, {'='}, ACT(ASSTAR, S_SELF)},

    /* ^ */
		 {CIRC1, {C_XX}, ACT(CIRC, S_SELFB)},
		 {CIRC1, {'='}, ACT(ASCIRC, S_SELF)},

    /* end-of-table sentinel: negative state stops expandlex() */
		 {-1, "", 0}
};
239 
/*
 * Expanded transition table, filled in by expandlex() and read by
 * gettokens().  First index is char, second is state.  Non-negative
 * entries are the next state; negative entries are the one's complement
 * of an action value.
 */
/* increase #states to power of 2 to encourage use of shift */
short bigfsm[256][MAXSTATE];
243 
244 void
245     expandlex(void)
246 {
247      /* const */ struct fsm *fp;
248     int i, j, nstate;
249 
250     for (fp = fsm; fp->state >= 0; fp++)
251     {
252         for (i = 0; fp->ch[i]; i++)
253         {
254             nstate = fp->nextstate;
255             if (nstate >= S_SELF)
256                 nstate = ~nstate;
257             switch (fp->ch[i])
258             {
259 
260                 case C_XX:              /* random characters */
261                     for (j = 0; j < 256; j++)
262                         bigfsm[j][fp->state] = (short) nstate;
263                     continue;
264                 case C_ALPH:
265                     for (j = 0; j < 256; j++)
266 #ifdef S390
267 						if( isalpha( j ) || (j == '_') )
268 #else
269                         if (('a' <= j && j <= 'z') || ('A' <= j && j <= 'Z')
270                             || j == '_')
271 #endif
272                             bigfsm[j][fp->state] = (short) nstate;
273                     continue;
274                 case C_NUM:
275                     for (j = '0'; j <= '9'; j++)
276                         bigfsm[j][fp->state] = (short) nstate;
277                     continue;
278                 default:
279                     bigfsm[fp->ch[i]][fp->state] = (short) nstate;
280             }
281         }
282     }
283 
284     /*
285      * install special cases for ? (trigraphs),  \ (splicing), runes, and
286      * EOB
287      */
288     for (i = 0; i < MAXSTATE; i++)
289     {
290         for (j = 0; j < 0xFF; j++)
291             if (j == '?' || j == '\\' || j == '\n' || j == '\r')
292             {
293                 if (bigfsm[j][i] > 0)
294                     bigfsm[j][i] = ~bigfsm[j][i];
295                 bigfsm[j][i] &= ~QBSBIT;
296             }
297         bigfsm[EOB][i] = ~S_EOB;
298         if (bigfsm[EOFC][i] >= 0)
299             bigfsm[EOFC][i] = ~S_EOF;
300     }
301 }
302 
303 void
304     fixlex(void)
305 {
306     /* do C++ comments? */
307     if ((Cplusplus == 0) || (Cflag != 0))
308         bigfsm['/'][COM1] = bigfsm['x'][COM1];
309 }
310 
/*
 * Fill in a row of tokens from the current source (cursource), terminated
 * by an NL or END token.
 *
 * trp    token row to fill; the first token is put at trp->lp and
 *        trp->lp is left pointing just past the last token written.
 *        The row is grown with growtokenrow() when it is full.
 * reset  non-zero when the input buffer can be "rewound": consumed input
 *        is discarded and unread input is moved to the buffer start.
 *
 * Returns a flag (the OR of the quicklook() results for every NAME token)
 * indicating that possible macros have been seen in the row.
 *
 * The scanner is table driven: bigfsm[c][state] is either the next state
 * (>= 0) or the complement of an action, dispatched by the switch below.
 */
int
    gettokens(Tokenrow * trp, int reset)
{
    register int c, state, oldstate;
    register uchar *ip;
    register Token *tp, *maxp;
    int runelen;                        /* bytes per input character; always 1 in this function */
    Source *s = cursource;
    int nmac = 0;

    tp = trp->lp;
    ip = s->inp;
    if (reset)
    {
        s->lineinc = 0;
        if (ip >= s->inl)
        {                               /* nothing in buffer */
            s->inl = s->inb;
            fillbuf(s);
            ip = s->inp = s->inb;
        }
        else
            if (ip >= s->inb + (3 * INS / 4))
            {
                /* read position is in the last quarter: slide the unread
                 * bytes plus the 4 trailing sentinel bytes to the front */
                memmove(s->inb, ip, 4 + s->inl - ip);
                s->inl = s->inb + (s->inl - ip);
                ip = s->inp = s->inb;
            }
    }
    maxp = &trp->bp[trp->max];
    runelen = 1;
    for (;;)
    {
continue2:
        /* start of a new token: make room, then reset the token fields */
        if (tp >= maxp)
        {
            trp->lp = tp;
            tp = growtokenrow(trp);
            maxp = &trp->bp[trp->max];
        }
        tp->type = UNCLASS;
        tp->t = ip;
        tp->wslen = 0;
        tp->flag = 0;
        state = START;
        for (;;)
        {
            /* remembered so that the character can be re-scanned after
             * the buffer has been refilled or edited in place */
            oldstate = state;

            c = *ip;

            if ((state = bigfsm[c][state]) >= 0)
            {
                /* plain transition: consume the character */
                ip += runelen;
                runelen = 1;
                continue;
            }
            /* negative entry: complement it to get the action value */
            state = ~state;
    reswitch:
            switch (state & 0177)
            {
                case S_SELF:            /* current char is part of the token */
                    ip += runelen;
                    runelen = 1;
                    /* fall through */
                case S_SELFB:           /* current char is lookahead, not consumed */
                    tp->type = (unsigned char) GETACT(state);
                    tp->len = ip - tp->t;
                    tp++;
                    goto continue2;

                case S_NAME:            /* like S_SELFB but with nmac check */
                    tp->type = NAME;
                    tp->len = ip - tp->t;
                    nmac |= quicklook(tp->t[0], tp->len > 1 ? tp->t[1] : 0);
                    tp++;
                    goto continue2;

                case S_WS:
                    /* white space is recorded as a prefix length of the
                     * token that follows, not as a token of its own */
                    tp->wslen = ip - tp->t;
                    tp->t = ip;
                    state = START;
                    continue;

                default:
                    /* no marker bit: the value is a plain state number
                     * (reached via reswitch); consume and carry on */
                    if ((state & QBSBIT) == 0)
                    {
                        ip += runelen;
                        runelen = 1;
                        continue;
                    }
                    /* marked character ('?', '\\', '\n', '\r' -- see
                     * expandlex): strip the marker to get the ordinary
                     * state/action, then look at the character itself */
                    state &= ~QBSBIT;
                    s->inp = ip;

					if (c == '\n')
					{
					    /* make sure the following byte is buffered */
					    while (s->inp + 1 >= s->inl && fillbuf(s) != EOF);

						/* "\n\r" pair: delete the '\r' from the buffer */
						if (s->inp[1] == '\r')
						{
							memmove(s->inp + 1, s->inp + 2, s->inl - s->inp + 2);
							s->inl -= 1;
						}

                        goto reswitch;
					}

					if (c == '\r')
					{
    				    while (s->inp + 1 >= s->inl && fillbuf(s) != EOF);

						/* "\r\n": delete the '\r'; a lone '\r' is turned
						 * into '\n'.  Either way re-scan from oldstate. */
						if (s->inp[1] == '\n')
						{
							memmove(s->inp, s->inp + 1, s->inl - s->inp + 1);
							s->inl -= 1;
						}
						else
							*s->inp = '\n';

						state = oldstate;
                        continue;
					}

                    if (c == '?')
                    {                   /* check trigraph */
                        if (trigraph(s))
                        {
                            /* buffer now holds the replacement char */
                            state = oldstate;
                            continue;
                        }
                        goto reswitch;
                    }
                    if (c == '\\')
                    {                   /* line-folding */
                        if (foldline(s))
                        {
                            s->lineinc++;
                            state = oldstate;
                            continue;
                        }
                        goto reswitch;
                    }
                    error(WARNING, "Lexical botch in cpp");
                    ip += runelen;
                    runelen = 1;
                    continue;

                case S_EOB:
                    /* ran into the end-of-buffer sentinel: refill and
                     * re-scan the same position */
                    s->inp = ip;
                    fillbuf(cursource);
                    state = oldstate;
                    continue;

                case S_EOF:
                    tp->type = END;
                    tp->len = 0;
                    s->inp = ip;
                    if (tp != trp->bp && (tp - 1)->type != NL && cursource->fd != -1)
                        error(WARNING, "No newline at end of file");
                    trp->lp = tp + 1;
                    return nmac;

                case S_STNL:
                    error(ERROR, "Unterminated string or char const");
                    /* fall through: the newline still ends the row */
                case S_NL:
                    tp->t = ip;
                    tp->type = NL;
                    tp->len = 1;
                    tp->wslen = 0;
                    s->lineinc++;
                    s->inp = ip + 1;
                    trp->lp = tp + 1;
                    return nmac;

                case S_EOFSTR:
                    error(FATAL, "EOF in string or char constant");
                    break;

                case S_COMNL:
                    /* newline inside a block comment: count it and stay
                     * in the comment */
                    s->lineinc++;
                    state = COM2;
                    ip += runelen;
                    runelen = 1;
                    continue;

                case S_EOFCOM:
                    error(WARNING, "EOF inside comment");
                    --ip;
                    /* fall through */
                case S_COMMENT:
					if (!Cflag)
					{
						/* drop the comment: overwrite its last byte with
						 * a blank and count it as one white-space char
						 * in front of the next token */
						tp->t = ++ip;
						tp->t[-1] = ' ';
						tp->wslen = 1;
						state = START;
						continue;
					}
					else
					{
						/* keep the comment as a COMMENT token; it is
						 * completed by the code after the inner loop */
	                    runelen = 1;
                        s->lineinc = 0;;
                        tp->type = COMMENT;
						tp->flag |= XTWS;
					}
            }
            break;
        }
        /* reached via 'break' from the switch: consume the current
         * character and close the token */
        ip += runelen;
        runelen = 1;
        tp->len = ip - tp->t;
        tp++;
    }
}
530 
531 /* have seen ?; handle the trigraph it starts (if any) else 0 */
532 int
533     trigraph(Source * s)
534 {
535     uchar c;
536 
537     while (s->inp + 2 >= s->inl && fillbuf(s) != EOF);
538 	;
539     if (s->inp[1] != '?')
540         return 0;
541     c = 0;
542     switch (s->inp[2])
543     {
544         case '=':
545             c = '#';
546             break;
547         case '(':
548             c = '[';
549             break;
550         case '/':
551             c = '\\';
552             break;
553         case ')':
554             c = ']';
555             break;
556         case '\'':
557             c = '^';
558             break;
559         case '<':
560             c = '{';
561             break;
562         case '!':
563             c = '|';
564             break;
565         case '>':
566             c = '}';
567             break;
568         case '-':
569             c = '~';
570             break;
571     }
572     if (c)
573     {
574         *s->inp = c;
575         memmove(s->inp + 1, s->inp + 3, s->inl - s->inp + 2);
576         s->inl -= 2;
577     }
578     return c;
579 }
580 
581 int
582     foldline(Source * s)
583 {
584     int n = 1;
585 
586 	/* skip pending wihite spaces */
587 	while ((s->inp[n] == ' ') || (s->inp[n] == '\t'))
588 	{
589 		n++;
590 	    if ((s->inp + n >= s->inl) && (fillbuf(s) == EOF))
591 			break;
592 	}
593 
594 	/* refill buffer */
595     while (s->inp + (n + 1) >= s->inl && fillbuf(s) != EOF);
596 
597     /* skip DOS line ends */
598     if (((s->inp[n] == '\r') && (s->inp[n+1] == '\n')) ||
599 		((s->inp[n] == '\n') && (s->inp[n+1] == '\r')))
600         n++;
601 
602     if ((s->inp[n] == '\n') || (s->inp[n] == '\r'))
603     {
604         memmove(s->inp, s->inp + n + 1, s->inl - s->inp + n + 2);
605         s->inl -= n + 1;
606         return 1;
607     }
608     return 0;
609 }
610 
611 int
612     fillbuf(Source * s)
613 {
614     int n;
615 
616     if (s->fd < 0 || (n = read(s->fd, (char *) s->inl, INS / 8)) <= 0)
617         n = 0;
618     s->inl += n;
619     s->inl[0] = s->inl[1] = s->inl[2] = s->inl[3] = EOB;
620     if (n == 0)
621     {
622         s->inl[0] = s->inl[1] = s->inl[2] = s->inl[3] = EOFC;
623         return EOF;
624     }
625     return 0;
626 }
627 
628 /*
629  * Push down to new source of characters.
630  * If fd>0 and str==NULL, then from a file `name';
631  * if fd==-1 and str, then from the string.
632  */
633 Source *
634     setsource(char *name, int path, int fd, char *str, int wrap)
635 {
636     Source *s = new(Source);
637     int len;
638 
639     s->line = 1;
640     s->lineinc = 0;
641     s->fd = fd;
642     s->filename = name;
643     s->next = cursource;
644     s->ifdepth = 0;
645     s->pathdepth = path;
646 	s->wrap = wrap;
647 
648     cursource = s;
649 
650 	if (s->wrap)
651 		genwrap(0);
652 
653     /* slop at right for EOB */
654     if (str)
655     {
656         len = strlen(str);
657         s->inb = domalloc(len + 4);
658         s->inp = s->inb;
659         strncpy((char *) s->inp, str, len);
660     }
661     else
662     {
663         s->inb = domalloc(INS + 4);
664         s->inp = s->inb;
665         len = 0;
666     }
667     s->inl = s->inp + len;
668     s->inl[0] = s->inl[1] = EOB;
669 
670     return s;
671 }
672 
673 void
674     unsetsource(void)
675 {
676     Source *s = cursource;
677 
678 	if (s->wrap)
679 		genwrap(1);
680 
681     if (s->fd >= 0)
682     {
683         close(s->fd);
684         dofree(s->inb);
685     }
686     cursource = s->next;
687     dofree(s);
688 }
689