xref: /trunk/main/soltools/cpp/_lex.c (revision 7ce20373)
1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #if (defined(_WIN32) || defined(_MSDOS) || defined(__IBMC__))
26 #include <io.h>
27 #else
28 #include <unistd.h>
29 #endif
30 #include "cpp.h"
31 /*
32  * lexical FSM encoding
33  *   when in state state, and one of the characters
34  *   in ch arrives, enter nextstate.
35  *   States >= S_SELF are either final, or at least require special action.
36  *   In 'fsm' there is a line for each state X charset X nextstate.
37  *   List chars that overwrite previous entries later (e.g. C_ALPH
38  *   can be overridden by '_' by a later entry); C_XX is the
39  *   universal set, and should always be first.
40  *   States above S_SELF are represented in the big table as negative values.
41  *   S_SELF and S_SELFB encode the resulting token type in the upper bits.
42  *   These actions differ in that S_SELF doesn't have a lookahead char,
43  *   S_SELFB does.
44  *
45  *   The encoding is blown out into a big table for time-efficiency.
46  *   Entries have
47  *      nextstate: 6 bits; ?\ marker: 1 bit; tokentype: 9 bits.
48  */
49 
/*
 * Encoding helpers for FSM table entries.
 *
 *   MAXSTATE  number of state slots per character in bigfsm; also the value
 *             of S_SELF, the first action code.
 *   ACT       pack token type `tok' and action code `act' into one entry:
 *             the action occupies the low 7 bits, the token type the bits
 *             above them.
 *   QBSBIT    bit 6 (octal 0100) of an entry; expandlex() uses it to mark the
 *             entries for '?', '\\', '\n' and '\r', which gettokens() must
 *             inspect before acting on them.
 *   GETACT    extract the 9-bit token type packed by ACT.
 *
 * Arguments are fully parenthesised so that expression arguments such as
 * ACT(a | b, c) expand with the intended precedence.
 */
#define	MAXSTATE		32
#define	ACT(tok,act)	(((tok) << 7) + (act))
#define	QBSBIT			0100
#define	GETACT(st)		(((st) >> 7) & 0x1ff)
54 
55 /* character classes */
/*
 * These small codes may appear in the ch[] list of a struct fsm row in
 * place of a literal character; expandlex() expands C_XX, C_ALPH and C_NUM
 * into the characters they stand for.  Because ch[] is scanned up to a 0
 * entry and these values are intercepted first, literal characters with
 * codes 1..5 cannot be listed in a row.
 * C_WS and C_EOF are not referenced anywhere in this file.
 */
56 #define	C_WS	1
57 #define	C_ALPH	2	/* 'a'-'z', 'A'-'Z' and '_' (isalpha() or '_' under S390) */
58 #define	C_NUM	3	/* '0'-'9' */
59 #define	C_EOF	4
60 #define	C_XX	5	/* every character code 0..255 */
61 
62 enum state
63 {
64     START = 0, NUM1, NUM2, NUM3, ID1, ST1, ST2, ST3, COM1, COM2, COM3, COM4,
65     CC1, CC2, WS1, PLUS1, MINUS1, STAR1, SLASH1, PCT1, SHARP1,
66     CIRC1, GT1, GT2, LT1, LT2, OR1, AND1, ASG1, NOT1, DOTS1,
67     S_SELF = MAXSTATE, S_SELFB, S_EOF, S_NL, S_EOFSTR,
68     S_STNL, S_COMNL, S_EOFCOM, S_COMMENT, S_EOB, S_WS, S_NAME
69 };
70 
/*
 * NOTE(review): neither variable is read or written anywhere in this file;
 * presumably they are used from other translation units -- confirm before
 * changing their linkage or removing them.
 */
71 int tottok;
72 int tokkind[256];
73 struct fsm
74 {
75     int state;                          /* if in this state */
76     uchar ch[4];                        /* and see one of these characters */
77     int nextstate;                      /* enter this state if +ve */
78 };
79 
/*
 * Compact description of the lexer FSM; expandlex() expands it into bigfsm.
 * Each row is {state, {characters or C_* classes}, nextstate}.  Rows are
 * applied in order, so within one state a later row overrides an earlier
 * one for the characters it names; that is why every state starts with its
 * C_XX (all characters) default.  A nextstate >= S_SELF is an action code,
 * optionally combined with a token type through ACT().
 */
80  /*const*/ struct fsm fsm[] = {
81     /* start state */
82 		 {START, {C_XX}, ACT(UNCLASS, S_SELF)},
83 		 {START, {' ', '\t', '\v'}, WS1},
84 		 {START, {C_NUM}, NUM1},
85 		 {START, {'.'}, NUM3},
86 		 {START, {C_ALPH}, ID1},
87 		 {START, {'L'}, ST1},
88 		 {START, {'"'}, ST2},
89 		 {START, {'\''}, CC1},
90 		 {START, {'/'}, COM1},
91 		 {START, {EOFC}, S_EOF},
92 		 {START, {'\n'}, S_NL},
93 		 {START, {'-'}, MINUS1},
94 		 {START, {'+'}, PLUS1},
95 		 {START, {'<'}, LT1},
96 		 {START, {'>'}, GT1},
97 		 {START, {'='}, ASG1},
98 		 {START, {'!'}, NOT1},
99 		 {START, {'&'}, AND1},
100 		 {START, {'|'}, OR1},
101 		 {START, {'#'}, SHARP1},
102 		 {START, {'%'}, PCT1},
103 		 {START, {'['}, ACT(SBRA, S_SELF)},
104 		 {START, {']'}, ACT(SKET, S_SELF)},
105 		 {START, {'('}, ACT(LP, S_SELF)},
106 		 {START, {')'}, ACT(RP, S_SELF)},
107 		 {START, {'*'}, STAR1},
108 		 {START, {','}, ACT(COMMA, S_SELF)},
109 		 {START, {'?'}, ACT(QUEST, S_SELF)},
110 		 {START, {':'}, ACT(COLON, S_SELF)},
111 		 {START, {';'}, ACT(SEMIC, S_SELF)},
112 		 {START, {'{'}, ACT(CBRA, S_SELF)},
113 		 {START, {'}'}, ACT(CKET, S_SELF)},
114 		 {START, {'~'}, ACT(TILDE, S_SELF)},
115 		 {START, {'^'}, CIRC1},
116 
117     /* saw a digit */
    /* the '_' row overrides C_ALPH (which includes '_'): '_' ends the number */
118 		 {NUM1, {C_XX}, ACT(NUMBER, S_SELFB)},
119 		 {NUM1, {C_NUM, C_ALPH, '.'}, NUM1},
120 		 {NUM1, {'E', 'e'}, NUM2},
121 		 {NUM1, {'_'}, ACT(NUMBER, S_SELFB)},
122 
123     /* saw possible start of exponent, digits-e */
124 		 {NUM2, {C_XX}, ACT(NUMBER, S_SELFB)},
125 		 {NUM2, {'+', '-'}, NUM1},
126 		 {NUM2, {C_NUM, C_ALPH}, NUM1},
127 		 {NUM2, {'_'}, ACT(NUMBER, S_SELFB)},
128 
129     /* saw a '.', which could be a number or an operator */
130 		 {NUM3, {C_XX}, ACT(DOT, S_SELFB)},
131 		 {NUM3, {'.'}, DOTS1},
132 		 {NUM3, {C_NUM}, NUM1},
133 
    /* saw "..": a third '.' completes the ellipsis token */
134 		 {DOTS1, {C_XX}, ACT(UNCLASS, S_SELFB)},
135 		 {DOTS1, {C_NUM}, NUM1},
136 		 {DOTS1, {'.'}, ACT(ELLIPS, S_SELF)},
137 
138     /* saw a letter or _ */
139 		 {ID1, {C_XX}, ACT(NAME, S_NAME)},
140 		 {ID1, {C_ALPH, C_NUM}, ID1},
141 
142     /* saw L (start of wide string?) */
143 		 {ST1, {C_XX}, ACT(NAME, S_NAME)},
144 		 {ST1, {C_ALPH, C_NUM}, ID1},
145 		 {ST1, {'"'}, ST2},
146 		 {ST1, {'\''}, CC1},
147 
148     /* saw " beginning string */
149 		 {ST2, {C_XX}, ST2},
150 		 {ST2, {'"'}, ACT(STRING, S_SELF)},
151 		 {ST2, {'\\'}, ST3},
152 		 {ST2, {'\n'}, S_STNL},
153 		 {ST2, {EOFC}, S_EOFSTR},
154 
155     /* saw \ in string */
156 		 {ST3, {C_XX}, ST2},
157 		 {ST3, {'\n'}, S_STNL},
158 		 {ST3, {EOFC}, S_EOFSTR},
159 
160     /* saw ' beginning character const */
161 		 {CC1, {C_XX}, CC1},
162 		 {CC1, {'\''}, ACT(CCON, S_SELF)},
163 		 {CC1, {'\\'}, CC2},
164 		 {CC1, {'\n'}, S_STNL},
165 		 {CC1, {EOFC}, S_EOFSTR},
166 
167     /* saw \ in ccon */
168 		 {CC2, {C_XX}, CC1},
169 		 {CC2, {'\n'}, S_STNL},
170 		 {CC2, {EOFC}, S_EOFSTR},
171 
172     /* saw /, perhaps start of comment */
    /* fixlex() may overwrite the '/' entry of COM1 to disable // comments */
173 		 {COM1, {C_XX}, ACT(SLASH, S_SELFB)},
174 		 {COM1, {'='}, ACT(ASSLASH, S_SELF)},
175 		 {COM1, {'*'}, COM2},
176 		 {COM1, {'/'}, COM4},
177 
178     /* saw / followed by *, start of comment */
179 		 {COM2, {C_XX}, COM2},
180 		 {COM2, {'\n'}, S_COMNL},
181 		 {COM2, {'*'}, COM3},
182 		 {COM2, {EOFC}, S_EOFCOM},
183 
184     /* saw the * possibly ending a comment */
185 		 {COM3, {C_XX}, COM2},
186 		 {COM3, {'\n'}, S_COMNL},
187 		 {COM3, {'*'}, COM3},
188 		 {COM3, {'/'}, S_COMMENT},
189 
190     /* // comment */
191 		 {COM4, {C_XX}, COM4},
192 		 {COM4, {'\n'}, S_NL},
193 		 {COM4, {EOFC}, S_EOFCOM},
194 
195     /* saw white space, eat it up */
196 		 {WS1, {C_XX}, S_WS},
197 		 {WS1, {'\t', '\v', ' '}, WS1},
198 
199     /* saw -, check --, -=, -> */
200 		 {MINUS1, {C_XX}, ACT(MINUS, S_SELFB)},
201 		 {MINUS1, {'-'}, ACT(MMINUS, S_SELF)},
202 		 {MINUS1, {'='}, ACT(ASMINUS, S_SELF)},
203 		 {MINUS1, {'>'}, ACT(ARROW, S_SELF)},
204 
205     /* saw +, check ++, += */
206 		 {PLUS1, {C_XX}, ACT(PLUS, S_SELFB)},
207 		 {PLUS1, {'+'}, ACT(PPLUS, S_SELF)},
208 		 {PLUS1, {'='}, ACT(ASPLUS, S_SELF)},
209 
210     /* saw <, check <<, <<=, <= */
211 		 {LT1, {C_XX}, ACT(LT, S_SELFB)},
212 		 {LT1, {'<'}, LT2},
213 		 {LT1, {'='}, ACT(LEQ, S_SELF)},
214 		 {LT2, {C_XX}, ACT(LSH, S_SELFB)},
215 		 {LT2, {'='}, ACT(ASLSH, S_SELF)},
216 
217     /* saw >, check >>, >>=, >= */
218 		 {GT1, {C_XX}, ACT(GT, S_SELFB)},
219 		 {GT1, {'>'}, GT2},
220 		 {GT1, {'='}, ACT(GEQ, S_SELF)},
221 		 {GT2, {C_XX}, ACT(RSH, S_SELFB)},
222 		 {GT2, {'='}, ACT(ASRSH, S_SELF)},
223 
224     /* = */
225 		 {ASG1, {C_XX}, ACT(ASGN, S_SELFB)},
226 		 {ASG1, {'='}, ACT(EQ, S_SELF)},
227 
228     /* ! */
229 		 {NOT1, {C_XX}, ACT(NOT, S_SELFB)},
230 		 {NOT1, {'='}, ACT(NEQ, S_SELF)},
231 
232     /* & */
233 		 {AND1, {C_XX}, ACT(AND, S_SELFB)},
234 		 {AND1, {'&'}, ACT(LAND, S_SELF)},
235 		 {AND1, {'='}, ACT(ASAND, S_SELF)},
236 
237     /* | */
238 		 {OR1, {C_XX}, ACT(OR, S_SELFB)},
239 		 {OR1, {'|'}, ACT(LOR, S_SELF)},
240 		 {OR1, {'='}, ACT(ASOR, S_SELF)},
241 
242     /* # */
243 		 {SHARP1, {C_XX}, ACT(SHARP, S_SELFB)},
244 		 {SHARP1, {'#'}, ACT(DSHARP, S_SELF)},
245 
246     /* % */
247 		 {PCT1, {C_XX}, ACT(PCT, S_SELFB)},
248 		 {PCT1, {'='}, ACT(ASPCT, S_SELF)},
249 
250     /* * */
251 		 {STAR1, {C_XX}, ACT(STAR, S_SELFB)},
252 		 {STAR1, {'='}, ACT(ASSTAR, S_SELF)},
253 
254     /* ^ */
255 		 {CIRC1, {C_XX}, ACT(CIRC, S_SELFB)},
256 		 {CIRC1, {'='}, ACT(ASCIRC, S_SELF)},
257 
    /* terminator: expandlex() stops at the first row whose state is < 0 */
258 		 {-1, "", 0}
259 };
260 
261 /* first index is char, second is state */
262 /* increase #states to power of 2 to encourage use of shift */
/*
 * Expanded transition table, filled in by expandlex() from fsm[].
 * Entries >= 0 are the next ordinary state; negative entries are the
 * one's complement of an action code (see the switch in gettokens()).
 */
263 short bigfsm[256][MAXSTATE];
264 
265 void
266     expandlex(void)
267 {
268      /* const */ struct fsm *fp;
269     int i, j, nstate;
270 
271     for (fp = fsm; fp->state >= 0; fp++)
272     {
273         for (i = 0; fp->ch[i]; i++)
274         {
275             nstate = fp->nextstate;
276             if (nstate >= S_SELF)
277                 nstate = ~nstate;
278             switch (fp->ch[i])
279             {
280 
281                 case C_XX:              /* random characters */
282                     for (j = 0; j < 256; j++)
283                         bigfsm[j][fp->state] = (short) nstate;
284                     continue;
285                 case C_ALPH:
286                     for (j = 0; j < 256; j++)
287 #ifdef S390
288 						if( isalpha( j ) || (j == '_') )
289 #else
290                         if (('a' <= j && j <= 'z') || ('A' <= j && j <= 'Z')
291                             || j == '_')
292 #endif
293                             bigfsm[j][fp->state] = (short) nstate;
294                     continue;
295                 case C_NUM:
296                     for (j = '0'; j <= '9'; j++)
297                         bigfsm[j][fp->state] = (short) nstate;
298                     continue;
299                 default:
300                     bigfsm[fp->ch[i]][fp->state] = (short) nstate;
301             }
302         }
303     }
304 
305     /*
306      * install special cases for ? (trigraphs),  \ (splicing), runes, and
307      * EOB
308      */
309     for (i = 0; i < MAXSTATE; i++)
310     {
311         for (j = 0; j < 0xFF; j++)
312             if (j == '?' || j == '\\' || j == '\n' || j == '\r')
313             {
314                 if (bigfsm[j][i] > 0)
315                     bigfsm[j][i] = ~bigfsm[j][i];
316                 bigfsm[j][i] &= ~QBSBIT;
317             }
318         bigfsm[EOB][i] = ~S_EOB;
319         if (bigfsm[EOFC][i] >= 0)
320             bigfsm[EOFC][i] = ~S_EOF;
321     }
322 }
323 
324 void
325     fixlex(void)
326 {
327     /* do C++ comments? */
328     if ((Cplusplus == 0) || (Cflag != 0))
329         bigfsm['/'][COM1] = bigfsm['x'][COM1];
330 }
331 
332 /*
333  * fill in a row of tokens from input, terminated by NL or END
334  * First token is put at trp->lp.
335  * Reset is non-zero when the input buffer can be "rewound."
336  * The return value is a flag indicating that possible macros have
337  * been seen in the row.
338  */
/*
 * Implementation notes for gettokens():
 *   - Reads from cursource; s->inp is the read position and s->inl the end
 *     of valid data, which is followed by sentinel bytes (EOB or EOFC).
 *   - Each pass of the outer loop produces one token at *tp; the inner loop
 *     steps the FSM one character at a time through bigfsm[c][state].
 *   - A non-negative table entry is a plain state change.  A negative entry
 *     is a complemented action code whose low 7 bits select the case below.
 *   - The function returns from the S_EOF and S_NL cases, after storing the
 *     end of the row in trp->lp and the read position in s->inp.
 *   - runelen is only ever assigned 1 in this function.
 */
339 int
340     gettokens(Tokenrow * trp, int reset)
341 {
342     register int c, state, oldstate;
343     register uchar *ip;
344     register Token *tp, *maxp;
345     int runelen;
346     Source *s = cursource;
347     int nmac = 0;
348 
349     tp = trp->lp;
350     ip = s->inp;
351     if (reset)
352     {
353         s->lineinc = 0;
354         if (ip >= s->inl)
355         {                               /* nothing in buffer */
356             s->inl = s->inb;
357             fillbuf(s);
358             ip = s->inp = s->inb;
359         }
360         else
361             if (ip >= s->inb + (3 * INS / 4))
362             {
                /* read position is in the last quarter of the buffer: move
                 * the unread bytes and the 4 bytes after them to the front */
363                 memmove(s->inb, ip, 4 + s->inl - ip);
364                 s->inl = s->inb + (s->inl - ip);
365                 ip = s->inp = s->inb;
366             }
367     }
368     maxp = &trp->bp[trp->max];
369     runelen = 1;
370     for (;;)
371     {
372 continue2:
        /* start of a new token; grow the row first if it is full */
373         if (tp >= maxp)
374         {
375             trp->lp = tp;
376             tp = growtokenrow(trp);
377             maxp = &trp->bp[trp->max];
378         }
379         tp->type = UNCLASS;
380         tp->t = ip;
381         tp->wslen = 0;
382         tp->flag = 0;
383         state = START;
384         for (;;)
385         {
            /* remembered so that a character can be re-examined after the
             * buffer has been refilled or rewritten */
386             oldstate = state;
387 
388             c = *ip;
389 
390             if ((state = bigfsm[c][state]) >= 0)
391             {
                /* plain transition: consume the character */
392                 ip += runelen;
393                 runelen = 1;
394                 continue;
395             }
            /* negative entry: recover the action code */
396             state = ~state;
397     reswitch:
398             switch (state & 0177)
399             {
400                 case S_SELF:
                    /* current character belongs to the token */
401                     ip += runelen;
402                     runelen = 1;
                    /* fallthrough */
403                 case S_SELFB:
                    /* token ends before the current character */
404                     tp->type = (unsigned char) GETACT(state);
405                     tp->len = ip - tp->t;
406                     tp++;
407                     goto continue2;
408 
409                 case S_NAME:            /* like S_SELFB but with nmac check */
410                     tp->type = NAME;
411                     tp->len = ip - tp->t;
412                     nmac |= quicklook(tp->t[0], tp->len > 1 ? tp->t[1] : 0);
413                     tp++;
414                     goto continue2;
415 
416                 case S_WS:
                    /* record the leading white space and restart the token
                     * at the current character, which is not consumed */
417                     tp->wslen = ip - tp->t;
418                     tp->t = ip;
419                     state = START;
420                     continue;
421 
422                 default:
                    /* no dedicated case: without the QBSBIT marker simply
                     * consume the character */
423                     if ((state & QBSBIT) == 0)
424                     {
425                         ip += runelen;
426                         runelen = 1;
427                         continue;
428                     }
                    /* marked entry ('?', '\\', '\n' or '\r', see expandlex):
                     * drop the marker and look at the character itself */
429                     state &= ~QBSBIT;
430                     s->inp = ip;
431 
432 					if (c == '\n')
433 					{
                        /* make the following character available, remove a
                         * '\r' that directly follows the '\n', then perform
                         * the unmarked action */
434 					    while (s->inp + 1 >= s->inl && fillbuf(s) != EOF);
435 
436 						if (s->inp[1] == '\r')
437 						{
438 							memmove(s->inp + 1, s->inp + 2, s->inl - s->inp + 2);
439 							s->inl -= 1;
440 						}
441 
442                         goto reswitch;
443 					}
444 
445 					if (c == '\r')
446 					{
                        /* "\r\n": delete the '\r'; lone '\r': turn it into
                         * '\n'; then re-examine the position from oldstate */
447     				    while (s->inp + 1 >= s->inl && fillbuf(s) != EOF);
448 
449 						if (s->inp[1] == '\n')
450 						{
451 							memmove(s->inp, s->inp + 1, s->inl - s->inp + 1);
452 							s->inl -= 1;
453 						}
454 						else
455 							*s->inp = '\n';
456 
457 						state = oldstate;
458                         continue;
459 					}
460 
461                     if (c == '?')
462                     {                   /* check trigraph */
                        /* a replaced trigraph is re-read from oldstate;
                         * otherwise the '?' gets its unmarked action */
463                         if (trigraph(s))
464                         {
465                             state = oldstate;
466                             continue;
467                         }
468                         goto reswitch;
469                     }
470                     if (c == '\\')
471                     {                   /* line-folding */
                        /* a folded line end is counted in lineinc and the
                         * position re-read; otherwise the '\\' gets its
                         * unmarked action */
472                         if (foldline(s))
473                         {
474                             s->lineinc++;
475                             state = oldstate;
476                             continue;
477                         }
478                         goto reswitch;
479                     }
480                     error(WARNING, "Lexical botch in cpp");
481                     ip += runelen;
482                     runelen = 1;
483                     continue;
484 
485                 case S_EOB:
                    /* ran into the end-of-buffer sentinel: refill and look
                     * at the same position again */
486                     s->inp = ip;
487                     fillbuf(cursource);
488                     state = oldstate;
489                     continue;
490 
491                 case S_EOF:
                    /* end of input: close the row with an END token */
492                     tp->type = END;
493                     tp->len = 0;
494                     s->inp = ip;
495                     if (tp != trp->bp && (tp - 1)->type != NL && cursource->fd != -1)
496                         error(WARNING, "No newline at end of file");
497                     trp->lp = tp + 1;
498                     return nmac;
499 
500                 case S_STNL:
501                     error(ERROR, "Unterminated string or char const");
                    /* fallthrough */
502                 case S_NL:
                    /* the newline becomes an NL token that ends the row */
503                     tp->t = ip;
504                     tp->type = NL;
505                     tp->len = 1;
506                     tp->wslen = 0;
507                     s->lineinc++;
508                     s->inp = ip + 1;
509                     trp->lp = tp + 1;
510                     return nmac;
511 
512                 case S_EOFSTR:
                    /* NOTE(review): error(FATAL, ...) presumably does not
                     * return; if it did, the token would be closed below */
513                     error(FATAL, "EOF in string or char constant");
514                     break;
515 
516                 case S_COMNL:
                    /* newline inside a comment: count it, stay in comment */
517                     s->lineinc++;
518                     state = COM2;
519                     ip += runelen;
520                     runelen = 1;
521                     continue;
522 
523                 case S_EOFCOM:
524                     error(WARNING, "EOF inside comment");
                    /* step back so that the ++ip / ip += runelen below leave
                     * ip where it is now */
525                     --ip;
                    /* fallthrough */
526                 case S_COMMENT:
527 					if (!Cflag)
528 					{
                        /* the comment is dropped: overwrite its last
                         * character with a blank, count it as white space
                         * and start the token after it */
529 						tp->t = ++ip;
530 						tp->t[-1] = ' ';
531 						tp->wslen = 1;
532 						state = START;
533 						continue;
534 					}
535 					else
536 					{
                        /* the comment is kept as a COMMENT token, which is
                         * completed after the inner loop */
537 	                    runelen = 1;
538                         s->lineinc = 0;;
539                         tp->type = COMMENT;
540 						tp->flag |= XTWS;
541 					}
542             }
543             break;
544         }
        /* reached through `break' (S_EOFSTR, kept comment): consume the
         * current character and close the token */
545         ip += runelen;
546         runelen = 1;
547         tp->len = ip - tp->t;
548         tp++;
549     }
550 }
551 
552 /* have seen ?; handle the trigraph it starts (if any) else 0 */
553 int
554     trigraph(Source * s)
555 {
556     uchar c;
557 
558     while (s->inp + 2 >= s->inl && fillbuf(s) != EOF);
559 	;
560     if (s->inp[1] != '?')
561         return 0;
562     c = 0;
563     switch (s->inp[2])
564     {
565         case '=':
566             c = '#';
567             break;
568         case '(':
569             c = '[';
570             break;
571         case '/':
572             c = '\\';
573             break;
574         case ')':
575             c = ']';
576             break;
577         case '\'':
578             c = '^';
579             break;
580         case '<':
581             c = '{';
582             break;
583         case '!':
584             c = '|';
585             break;
586         case '>':
587             c = '}';
588             break;
589         case '-':
590             c = '~';
591             break;
592     }
593     if (c)
594     {
595         *s->inp = c;
596         memmove(s->inp + 1, s->inp + 3, s->inl - s->inp + 2);
597         s->inl -= 2;
598     }
599     return c;
600 }
601 
602 int
603     foldline(Source * s)
604 {
605     int n = 1;
606 
607 	/* skip pending wihite spaces */
608 	while ((s->inp[n] == ' ') || (s->inp[n] == '\t'))
609 	{
610 		n++;
611 	    if ((s->inp + n >= s->inl) && (fillbuf(s) == EOF))
612 			break;
613 	}
614 
615 	/* refill buffer */
616     while (s->inp + (n + 1) >= s->inl && fillbuf(s) != EOF);
617 
618     /* skip DOS line ends */
619     if (((s->inp[n] == '\r') && (s->inp[n+1] == '\n')) ||
620 		((s->inp[n] == '\n') && (s->inp[n+1] == '\r')))
621         n++;
622 
623     if ((s->inp[n] == '\n') || (s->inp[n] == '\r'))
624     {
625         memmove(s->inp, s->inp + n + 1, s->inl - s->inp + n + 2);
626         s->inl -= n + 1;
627         return 1;
628     }
629     return 0;
630 }
631 
632 int
633     fillbuf(Source * s)
634 {
635     int n;
636 
637     if (s->fd < 0 || (n = read(s->fd, (char *) s->inl, INS / 8)) <= 0)
638         n = 0;
639     s->inl += n;
640     s->inl[0] = s->inl[1] = s->inl[2] = s->inl[3] = EOB;
641     if (n == 0)
642     {
643         s->inl[0] = s->inl[1] = s->inl[2] = s->inl[3] = EOFC;
644         return EOF;
645     }
646     return 0;
647 }
648 
649 /*
650  * Push down to new source of characters.
651  * If fd>0 and str==NULL, then from a file `name';
652  * if fd==-1 and str, then from the string.
653  */
654 Source *
655     setsource(char *name, int path, int fd, char *str, int wrap)
656 {
657     Source *s = new(Source);
658     int len;
659 
660     s->line = 1;
661     s->lineinc = 0;
662     s->fd = fd;
663     s->filename = name;
664     s->next = cursource;
665     s->ifdepth = 0;
666     s->pathdepth = path;
667 	s->wrap = wrap;
668 
669     cursource = s;
670 
671 	if (s->wrap)
672 		genwrap(0);
673 
674     /* slop at right for EOB */
675     if (str)
676     {
677         len = strlen(str);
678         s->inb = domalloc(len + 4);
679         s->inp = s->inb;
680         strncpy((char *) s->inp, str, len);
681     }
682     else
683     {
684         s->inb = domalloc(INS + 4);
685         s->inp = s->inb;
686         len = 0;
687     }
688     s->inl = s->inp + len;
689     s->inl[0] = s->inl[1] = EOB;
690 
691     return s;
692 }
693 
694 void
695     unsetsource(void)
696 {
697     Source *s = cursource;
698 
699 	if (s->wrap)
700 		genwrap(1);
701 
702     if (s->fd >= 0)
703     {
704         close(s->fd);
705         dofree(s->inb);
706     }
707     cursource = s->next;
708     dofree(s);
709 }
710