grammardgen source code

1 module grammardgen;
2 import dparsergen.core.dynamictree;
3 import dparsergen.core.location;
4 import dparsergen.core.nodetype;
5 import dparsergen.core.utils;
6 import std.algorithm;
7 import std.array;
8 import std.conv;
9 import std.process;
10 import std.stdio;
11 import std.string;
12 
13 static import grammarddoc_lexer;
14 
15 alias Tree = DynamicParseTree!(LocationAll, LocationRangeStartEnd);
16 
/**
 * Appends the text of every token found in `tree` to `output`.
 *
 * The tree is walked depth-first in child order. A `null` tree contributes
 * nothing.
 *
 * Params:
 *     tree   = subtree to flatten; may be `null`
 *     output = string that receives the concatenated token contents
 */
void printTree(Tree tree, ref string output)
{
    if (tree !is null)
    {
        if (tree.isToken)
            output ~= tree.content;

        foreach (child; tree.childs)
            printTree(child, output);
    }
}
27 
/**
 * One symbol on the right-hand side of a grammar production.
 *
 * The field order matters: other code builds values positionally,
 * e.g. `Symbol(text, true)`.
 */
struct Symbol
{
    /// Text of the symbol; empty while the symbol is still being built.
    string name;
    /// True if the symbol is emitted as a quoted token.
    bool isToken = false;
    /// True if the symbol is emitted with a trailing `?`.
    bool hasOpt = false;
}
34 
/**
 * Returns the punctuation character that a single-character DDoc macro
 * stands for, or `null` if `macroName` is not such a macro.
 */
private string punctuationForMacro(string macroName)
{
    switch (macroName)
    {
    case "LPAREN":
        return "(";
    case "RPAREN":
        return ")";
    case "CODE_LCURL":
        return "{";
    case "CODE_RCURL":
        return "}";
    case "CODE_PERCENT":
        return "%";
    case "BACKTICK":
        return "`";
    case "CODE_AMP", "AMP":
        return "&";
    default:
        return null;
    }
}

/**
 * Returns the nonterminal name that a shorthand DDoc macro expands to, or
 * `null` if `macroName` is not one of the known shorthands.
 */
private string nonterminalForMacro(string macroName)
{
    switch (macroName)
    {
    case "IDENTIFIER":
        return "Identifier";
    case "EXPRESSION":
        return "Expression";
    case "ASSIGNEXPRESSION":
        return "AssignExpression";
    case "PSCURLYSCOPE":
        return "NonEmptyOrScopeBlockStatement";
    case "PSSCOPE":
        return "ScopeStatement";
    case "PSSEMI_PSCURLYSCOPE_LIST":
        return "ScopeStatementList";
    case "PS0":
        return "NoScopeNonEmptyStatement";
    case "PSSEMI":
        return "NoScopeStatement";
    case "PSSEMI_PSCURLYSCOPE":
        return "Statement";
    default:
        return null;
    }
}

/**
 * Returns the index just past a `Comma` node in `content`: the first comma
 * when `useLast` is false, the last one when it is true. Returns 0 when
 * there is no comma, so the whole argument list is used.
 */
private size_t indexAfterComma(Tree[] content, bool useLast)
{
    size_t start = 0;
    foreach (i, c; content)
    {
        if (c.name == "Comma")
        {
            start = i + 1;
            if (!useLast)
                break;
        }
    }
    return start;
}

/// Feeds every node of `parts` through `printSymbol`, in order.
private void appendAll(Tree[] parts, ref Symbol[] output)
{
    foreach (c; parts)
        printSymbol(c, output);
}

/**
 * Handles a node that is neither whitespace nor a macro: its token text is
 * split on whitespace and every word becomes (part of) a symbol. A word
 * wrapped in backticks is a token and always starts a fresh symbol unless
 * the current one is still empty.
 */
private void appendPlainText(Tree tree, ref Symbol[] output)
{
    string names;
    printTree(tree, names);
    foreach (i, name; names.split())
    {
        if (name.length >= 3 && name[0] == '`' && name[$ - 1] == '`')
        {
            if (output[$ - 1].name.length)
                output.length++;
            output[$ - 1].isToken = true;
            name = name[1 .. $ - 1];
        }
        else if (i)
        {
            // Every word after the first starts a new symbol that inherits
            // the token flag of its predecessor.
            output.length++;
            output[$ - 1].isToken = output[$ - 2].isToken;
        }
        output[$ - 1].name ~= name;
    }
}

/**
 * Translates one node of a grammar line into symbols.
 *
 * The last element of `output` is the symbol currently being built; text is
 * appended to it and new elements are added when a symbol boundary is seen.
 *
 * Params:
 *     tree   = node to translate (`WS`, `Macro`, or any other node, which
 *              is treated as plain text)
 *     output = symbols of the current alternative; an empty array is given
 *              an initial blank symbol
 */
void printSymbol(Tree tree, ref Symbol[] output)
{
    // All branches below index output[$ - 1], so make sure it exists.
    if (output.length == 0)
        output.length = 1;

    if (tree.name == "WS")
    {
        // Whitespace ends the current symbol, unless nothing was written yet.
        if (output[$ - 1].name.length)
        {
            output.length++;
            output[$ - 1].isToken = output[$ - 2].isToken;
        }
        return;
    }

    if (tree.name != "Macro")
    {
        appendPlainText(tree, output);
        return;
    }

    // Macro arguments, with leading whitespace removed.
    Tree[] content = tree.childs[2].childs;
    while (content.length && content[0].name == "WS")
        content = content[1 .. $];

    string macroName = tree.childs[1].content;

    switch (macroName)
    {
    case "D", "B":
        // Literal text: the arguments are tokens.
        output[$ - 1].isToken = true;
        appendAll(content, output);
        break;

    case "D].":
        // Two separate tokens, "]" followed by ".".
        output[$ - 1].isToken = true;
        output[$ - 1].name ~= "]";
        output.length++;
        output[$ - 1].isToken = true;
        output[$ - 1].name ~= ".";
        break;

    case "I":
        appendAll(content, output);
        break;

    case "RELATIVE_LINK2", "GLINK2", "LINK2", "GLINK", "GSELF":
        // Skip everything up to and including the first comma, if any.
        appendAll(content[indexAfterComma(content, false) .. $], output);
        break;

    case "DDSUBLINK":
        // Only what follows the last comma is used.
        appendAll(content[indexAfterComma(content, true) .. $], output);
        break;

    case "GLINK_LEX":
        // Only the first argument node is used; tolerate an empty list.
        appendAll(content[0 .. (content.length ? 1 : 0)], output);
        break;

    default:
        {
            string punctuation = punctuationForMacro(macroName);
            string nonterminal = nonterminalForMacro(macroName);
            if (punctuation.length)
            {
                output[$ - 1].name ~= punctuation;
                output[$ - 1].isToken = true;
            }
            else if (nonterminal.length)
            {
                output[$ - 1].name ~= nonterminal;
            }
            else
            {
                // Unknown macro: keep it visible in the output as $(NAME ...).
                output[$ - 1].name ~= "$(" ~ macroName;
                appendAll(content, output);
                output[$ - 1].name ~= ")";
            }
        }
        break;
    }
}
259 
/**
 * State collected while analysing grammar sections.
 */
class Context
{
    /// Generated grammar text for each nonterminal, keyed by its name.
    string[string] nonterminals;

    /// Nonterminal names in the order they were first added.
    string[] nonterminalsOrder;

    /// Names of the nonterminals that were emitted as tokens.
    bool[string] tokens;

    /// Set for the context that receives the lexer grammar.
    bool isLexer = false;
}
267 
/**
 * Skips the remainder of a production's header line: any ":" text nodes and
 * `LEGACY_LNAME2` macros, up to and including the terminating `NL`.
 * Returns the nodes that follow the header. Any other node is an error.
 */
private Tree[] skipProductionHeader(Tree[] trees)
{
    while (true)
    {
        assert(trees.length);
        Tree head = trees[0];
        trees = trees[1 .. $];

        if (head.name == "NL")
            return trees;

        const bool isColon = head.name == "Text" && head.childs[0].content == ":";
        const bool isLegacyName = head.name == "Macro"
            && head.childs[1].content == "LEGACY_LNAME2";
        if (!isColon && !isLegacyName)
            assert(false, text(head));
    }
}

/**
 * Turns the body of a production into alternatives, one per line.
 * `alternatives` must contain at least one alternative with at least one
 * symbol; the last symbol of the last alternative is the one being built.
 */
private void collectAlternatives(Tree[] trees, ref Symbol[][] alternatives)
{
    foreach (c; trees)
    {
        string macroName = c.name == "Macro" ? c.childs[1].content : null;

        if (c.name == "NL")
        {
            // A newline starts a new alternative unless the current one is
            // still completely empty.
            if (alternatives[$ - 1].length > 1 || alternatives[$ - 1][0].name.length)
                alternatives ~= [Symbol()];
        }
        else if (c.name == "WS")
        {
            if (alternatives[$ - 1][$ - 1].name.length)
                alternatives[$ - 1].length++;
        }
        else if (macroName == "OPT")
        {
            alternatives[$ - 1][$ - 1].hasOpt = true;
        }
        else if (macroName == "LEGACY_LNAME2")
        {
            // Produces no symbol.
        }
        else if (macroName == "MULTICOLS")
        {
            // Drop everything up to and including the first comma, then
            // treat the rest like ordinary production lines.
            Tree[] rest = c.childs[2].childs;
            while (rest.length)
            {
                const bool wasComma = rest[0].name == "Comma";
                rest = rest[1 .. $];
                if (wasComma)
                    break;
            }
            collectAlternatives(rest, alternatives);
        }
        else
        {
            printSymbol(c, alternatives[$ - 1]);
        }
    }
}

/**
 * Returns the individual tokens that a compound token text is split into,
 * or `null` if `token` is emitted unchanged.
 */
private string[] compoundTokenParts(string token)
{
    switch (token)
    {
    case "ST(0)", "ST(1)", "ST(2)", "ST(3)", "ST(4)", "ST(5)", "ST(6)", "ST(7)":
        // token[3 .. 4] is the register digit.
        return ["ST", "(", token[3 .. 4], ")"];
    case "!is":
        return ["!", "is"];
    case "!in":
        return ["!", "in"];
    case ");":
        return [")", ";"];
    case "scope(success)":
        return ["scope", "(", "success", ")"];
    case "scope(exit)":
        return ["scope", "(", "exit", ")"];
    case "scope(failure)":
        return ["scope", "(", "failure", ")"];
    case "C++":
        return ["C", "++"];
    case "C++,":
        return ["C", "++", ","];
    case "Objective - C":
        return ["Objective", "-", "C"];
    case "( )":
        return ["(", ")"];
    default:
        return null;
    }
}

/// Returns a copy of `alternative` in which compound tokens are split up.
private Symbol[] splitCompoundTokens(Symbol[] alternative)
{
    Symbol[] result;
    foreach (s; alternative)
    {
        string[] parts = s.isToken ? compoundTokenParts(s.name) : null;
        if (parts.length)
        {
            assert(!s.hasOpt);
            foreach (part; parts)
                result ~= Symbol(part, true);
        }
        else
            result ~= s;
    }
    return result;
}

/// Applies the hand-written rewrites that affect a whole alternative.
private void applyAlternativeSpecialCases(string name, ref Symbol[] output)
{
    if (name == "NestingBlockComment")
    {
        // Insert a regex symbol in front of the last symbol.
        output.length++;
        output[$ - 1] = output[$ - 2];
        output[$ - 2].isToken = false;
        output[$ - 2].name = "/(+*)/";
    }
    if (name == "NestingBlockCommentCharacter" && output[0].name == "NestingBlockComment")
    {
        // Insert a regex symbol in front of the first symbol.
        output.length++;
        output[1] = output[0];
        output[0].isToken = false;
        output[0].name = "/(\\/*)/";
    }
    if (name == "NestingBlockCommentCharacters" && output.length == 1)
    {
        output[0].isToken = false;
        output[0].name = "eps";
    }
    if ((name == "DoubleQuotedCharacters" || name == "WysiwygCharacters"
            || name == "HexStringChars") && output.length == 1)
    {
        output[0].isToken = false;
        output[0].name = "eps";
    }
}

/**
 * Applies the hand-written rewrites that affect a single symbol `s` of
 * production `name`; `alternativeLength` is the number of symbols in the
 * alternative that contains `s`.
 */
private void applySymbolSpecialCases(string name, size_t alternativeLength, ref Symbol s)
{
    if (s.name == ".." || s.name == "," || s.name == "=")
        s.isToken = true;
    if (name == "TraitsKeyword")
        s.isToken = true;

    if (name == "BlockComment" && s.name == "Characters")
    {
        s.isToken = false;
        s.name = "/(([^*]|[*][*]*[^\\/*])*[*]*)/";
    }
    if (name == "LineComment" && s.name == "Characters")
    {
        s.isToken = false;
        s.name = "/([^\\n\\r\\u000D\\u000A\\u2028\\u2029\\0\\x1a]*)/";
    }
    if (name == "NestingBlockCommentCharacter" && s.name == "Character")
    {
        s.isToken = false;
        s.name = "/[^+\\/]|++*[^+\\/]|\\/\\/*[^+\\/]/";
    }
    if (name == "DoubleQuotedCharacter" && s.name == "Character")
    {
        s.isToken = false;
        s.name = "/[^\\\"\\\\]/";
    }
    if (name == "SingleQuotedCharacter" && s.name == "Character")
    {
        s.isToken = false;
        s.name = "/[^\\\'\\\\]/";
    }
    if (name == "FloatLiteral" && s.name == "Integer")
    {
        s.isToken = false;
        s.name = "DecimalInteger"; // TODO: only this?
    }
    if (name == "DeclDefs" && alternativeLength == 2)
    {
        // Swap the roles of the two symbols in the recursive alternative.
        if (s.name == "DeclDefs")
            s.name = "DeclDef";
        else
            s.name = "DeclDefs";
    }
}

/**
 * Renders one symbol, including its leading space and optional `?`.
 * `isOnlySymbol` is true when the symbol is the sole member of its
 * alternative.
 */
private string renderSymbol(Symbol s, Context context, bool isOnlySymbol)
{
    string rendered;
    if (s.isToken)
    {
        const bool isDigit = s.name.length == 1 && s.name[0] >= '0' && s.name[0] <= '9';
        if (!context.isLexer && isDigit)
        {
            rendered = " IntegerLiteral>>\"" ~ s.name ~ "\"";
        }
        else
        {
            string tname = s.name;
            // A six-character \uXXXX escape is passed through unescaped.
            if (!(tname.length == 6 && tname.startsWith("\\u")))
                tname = tname.escapeD;
            rendered = " \"" ~ tname ~ "\"";
        }
    }
    else if (!context.isLexer && isOnlySymbol && s.name[0] != '/'
            && s.name != "eps" && s.name !in context.tokens)
    {
        rendered = " <" ~ s.name;
    }
    else
    {
        rendered = " " ~ s.name;
    }

    if (s.hasOpt)
        rendered ~= "?";
    return rendered;
}

/**
 * Converts one production of the documentation grammar into grammar text
 * and stores it in `context.nonterminals` under the production's name.
 *
 * Params:
 *     trees   = nodes of the production; `trees[0]` must be the `GNAME`
 *               macro that names it
 *     context = receives the generated text; a production seen again
 *               replaces the earlier text but keeps its original position
 *               in `nonterminalsOrder`
 *     isToken = whether the production is emitted as a token; overridden
 *               for some names (see below)
 */
void analyzeNonterminal(Tree[] trees, Context context, bool isToken)
{
    assert(trees[0].name == "Macro");
    assert(trees[0].childs[1].content == "GNAME");

    // The production name is the last Text node inside the GNAME macro.
    string name;
    foreach (c; trees[0].childs[2].childs)
        if (c.name == "Text")
            name = c.childs[0].content;

    trees = skipProductionHeader(trees[1 .. $]);

    Symbol[][] symbols = [[Symbol()]];
    collectAlternatives(trees, symbols);

    foreach (ref output; symbols)
    {
        // Drop a trailing unfinished symbol, then a trailing "(...)" symbol.
        if (output[$ - 1].name.length == 0)
            output.length--;
        if (output.length && output[$ - 1].name.startsWith("(")
                && output[$ - 1].name.endsWith(")"))
            output.length--;
    }
    if (symbols[$ - 1].length == 0)
        symbols.length--;

    if (name == "Register" || name == "Register64")
    {
        // Every symbol becomes an alternative of its own.
        Symbol[][] flattened;
        foreach (output; symbols)
            foreach (s; output)
                flattened ~= [s];
        symbols = flattened;
    }

    foreach (ref output; symbols)
        output = splitCompoundTokens(output);

    if (name.endsWith("String"))
        isToken = true;
    if (name == "Token" || name == "Keyword" || name == "StringLiteral" || name == "TokenString")
        isToken = false;

    string code;
    if (isToken)
        code ~= "token ";
    code ~= name;
    if (name.endsWith("Comment") || name == "SpecialTokenSequence"
            || name == "EndOfLine" || name == "WhiteSpace")
        code ~= " @IgnoreToken";
    if (name == "Module")
        code ~= " @Start";
    if (name == "Identifier")
        code ~= " @LowPrio";
    if (isToken)
        context.tokens[name] = true;
    code ~= "\n";

    // Tracks whether "=" has been written yet. Using the loop index instead
    // would emit "|" first whenever the first alternative is skipped below.
    bool firstEmitted = true;
    foreach (ref output; symbols)
    {
        if (name == "ParameterAttributes" && output.length == 1
                && output[0].name == "ParameterAttributes")
            continue;

        applyAlternativeSpecialCases(name, output);

        code ~= firstEmitted ? "    =" : "    |";
        firstEmitted = false;

        foreach (ref s; output)
        {
            applySymbolSpecialCases(name, output.length, s);
            code ~= renderSymbol(s, context, output.length == 1);
        }
        code ~= "\n";
    }
    code ~= "    ;\n";

    // A redefinition is not checked against the earlier text; it simply
    // replaces it.
    if (name !in context.nonterminals)
        context.nonterminalsOrder ~= name;
    context.nonterminals[name] = code;
}
545 
/**
 * Splits the contents of a grammar macro into productions and analyses each.
 *
 * A production starts at a `GNAME` macro and runs up to the next one (or the
 * end of the macro contents). Only the first production receives
 * `context.isLexer` as its token flag; all later ones receive `false`.
 *
 * Params:
 *     tree    = the grammar macro node; `null` is ignored
 *     context = receives the analysed productions
 */
void analyzeGrammar(Tree tree, Context context)
{
    if (tree is null)
        return;

    enum size_t notStarted = size_t.max;
    Tree[] entries = tree.childs[2].childs;
    size_t pendingStart = notStarted;
    bool tokenFlag = context.isLexer;

    for (size_t idx = 0; idx < entries.length; idx++)
    {
        Tree entry = entries[idx];
        if (entry.name != "Macro" || entry.childs[1].content != "GNAME")
            continue;

        // A new production begins here, so the previous one is complete.
        if (pendingStart != notStarted)
        {
            analyzeNonterminal(entries[pendingStart .. idx], context, tokenFlag);
            tokenFlag = false;
        }
        pendingStart = idx;
    }

    // The last production extends to the end of the macro contents.
    if (pendingStart != notStarted)
        analyzeNonterminal(entries[pendingStart .. $], context, tokenFlag);
}
567 
/**
 * Searches `tree` for `GRAMMAR` and `GRAMMAR_LEX` macros and analyses each
 * one found. The inside of such a macro is not searched again.
 *
 * Params:
 *     tree    = subtree to search; `null` is ignored
 *     context = passed on to `analyzeGrammar`
 */
void findGrammar(Tree tree, Context context)
{
    if (tree is null)
        return;

    const bool isGrammarMacro = tree.nodeType == NodeType.nonterminal
        && tree.name == "Macro"
        && tree.childs[1].content.among("GRAMMAR", "GRAMMAR_LEX") != 0;

    if (isGrammarMacro)
    {
        analyzeGrammar(tree, context);
        return;
    }

    foreach (child; tree.childs)
        findGrammar(child, context);
}
583 
/**
 * Entry point: reads the `.dd` specification files below `<repo>/spec/` and
 * writes the parser grammar and the lexer grammar.
 *
 * Params:
 *     args = program name, path of the dlang.org checkout, output path of
 *            the parser grammar, output path of the lexer grammar
 *
 * Returns: 0 on success, 1 if the number of arguments is wrong.
 */
int main(string[] args)
{
    import P = grammarddoc;
    import std.file;
    import std.path;
    import std.stdio;

    alias L = grammarddoc_lexer.Lexer!LocationAll;
    alias Creator = DynamicParseTreeCreator!(P, LocationAll, LocationRangeStartEnd);

    if (args.length != 4)
    {
        stderr.writeln("Usage: grammardgen dlang.org grammard.ebnf grammardlex.ebnf");
        return 1;
    }
    string dlangRepo = args[1];
    Creator creator = new Creator;

    // The commit id is only used for an informational header line, so a
    // missing git executable must not abort the run.
    bool haveCommit = false;
    string commit;
    try
    {
        auto git = execute(["git", "-C", dlangRepo, "rev-parse", "HEAD"]);
        if (git.status == 0)
        {
            haveCommit = true;
            commit = git.output.strip();
        }
    }
    catch (ProcessException)
    {
        // git could not be started; the header is omitted.
    }

    // Writes the "based on commit" header if the commit id is known.
    void writeCommitHeader(ref File target)
    {
        if (haveCommit)
            target.writeln("// Based on grammar from dlang.org commit ", commit, "\n");
    }

    // Parses spec/<specName>.dd and adds its grammar sections to `target`.
    void loadSpec(string specName, Context target)
    {
        string filename = dlangRepo ~ "/spec/" ~ specName ~ ".dd";
        string inText = readText(filename);

        auto tree = P.parse!(Creator, L)(inText, creator);
        assert(tree.inputLength.bytePos <= inText.length);

        findGrammar(tree, target);
    }

    // Lexer grammar.
    Context contextLex = new Context();
    contextLex.isLexer = true;
    loadSpec("lex", contextLex);
    {
        File lexFile = File(args[3], "w");
        writeCommitHeader(lexFile);
        foreach (name; contextLex.nonterminalsOrder)
            lexFile.write(contextLex.nonterminals[name]);
        lexFile.writeln("Letter = [a-zA-Z];");
        lexFile.writeln("Tokens @Array = eps | Tokens Token;");
    }

    // Parser grammar; it needs to know which names are tokens of the lexer.
    Context context = new Context();
    context.tokens = contextLex.tokens;

    foreach (f; [
            "module", "expression", "declaration", "iasm", "attribute",
            "statement", "template", "class", "traits", "function", "struct",
            "unittest", "version", "template-mixin", "enum", "pragma", "interface",
            "type"
        ])
    {
        loadSpec(f, context);
    }

    // Placeholder productions for these names if the spec files did not
    // produce them.
    foreach (name; [
            "ParameterMemberAttributes", "FunctionAttributes", "TypeVector",
            "Opcode"
        ])
    {
        if (name in context.nonterminals)
            continue;
        context.nonterminals[name] = name ~ " = \"TODO\";\n";
        context.nonterminalsOrder ~= name;
    }

    File of = File(args[2], "w");
    writeCommitHeader(of);
    of.writeln("import \"grammardlex.ebnf\";");
    foreach (name; context.nonterminalsOrder)
        of.write(context.nonterminals[name]);
    return 0;
}