grammardgen source code

1 module grammardgen;
2 import dparsergen.core.dynamictree;
3 import dparsergen.core.location;
4 import dparsergen.core.nodetype;
5 import dparsergen.core.utils;
6 import std.algorithm;
7 import std.array;
8 import std.conv;
9 import std.process;
10 import std.stdio;
11 import std.string;
12 
13 static import grammarddoc_lexer;
14 
15 alias Tree = DynamicParseTree!(LocationAll, LocationRangeStartEnd);
16 
/**
Appends the content of every token in `tree` to `output`.

The node itself is handled first, then its children are visited in order,
so the collected text follows the order of the tree.

Params:
    tree = root of the subtree to flatten; a `null` tree is ignored
    output = string that receives the concatenated token contents
*/
void printTree(Tree tree, ref string output)
{
    if (tree !is null)
    {
        if (tree.isToken)
            output ~= tree.content;

        foreach (child; tree.childs)
            printTree(child, output);
    }
}
27 
/**
One symbol of a grammar alternative.

The field order matters: symbols are also built positionally as
`Symbol(name, isToken)`.
*/
struct Symbol
{
    /// Text of the symbol; it is built up piece by piece while a rule is read.
    string name = null;

    /// `true` if the symbol is written to the grammar as a quoted literal.
    bool isToken = false;

    /// `true` if the symbol is optional; it is then written with a trailing `?`.
    bool hasOpt = false;
}
34 
/**
Returns the index of the first node after a "Comma" node in `nodes`.

Params:
    nodes = argument nodes of a macro
    useLast = if `true`, the last comma is used instead of the first one

Returns: index just behind the chosen comma, or 0 if there is no comma.
*/
private size_t indexAfterComma(Tree[] nodes, bool useLast)
{
    size_t start = 0;
    foreach (i, node; nodes)
    {
        if (node.name == "Comma")
        {
            start = i + 1;
            if (!useLast)
                break;
        }
    }
    return start;
}

/**
Translates one node of a rule body into symbols and stores them in `output`.

The last element of `output` is the symbol that is currently being built:
text is appended to its name, and whitespace starts a new element.

Params:
    tree = node to translate ("WS", "Macro" or any other node, which is
           treated as plain text)
    output = symbols of the current alternative; must not be empty
*/
void printSymbol(Tree tree, ref Symbol[] output)
{
    if (tree.name == "WS")
    {
        // Whitespace ends the current symbol, unless it is still empty.
        // The new symbol inherits the token flag of the previous one.
        if (output[$ - 1].name.length)
        {
            output.length++;
            output[$ - 1].isToken = output[$ - 2].isToken;
        }
        return;
    }

    if (tree.name != "Macro")
    {
        // Plain text: flatten it and handle every whitespace separated word.
        string names;
        printTree(tree, names);
        foreach (i, name; names.split())
        {
            if (name.length >= 3 && name[0] == '`' && name[$ - 1] == '`')
            {
                // A word in backticks is a literal token. It gets its own
                // symbol if the current one already contains text.
                if (output[$ - 1].name.length)
                {
                    output.length++;
                }
                output[$ - 1].isToken = true;
                name = name[1 .. $ - 1];
            }
            else if (i)
            {
                // Every further word starts a new symbol.
                output.length++;
                output[$ - 1].isToken = output[$ - 2].isToken;
            }
            output[$ - 1].name ~= name;
        }
        return;
    }

    string macroName = tree.childs[1].content;

    // Arguments of the macro without leading whitespace.
    Tree[] content = tree.childs[2].childs;
    while (content.length && content[0].name == "WS")
        content = content[1 .. $];

    Tree[] nested; // argument nodes that are translated recursively
    string tokenText; // literal text appended to the current symbol as token
    string ruleName; // nonterminal name appended to the current symbol

    switch (macroName)
    {
    case "D", "B":
        // The arguments are literal token text.
        output[$ - 1].isToken = true;
        nested = content;
        break;
    case "D].":
        // Two separate tokens: "]" ends the current symbol, "." is a new one.
        output[$ - 1].isToken = true;
        output[$ - 1].name ~= "]";
        output.length++;
        output[$ - 1].isToken = true;
        output[$ - 1].name ~= ".";
        break;
    case "I":
        nested = content;
        break;
    case "RELATIVE_LINK2", "GLINK2", "LINK2", "GLINK", "GSELF":
        // Only the part behind the first comma is used.
        nested = content[indexAfterComma(content, false) .. $];
        break;
    case "DDSUBLINK":
        // Only the part behind the last comma is used.
        nested = content[indexAfterComma(content, true) .. $];
        break;
    case "GLINK_LEX":
        // Only the first argument node is used. An empty argument list
        // produces nothing instead of failing with a range error.
        nested = content.length ? content[0 .. 1] : content;
        break;
    case "LPAREN":
        tokenText = "(";
        break;
    case "RPAREN":
        tokenText = ")";
        break;
    case "CODE_LCURL":
        tokenText = "{";
        break;
    case "CODE_RCURL":
        tokenText = "}";
        break;
    case "CODE_PERCENT":
        tokenText = "%";
        break;
    case "BACKTICK":
        tokenText = "`";
        break;
    case "CODE_AMP", "AMP":
        tokenText = "&";
        break;
    case "IDENTIFIER":
        ruleName = "Identifier";
        break;
    case "EXPRESSION":
        ruleName = "Expression";
        break;
    case "ASSIGNEXPRESSION":
        ruleName = "AssignExpression";
        break;
    case "PSCURLYSCOPE":
        ruleName = "NonEmptyOrScopeBlockStatement";
        break;
    case "PSSCOPE":
        ruleName = "ScopeStatement";
        break;
    case "PSSEMI_PSCURLYSCOPE_LIST":
        ruleName = "ScopeStatementList";
        break;
    case "PS0":
        ruleName = "NoScopeNonEmptyStatement";
        break;
    case "PSSEMI":
        ruleName = "NoScopeStatement";
        break;
    case "PSSEMI_PSCURLYSCOPE":
        ruleName = "Statement";
        break;
    default:
        // Unknown macro: keep it visible in the output as "$(NAME ...)".
        output[$ - 1].name ~= "$(" ~ macroName;
        foreach (c; content)
            printSymbol(c, output);
        output[$ - 1].name ~= ")";
        break;
    }

    // At most one of the following three is set by the switch above.
    if (tokenText.length)
    {
        output[$ - 1].name ~= tokenText;
        output[$ - 1].isToken = true;
    }
    if (ruleName.length)
        output[$ - 1].name ~= ruleName;
    foreach (c; nested)
        printSymbol(c, output);
}
259 
/**
State collected while the grammar is extracted from the documentation.
*/
class Context
{
    /// Generated grammar text for every rule, indexed by rule name.
    string[string] nonterminals;

    /// Rule names in the order in which they were first seen.
    string[] nonterminalsOrder;

    /// Names of the rules that were generated as tokens.
    bool[string] tokens;

    /// `true` while the lexer grammar is processed.
    bool isLexer = false;
}
267 
/**
Checks if the rule called `name` is generated as an array rule.

A rule is an array rule if its name ends with "List", "Array", "Attributes"
or "Empty", or if it is one of the explicitly listed names.

Params:
    name = name of the rule

Returns: `true` for array rules, `false` otherwise.
*/
bool isArrayNonterminal(string name)
{
    // endsWith with several needles is nonzero if any of them matches.
    if (name.endsWith("List", "Array", "Attributes", "Empty"))
        return true;

    switch (name)
    {
    case "AliasAssignments", "AnonymousEnumMembers", "ArrayMemberInitializations",
            "AttributesNoPragma", "AutoAssignments", "Declarators", "DeclDefs",
            "EnumMembers", "FunctionContracts",
            "FunctionContractsEndingInOutContractExpression",
            "FunctionContractsEndingInOutStatement", "InOutContractExpressions",
            "Interfaces", "KeyValuePairs", "Slice", "Slice2",
            "StatementListNoCaseNoDefault", "StorageClasses",
            "StorageClassesAttributesNoPragma", "StructMemberInitializers",
            "TraitsArguments", "TypeCtors":
        return true;
    default:
        return false;
    }
}
277 
/**
Generates the grammar text for one rule and stores it in `context`.

The rule starts with a `GNAME` macro containing the rule name. The rest of
that line is skipped, and every following line becomes one alternative of
the rule. The generated text is stored in `context.nonterminals`; if a rule
with the same name already exists, it is replaced.

Params:
    trees = nodes of the rule, starting with the `GNAME` macro
    context = receives the generated rule; `context.tokens` is used to
              recognize names of tokens
    isLexer = `true` if the rule belongs to the lexer grammar
    isToken = `true` if the rule should be generated as a token; this can be
              overridden for some rule names
*/
void analyzeNonterminal(Tree[] trees, Context context, bool isLexer, bool isToken)
{
    // The name is the text inside of the GNAME macro.
    string name;
    assert(trees[0].name == "Macro");
    assert(trees[0].childs[1].content == "GNAME");
    foreach (c; trees[0].childs[2].childs)
        if (c.name == "Text")
            name = c.childs[0].content;
    trees = trees[1 .. $];

    // Skip the rest of the header line. Only ":" and LEGACY_LNAME2 macros
    // are expected before the end of the line.
    while (true)
    {
        assert(trees.length);
        if (trees[0].name == "Text" && trees[0].childs[0].content == ":")
        {
            trees = trees[1 .. $];
        }
        else if (trees[0].name == "Macro" && trees[0].childs[1].content == "LEGACY_LNAME2")
        {
            trees = trees[1 .. $];
        }
        else if (trees[0].name == "NL")
        {
            trees = trees[1 .. $];
            break;
        }
        else
        {
            assert(false, text(trees[0]));
        }
    }

    // One entry per alternative. The last entry and its last symbol are the
    // ones currently being built.
    Symbol[][] symbols = [[Symbol()]];

    void findSymbols(Tree[] nodes)
    {
        foreach (c; nodes)
        {
            if (c.name == "NL")
            {
                // A new line starts a new alternative, but empty lines
                // are ignored.
                if (symbols[$ - 1].length > 1 || symbols[$ - 1][0].name.length)
                {
                    symbols ~= [Symbol()];
                }
            }
            else if (c.name == "WS")
            {
                if (symbols[$ - 1][$ - 1].name.length)
                    symbols[$ - 1].length++;
            }
            else if (c.name == "Macro" && c.childs[1].content == "OPT")
            {
                symbols[$ - 1][$ - 1].hasOpt = true;
            }
            else if (c.name == "Macro" && c.childs[1].content == "LEGACY_LNAME2")
            {
                // Produces no symbols.
            }
            else if (c.name == "Macro" && c.childs[1].content == "MULTICOLS")
            {
                // Everything up to the first comma is dropped; the rest
                // is handled like normal rule content.
                Tree[] trees2 = c.childs[2].childs;
                while (trees2.length)
                {
                    if (trees2[0].name == "Comma")
                    {
                        trees2 = trees2[1 .. $];
                        break;
                    }
                    trees2 = trees2[1 .. $];
                }
                findSymbols(trees2);
            }
            else
            {
                printSymbol(c, symbols[$ - 1]);
            }
        }
    }

    findSymbols(trees);

    // Remove an unfinished symbol at the end of every alternative and a
    // trailing symbol, which is completely in parentheses.
    foreach (ref output; symbols)
    {
        if (output[$ - 1].name.length == 0)
            output.length--;
        if (output.length && output[$ - 1].name.startsWith("(") && output[$ - 1].name.endsWith(")"))
            output.length--;
    }
    if (symbols[$ - 1].length == 0)
        symbols.length--;

    // For these two rules every symbol is an alternative of its own.
    if (name == "Register" || name == "Register64")
    {
        Symbol[][] symbolsBak = symbols;
        symbols = [];
        foreach (output; symbolsBak)
        {
            foreach (s; output)
                symbols ~= [s];
        }
    }

    // Tokens, which have to be written as multiple tokens.
    string[][string] tokensToSplit = [
        "ST(0)": ["ST", "(", "0", ")"],
        "ST(1)": ["ST", "(", "1", ")"],
        "ST(2)": ["ST", "(", "2", ")"],
        "ST(3)": ["ST", "(", "3", ")"],
        "ST(4)": ["ST", "(", "4", ")"],
        "ST(5)": ["ST", "(", "5", ")"],
        "ST(6)": ["ST", "(", "6", ")"],
        "ST(7)": ["ST", "(", "7", ")"],
        "!is": ["!", "is"],
        "!in": ["!", "in"],
        ");": [")", ";"],
        "scope(success)": ["scope", "(", "success", ")"],
        "scope(exit)": ["scope", "(", "exit", ")"],
        "scope(failure)": ["scope", "(", "failure", ")"],
        "C++": ["C", "++"],
        "C++,": ["C", "++", ","],
        "Objective - C": ["Objective", "-", "C"],
        "( )": ["(", ")"],
    ];
    foreach (ref output; symbols)
    {
        Symbol[] output2;
        foreach (s; output)
        {
            if (s.isToken && s.name in tokensToSplit)
            {
                assert(!s.hasOpt);
                foreach (x; tokensToSplit[s.name])
                {
                    output2 ~= Symbol(x, true);
                }
            }
            else
                output2 ~= s;
        }
        output = output2;
    }

    // Some rule names decide if the rule is a token, independent of the
    // parameter isToken.
    bool isNonterminal;
    if (name.endsWith("String"))
        isToken = true;
    if (name.among("Token", "Keyword", "StringLiteral", "TokenString", "SourceFile"))
    {
        isToken = false;
        isNonterminal = true;
    }
    bool isArray = isArrayNonterminal(name);

    // First line of the rule: kind, name and annotations.
    string code;
    if (isToken)
        code ~= "token ";
    else if (isLexer && !isNonterminal)
        code ~= "fragment ";
    code ~= name;
    if (name.endsWith("Comment") || name == "SpecialTokenSequence"
            || name == "EndOfLine" || name == "WhiteSpace")
        code ~= " @ignoreToken";
    if (name == "SourceFile")
        code ~= " @start";
    if (name == "Identifier")
        code ~= " @lowPrio";
    if (isArray)
        code ~= " @array @regArray";
    if (isToken)
        context.tokens[name] = true;
    code ~= "\n";

    // One line for every alternative.
    foreach (i, ref output; symbols)
    {
        if (name == "ParameterAttributes" && output.length == 1
                && output[0].name == "ParameterAttributes")
            continue;

        if (i)
            code ~= "    |";
        else
            code ~= "    =";
        foreach (ref s; output)
        {
            if (s.name == ".." || s.name == "," || s.name == "=")
                s.isToken = true;
            if (name == "TraitsKeyword")
                s.isToken = true;

            if (s.isToken)
            {
                if (!isLexer && s.name.length == 1 && s.name[0] >= '0' && s.name[0] <= '9')
                {
                    // A single digit in the parser grammar is matched as
                    // IntegerLiteral with this exact text.
                    code ~= " IntegerLiteral>>\"" ~ s.name ~ "\"";
                }
                else
                {
                    // Text like \uXXXX is written unchanged, everything
                    // else is escaped.
                    string tname = s.name;
                    if (!(tname.length == 6 && tname.startsWith("\\u")))
                        tname = tname.escapeD;
                    code ~= " \"" ~ tname ~ "\"";
                }
            }
            else
            {
                // In the parser grammar an alternative, which consists of
                // only one other rule, is prefixed with "<". This is not
                // done for tokens and array rules.
                if (!isLexer && output.length == 1 && s.name[0] != '/'
                        && s.name != "@empty" && s.name !in context.tokens && !isArray && !isArrayNonterminal(s.name))
                    code ~= " <" ~ s.name;
                else
                    code ~= " " ~ s.name;
            }
            if (s.hasOpt)
                code ~= "?";
        }
        code ~= "\n";
    }
    code ~= "    ;\n";

    // A rule with the same name is replaced, but keeps its position.
    if (name !in context.nonterminals)
        context.nonterminalsOrder ~= name;
    context.nonterminals[name] = code;
}
500 
/**
Splits the content of a grammar macro into rules and analyzes every rule.

A rule starts at a `GNAME` macro and ends before the next one. Nodes before
the first `GNAME` macro are ignored. For the lexer grammar only the first
rule of the macro is passed on as a token.

Params:
    tree = the grammar macro; a `null` tree is ignored
    context = receives the generated rules
*/
void analyzeGrammar(Tree tree, Context context)
{
    if (tree is null)
        return;

    enum size_t notFound = size_t.max;
    Tree[] entries = tree.childs[2].childs;
    size_t ruleStart = notFound;
    bool tokenRule = context.isLexer;

    foreach (idx, entry; entries)
    {
        if (entry.name != "Macro" || entry.childs[1].content != "GNAME")
            continue;

        // The next rule starts here, so the previous rule is complete.
        if (ruleStart != notFound)
        {
            analyzeNonterminal(entries[ruleStart .. idx], context, context.isLexer, tokenRule);
            tokenRule = false;
        }
        ruleStart = idx;
    }

    // The last rule ends with the macro.
    if (ruleStart != notFound)
        analyzeNonterminal(entries[ruleStart .. $], context, context.isLexer, tokenRule);
}
522 
/**
Searches `tree` for `GRAMMAR` and `GRAMMAR_LEX` macros and analyzes them.

The search does not continue inside of a found grammar macro.

Params:
    tree = tree to search; a `null` tree is ignored
    context = receives the generated rules
*/
void findGrammar(Tree tree, Context context)
{
    if (tree is null)
        return;

    const bool isGrammarMacro = tree.nodeType == NodeType.nonterminal
        && tree.name == "Macro"
        && tree.childs[1].content.among("GRAMMAR", "GRAMMAR_LEX") != 0;

    if (isGrammarMacro)
    {
        analyzeGrammar(tree, context);
        return;
    }

    foreach (child; tree.childs)
        findGrammar(child, context);
}
538 
/**
Generates the grammar files for D from the specification in dlang.org.

Params:
    args = program name followed by the path of the dlang.org repository,
           the output file for the parser grammar and the output file for
           the lexer grammar

Returns: 0 on success, 1 for wrong arguments or if the lexer specification
contains no rule `SourceFile`.
*/
int main(string[] args)
{
    import P = grammarddoc;
    import std.file;
    import std.path;
    import std.stdio;

    alias L = grammarddoc_lexer.Lexer!LocationAll;
    alias Creator = DynamicParseTreeCreator!(P, LocationAll, LocationRangeStartEnd);
    Creator creator = new Creator;

    if (args.length != 4)
    {
        stderr.writeln("Usage: grammardgen dlang.org grammard.ebnf grammardlex.ebnf");
        return 1;
    }
    string dlangRepo = args[1];

    // The commit is only used for a comment in the generated files, so the
    // generation also works if git is not available.
    bool hasCommit;
    string commit;
    try
    {
        auto git = execute(["git", "-C", dlangRepo, "rev-parse", "HEAD"]);
        if (git.status == 0)
        {
            hasCommit = true;
            commit = git.output.strip();
        }
    }
    catch (ProcessException)
    {
        // git could not be started
    }

    // Parses the given specification files and collects all rules.
    void collectGrammar(string[] specNames, Context target)
    {
        foreach (f; specNames)
        {
            string filename = dlangRepo ~ "/spec/" ~ f ~ ".dd";

            string inText = readText(filename);

            auto tree = P.parse!(Creator, L)(inText, creator);
            assert(tree.inputLength.bytePos <= inText.length);

            findGrammar(tree, target);
        }
    }

    Context contextLex = new Context();
    contextLex.isLexer = true;
    collectGrammar(["lex"], contextLex);

    // The parser grammar has to know the tokens of the lexer grammar.
    Context context = new Context();
    context.tokens = contextLex.tokens;
    collectGrammar([
        "module", "expression", "declaration", "iasm", "attribute",
        "statement", "template", "class", "traits", "function", "struct",
        "unittest", "version", "template-mixin", "enum", "pragma", "interface",
        "type"
    ], context);

    // Rules, which are used, but not defined in the specification, get a
    // placeholder.
    foreach (name; [
            "ParameterMemberAttributes", "FunctionAttributes", "TypeVector",
            "Opcode"
        ])
    {
        if (name in context.nonterminals)
            continue;
        context.nonterminals[name] = name ~ " = \"TODO\";\n";
        context.nonterminalsOrder ~= name;
    }

    // SourceFile is defined in the lexer specification, but written to the
    // parser grammar. Check it before any output file is changed.
    auto sourceFileRule = "SourceFile" in contextLex.nonterminals;
    if (sourceFileRule is null)
    {
        stderr.writeln("Error: no rule SourceFile found in ", dlangRepo, "/spec/lex.dd");
        return 1;
    }

    {
        File of = File(args[3], "w");
        if (hasCommit)
            of.writeln("// Based on grammar from dlang.org commit ", commit, "\n");
        foreach (name; contextLex.nonterminalsOrder)
        {
            if (name == "SourceFile")
                continue;
            of.write(contextLex.nonterminals[name]);
        }
        of.writeln("Letter = [a-zA-Z];");
        of.writeln("Tokens @array = @empty | Tokens Token;");
    }

    {
        File of = File(args[2], "w");
        if (hasCommit)
            of.writeln("// Based on grammar from dlang.org commit ", commit, "\n");
        of.writeln("import \"grammardlex.ebnf\";");
        of.write(*sourceFileRule);
        foreach (name; context.nonterminalsOrder)
        {
            of.write(context.nonterminals[name]);
        }
    }

    return 0;
}