//          Copyright Tim Schendekehl 2023.
// Distributed under the Boost Software License, Version 1.0.
//    (See accompanying file LICENSE_1_0.txt or copy at
//          https://www.boost.org/LICENSE_1_0.txt)

module dparsergen.generator.generator;
import dparsergen.core.dynamictree;
import dparsergen.core.location;
import dparsergen.core.nodetype;
import dparsergen.core.parseexception;
import dparsergen.core.utils;
import dparsergen.generator.ebnf;
import dparsergen.generator.globaloptions;
import dparsergen.generator.glrparsercodegen;
import dparsergen.generator.grammar;
import dparsergen.generator.grammarebnf;
import dparsergen.generator.grammarebnf_lexer;
import dparsergen.generator.parser;
import dparsergen.generator.parsercodegen;
import dparsergen.generator.production;
import std.algorithm;
import std.conv;
import std.exception;
import std.file;
import std.getopt;
import std.path;
import std.stdio;
import std.string;

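/**
Collect the annotations of a declaration from its parse tree into `d`.

Each child of `tree` is expected to be an `Annotation` node of the form
`@name` or `@name(args)`; its textual form is appended to `d.annotations`.
A null `tree` (no annotations present) is silently accepted.
*/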
void toAnnotations(Tree)(Tree tree, ref Declaration d)
{
    if (tree !is null)
    {
        foreach (i, p; tree.childs)
        {
            assert(p.name == "Annotation");
            assert(p.childs.length == 3);
            if (p.childs[0].content == "@")
            {
                string name = p.childs[1].content;
                if (p.childs[2]!is null)
                {
                    name ~= "(" ~ concatTree(p.childs[2].childs[1]).strip() ~ ")";
                }
                d.annotations ~= name;
            }
            else
                assert(false);
        }
    }
}

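/**
Convert a `SymbolDeclaration` parse tree into a `Declaration`.

Extracts the declaration type (`token` or `fragment`), the symbol name,
the optional macro parameter list (at most one parameter may be variadic),
the annotations and, if present, the expression tree of the right-hand side.
*/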
Declaration toDeclaration(Tree)(Tree tree)
{
    if (tree.name == "SymbolDeclaration")
    {
        Declaration r;
        auto childs = tree.childs;
        assert(childs.length.among(5, 7));
        if (childs[0]!is null)
        {
            assert(childs[0].name == "DeclarationType");
            if (childs[0].childs[0].content == "token")
                r.type = DeclarationType.token;
            if (childs[0].childs[0].content == "fragment")
                r.type = DeclarationType.fragment;
        }
        r.name = childs[1].content;
        if (childs[2]!is null)
        {
            foreach (i, p; childs[2].childs[1].childs)
            {
                if (i % 2 == 0)
                {
                    assert(p.name == "MacroParameter");
                    if (p.childs.length >= 2)
                    {
                        if (r.variadicParameterIndex != size_t.max)
                            throw new Exception("too many variadic parameters");
                        r.variadicParameterIndex = r.parameters.length;
                    }
                    r.parameters ~= p.childs[0].content;
                }
                else
                    assert(p.content == ",");
            }
        }
        childs[3].toAnnotations(r);
        if (childs.length == 7)
            r.exprTree = childs[$ - 2];
        return r;
    }
    else
        assert(0, tree.toString);
}

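/**
Flatten a possibly nested list tree into an array.

Nodes named `listName` and generic array nodes are recursed into, skipping
token children such as separators; every other node is converted with
`next`. A null `tree` yields an empty array.
*/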
typeof(next(Tree.init))[] toArray(alias next, Tree)(Tree tree, string listName)
{
    if (tree is null)
        return [];
    if (tree.name == listName || tree.nodeType == NodeType.array)
    {
        typeof(next(tree))[] r;
        foreach (c; tree.childs)
        {
            if (!c.isToken)
                r ~= toArray!next(c, listName);
        }
        return r;
    }
    if (tree.name != listName)
        return [next(tree)];
    assert(0);
}

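/**
Build an `EBNF` description from the parse tree of a grammar file.

Doc comments at the top of the file become global documentation, except
for a block directly adjacent to the first declaration, which is attached
to that declaration like any other doc comment. Symbol, match, import and
option declarations are converted into the respective fields of the
returned `EBNF`.
*/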
EBNF toEBNF(Tree)(Tree tree, DocComment[] docComments)
{
    EBNF r;
    assert(tree.name == "EBNF");
    assert(tree.childs.length == 1);

    size_t numGlobalDocumentation;
    while (numGlobalDocumentation < docComments.length
            && docComments[numGlobalDocumentation].end.line <= tree.start.line)
        numGlobalDocumentation++;
    if (numGlobalDocumentation
            && docComments[numGlobalDocumentation - 1].end.line >= tree.start.line - 1)
    {
        numGlobalDocumentation--;
        while (numGlobalDocumentation
                && docComments[numGlobalDocumentation - 1].end.line
                >= docComments[numGlobalDocumentation].start.line - 1)
            numGlobalDocumentation--;
    }
    foreach (comment; docComments[0 .. numGlobalDocumentation])
    {
        if (r.globalDocumentation.length)
            r.globalDocumentation ~= "\n";
        r.globalDocumentation ~= comment.content;
    }
    docComments = docComments[numGlobalDocumentation .. $];

    void convertTree(Tree tree)
    {
        if (tree.nodeType == NodeType.array)
        {
            foreach (c; tree.childs)
            {
                convertTree(c);
            }
            return;
        }

        if (tree.name == "SymbolDeclaration")
        {
            string documentation;
            while (docComments.length && docComments[0].end.line <= tree.end.line)
            {
                if (documentation.length)
                    documentation ~= "\n";
                documentation ~= docComments[0].content;
                docComments = docComments[1 .. $];
            }

            r.symbols ~= toDeclaration(tree);
            r.symbols[$ - 1].documentation = documentation;
        }
        else if (tree.name == "MatchDeclaration")
        {
            assert(tree.childs.length == 4);
            r.matchingTokens ~= [
                tree.childs[1].childs[0].content, tree.childs[2].childs[0].content
            ];
        }
        else if (tree.name == "Import")
        {
            assert(tree.childs.length == 3);
            r.imports ~= tree.childs[1].content;
        }
        else if (tree.name == "OptionDeclaration")
        {
            assert(tree.childs.length == 2);

            bool found = false;
            static foreach (name; [
                    "startTokenID", "startNonterminalID", "startProductionID"
                ])
            {
                if (tree.childs[0].content == name)
                {
                    assert(!found);
                    found = true;
                    mixin("r." ~ name ~ " = to!size_t(tree.childs[1].content);");
                }
            }
            assert(found);
        }
        else
            assert(false, tree.name);
    }

    convertTree(tree.childs[0]);
    return r;
}

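/// A single documentation comment together with its source location.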
struct DocComment
{
    string content;
    LocationAll start;
    LocationAll end;
}

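/**
Range wrapper around the generated EBNF lexer.

Skips ignored tokens such as whitespace and comments on construction and
after every `popFront`, collecting `///`, `/**` and `/++` style doc
comments into `docComments` along the way.
*/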
struct LexerWrapper
{
    Lexer!(LocationAll, true) lexer;

    alias Location = LocationAll;
    alias LocationDiff = typeof(Location.init - Location.init);

    this(string input, Location startLocation = Location.init)
    {
        lexer = Lexer!(LocationAll, true)(input, startLocation);
        skipIgnoredTokens();
    }

    enum tokenID(string tok) = lexer.tokenID!(tok);

    string tokenName(size_t id)
    {
        return lexer.tokenName(id);
    }

    ref front()
    {
        return lexer.front;
    }

    bool empty()
    {
        return lexer.empty;
    }

    void popFront()
    {
        lexer.popFront();
        skipIgnoredTokens();
    }

    DocComment[] docComments;

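    /**
    Strip comment decoration from the content of a block doc comment.

    Removes leading and trailing characters from `ignoredChars` (e.g. `*`
    or `+` plus whitespace), drops the first line break and then removes
    the longest prefix common to all lines, similar to the usual Ddoc
    cleanup.
    */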
    string cleanDocComment(string ignoredChars)(string content)
    {
        while (content.length && content[0].inCharSet!(ignoredChars))
            content = content[1 .. $];
        if (content.startsWith("\r\n"))
            content = content[2 .. $];
        else if (content.startsWith("\n"))
            content = content[1 .. $];
        while (content.length && content[$ - 1].inCharSet!ignoredChars)
            content = content[0 .. $ - 1];
        bool first = true;
        string commonPrefix;
        foreach (line; content.lineSplitter)
        {
            line = line.stripRight;
            size_t i;
            if (first)
            {
                while (i < line.length && line[i].inCharSet!ignoredChars)
                    i++;
            }
            else
            {
                while (i < line.length && i < commonPrefix.length && line[i] == commonPrefix[i])
                    i++;
                if (i >= line.length)
                    continue;
            }
            commonPrefix = line[0 .. i];
            first = false;
        }
        string content2;
        foreach (line; content.lineSplitter)
        {
            if (line.length > commonPrefix.length)
                content2 ~= line[commonPrefix.length .. $].stripRight ~ "\n";
            else
                content2 ~= "\n";
        }
        return content2;
    }

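    /**
    Advance the lexer past all tokens marked as ignored.

    Line comments starting with `///` and block comments starting with
    `/**` or `/++` are recorded in `docComments` before being skipped.
    */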
    void skipIgnoredTokens()
    {
        while (!lexer.empty && lexer.front.isIgnoreToken)
        {
            if (lexer.front.symbol == lexer.tokenID!"LineComment"
                    && lexer.front.content.startsWith("///"))
            {
                docComments ~= DocComment(lexer.front.content[3 .. $].strip() ~ "\n",
                        lexer.front.currentLocation, lexer.front.currentTokenEnd);
            }
            if (lexer.front.symbol == lexer.tokenID!"BlockComment"
                    && lexer.front.content.startsWith("/**"))
            {
                string content = cleanDocComment!"* \t"(lexer.front.content[3 .. $ - 2]);
                docComments ~= DocComment(content, lexer.front.currentLocation,
                        lexer.front.currentTokenEnd);
            }
            if (lexer.front.symbol == lexer.tokenID!"NestingBlockComment"
                    && lexer.front.content.startsWith("/++"))
            {
                string content = cleanDocComment!"+ \t"(lexer.front.content[3 .. $ - 2]);
                docComments ~= DocComment(content, lexer.front.currentLocation,
                        lexer.front.currentTokenEnd);
            }
            lexer.popFront;
        }
    }
}

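/**
Parse the contents of one grammar file into an `EBNF` structure.

On a parse error, a diagnostic of the form `filename:line:column: message`
is printed to stderr and the `ParseException` is rethrown, so callers only
need to abort.
*/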
EBNF parseEBNF2(string str, string filename)
{
    alias Creator = DynamicParseTreeCreator!(dparsergen.generator.grammarebnf,
            LocationAll, LocationRangeStartEnd);
    auto creator = new Creator;
    EBNF ebnf;
    try
    {
        LexerWrapper lexer = LexerWrapper(str, LocationAll.init);
        Tree tree = parse!(Creator, LexerWrapper)(lexer, creator);
        ebnf = toEBNF(tree, lexer.docComments);
        if (!lexer.empty)
        {
            throw new SingleParseException!LocationAll("input left after parse",
                    lexer.front.currentLocation, lexer.front.currentTokenEnd);
        }
    }
    catch (ParseException e)
    {
        string message;
        auto e2 = cast(SingleParseException!LocationAll) e.maxEndException();
        message ~= filename ~ ":";
        if (e2 !is null)
        {
            message ~= text(e2.markStart.line + 1, ":", e2.markStart.offset, ": ");
            e2.toString(str, (data) { message ~= data; },
                    ExceptionStringFlags.noBacktrace | ExceptionStringFlags.noLocation);
        }
        else
        {
            // Avoid dereferencing a null e2 when the cast above fails.
            message ~= " " ~ e.msg;
        }
        message = message.strip();
        stderr.writeln(message);
        throw e;
    }

    return ebnf;
}

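/**
Read a grammar file and, transitively, all grammar files it imports.

Import paths are resolved relative to the directory of the importing file
and every file is parsed with `parseEBNF2`. The first element of the
returned array is the grammar from `firstfilename` itself.
*/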
EBNF[] readEBNFFiles(string firstfilename)
{
    string input = readText(firstfilename);

    EBNF[] ebnfs = [input.parseEBNF2(firstfilename)];

    auto imports = new TodoList!string;
    void addImports(string currentFilename, string[] importDecls)
    {
        foreach (filename; importDecls)
        {
            assert(filename[0] == '"');
            assert(filename[$ - 1] == '"');
            filename = filename[1 .. $ - 1];
            filename = absolutePath(filename, dirName(absolutePath(currentFilename)));
            imports.put(filename);
        }
    }

    addImports(firstfilename, ebnfs[0].imports);
    foreach (filename; imports.keys)
    {
        string input2 = readText(filename);
        EBNF ebnf2 = input2.parseEBNF2(filename);
        ebnfs ~= ebnf2;
        addImports(filename, ebnf2.imports);
    }
    return ebnfs;
}

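/**
Command line entry point of the parser generator.

Reads the given grammar file (plus imports), applies the requested grammar
rewritings and writes the selected outputs: parser module, lexer module,
documentation, graphs and/or the final rewritten grammar. A typical
invocation (the file and module names here are only an example) could be:

---
dparsergen grammar.ebnf --parser parser.d --module parser --lexer lexer.d --lexer-module lexer
---
*/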
int main(string[] args)
{
    GlobalOptions globalOptions = new GlobalOptions;
    string packagename;
    string parsermodule;
    string lexermodule;
    bool optimizationEmpty;
    bool regexlookahead;
    string outfilename;
    string dotfilename;
    string lexerfilename;
    string lexerdfafilename;
    string finalgrammarfilename;
    string docfilenamehtml;
    string docfilenamemd;

    auto helpInformation = getopt(
        args,
        "o|parser", "Output filename for parser", &outfilename,
        "module", "Set module name for parser", &parsermodule,
        "lexer-module", "Set module name for lexer", &lexermodule,
        "package", "Set package for parser and lexer", &packagename,
        "lexer", "Generate lexer in this file", &lexerfilename,
        "glr", "Generate parser with GLR instead of LALR", &globalOptions.glrParser,
        "combinedreduce", "Allow resolving some conflicts", () { globalOptions.delayedReduce = DelayedReduce.combined; },
        "mergesimilarstates", "Reduce number of parser states by combining some states", &globalOptions.mergeSimilarStates,
        "lexer-dfa", "Generate graph for lexer", &lexerdfafilename,
        "graph", "Generate graph for parser", &dotfilename,
        "finalgrammar", "Generate grammar file after all grammar rewritings", &finalgrammarfilename,
        "doc-html", "Generate documentation in HTML format", &docfilenamehtml,
        "doc-md", "Generate documentation in Markdown format", &docfilenamemd,
        "optdescent", "Try to make decisions in the parser earlier", &globalOptions.optimizationDescent,
        "optempty", "Rewrite grammar to remove empty productions", &optimizationEmpty,
        "regexlookahead", "Try to resolve conflicts with arbitrary lookahead", &regexlookahead,
        );

    if (helpInformation.helpWanted || args.length != 2)
    {
        if (helpInformation.options[$ - 1].optShort == "-h")
            helpInformation.options[$ - 1].help = "Print this help and exit";

        size_t ls, ll;
        foreach (it; helpInformation.options)
        {
            ls = max(ls, it.optShort.length);
            ll = max(ll, it.optLong.length);
        }

        writeln("dparsergen grammar.ebnf [OPTIONS]");
        foreach (it; helpInformation.options)
        {
            writefln("%*s %s%*s%s%s", ls, it.optShort,
                it.optLong, ll - it.optLong.length, "",
                it.help.length ? " " : "", it.help);
        }

        return 0;
    }

    string grammarfilename = args[1];

    if (parsermodule.length == 0)
    {
        parsermodule = baseName(outfilename, ".d");
    }

    if (lexermodule.length == 0)
    {
        lexermodule = baseName(lexerfilename, ".d");
    }

    if (packagename.length)
    {
        parsermodule = packagename ~ "." ~ parsermodule;
        lexermodule = packagename ~ "." ~ lexermodule;
    }

    if (globalOptions.glrParser)
    {
        globalOptions.delayedReduce = DelayedReduce.none;
    }

    EBNF[] ebnfs;
    try
    {
        ebnfs = readEBNFFiles(grammarfilename);
    }
    catch (ParseException e)
    {
        /* Already printed in parseEBNF2. */
        return 1;
    }
    catch (Exception e)
    {
        if (e.msg == "Enforcement failed")
            stderr.writeln(e);
        else
            stderr.writeln(e.msg);
        return 1;
    }

    EBNF ebnf = ebnfs[0];
    foreach (ebnf2; ebnfs[1 .. $])
    {
        ebnf.symbols ~= ebnf2.symbols;
        ebnf.matchingTokens ~= ebnf2.matchingTokens;
    }

    if (docfilenamehtml.length)
    {
        import dparsergen.generator.doc;

        genDoc(ebnfs[0], docfilenamehtml, DocType.html);
    }
    if (docfilenamemd.length)
    {
        import dparsergen.generator.doc;

        genDoc(ebnfs[0], docfilenamemd, DocType.markdown);
    }

    EBNFGrammar grammar;
    try
    {
        grammar = ebnf.createGrammar();
    }
    catch (Exception e)
    {
        if (e.msg == "Enforcement failed")
            stderr.writeln(e);
        else
            stderr.writeln(e.msg);
        return 1;
    }
    if (globalOptions.glrParser)
        grammar.tokens.id("$flushreduces");

    bool hasRegArray;
    bool hasDeactivated;
    void checkAnnotations(Annotations annotations)
    {
        if (annotations.contains!"regArray")
            hasRegArray = true;
        if (annotations.contains!"deactivated")
            hasDeactivated = true;
        if (annotations.contains!"directUnwrap")
            globalOptions.directUnwrap = true;
    }

    foreach (i; grammar.nonterminals.allIDs)
        checkAnnotations(grammar.nonterminals[i].annotations);
    foreach (i; grammar.tokens.allIDs)
        checkAnnotations(grammar.tokens[i].annotations);
    foreach (p; grammar.productions)
    {
        checkAnnotations(p.annotations);
        foreach (s; p.symbols)
            checkAnnotations(s.annotations);
    }

    string symbolName(Symbol n)
    {
        if (n.id == size_t.max)
            return "";
        return grammar.getSymbolName(n);
    }

    enforce(!hasRegArray || !hasDeactivated);

    if (hasRegArray)
        grammar = createRegArrayGrammar(ebnf, grammar);

    if (optimizationEmpty)
        grammar = createOptEmptyGrammar(ebnf, grammar);

    if (hasDeactivated)
        grammar = createGrammarWithoutDeactivatedProductions(grammar);

    EBNFGrammar lexerGrammar;
    if (lexerfilename.length || finalgrammarfilename.length)
        lexerGrammar = createLexerGrammar(ebnf, grammar);

    if (finalgrammarfilename.length)
    {
        writeFinalGrammarFile(finalgrammarfilename, grammar, lexerGrammar);
    }

    if (outfilename.length)
    {
        LRGraph graphWithDeactivated = null;
        if (hasDeactivated)
            graphWithDeactivated = makeLRGraph(grammar.origGrammar, globalOptions);
        auto graph = makeLRGraph(grammar, globalOptions, graphWithDeactivated);
        foreach (i, ref s; graph.states)
        {
            bool deactivated;
            foreach (element; s.elements)
                if (element.production.annotations.contains!"deactivated"())
                    deactivated = true;
            if (deactivated)
                graph.states[i] = new LRGraphNode();
        }

        const(char)[] output;
        if (globalOptions.glrParser)
            output = dparsergen.generator.glrparsercodegen.createParserModule(graph, parsermodule);
        else
            output = dparsergen.generator.parsercodegen.createParserModule(graph,
                    parsermodule, regexlookahead);
        std.file.write(outfilename, output);
    }

    if (lexerfilename.length)
    {
        import dparsergen.generator.lexergenerator;

        try
        {
            const(char)[] output = createLexerCode(lexerGrammar, lexermodule, lexerdfafilename);
            std.file.write(lexerfilename, output);
        }
        catch (Exception e)
        {
            if (e.msg == "Enforcement failed")
                stderr.writeln(e);
            else
                stderr.writeln(e.msg);
            return 1;
        }
    }

    return 0;
}