1 
2 //          Copyright Tim Schendekehl 2023.
3 // Distributed under the Boost Software License, Version 1.0.
4 //    (See accompanying file LICENSE_1_0.txt or copy at
5 //          https://www.boost.org/LICENSE_1_0.txt)
6 
7 module dparsergen.generator.lexergenerator;
8 import dparsergen.core.utils;
9 import dparsergen.generator.codewriter;
10 import dparsergen.generator.grammar;
11 import dparsergen.generator.nfa;
12 import dparsergen.generator.parsercodegencommon;
13 import dparsergen.generator.production;
14 import std.algorithm;
15 import std.conv;
16 import std.exception;
17 import std.range;
18 import std.stdio;
19 import std.uni;
20 import std.utf;
21 
/// Result attached to an accepting lexer state: which token (nonterminal)
/// was matched and how that match is to be interpreted.
struct LexerAction
{
    // Matched token; invalid for helper results (e.g. the subtracted part
    // of a "$tokenminus" rule).
    NonterminalID nonterminal = NonterminalID.invalid;
    // Set for matches that only mark text as excluded/ignored
    // (e.g. the second symbol of a "$tokenminus" production).
    bool isIgnoreToken;
    // Set for results created from a production's negative lookaheads;
    // such a match forbids the token instead of accepting it.
    bool isNegLookahead;
}
28 
/// Label of an edge in the lexer graph: either a set of codepoints to
/// consume, special edge flags, or a reference to a recursive lexer
/// sub-sequence.
struct DCharToken
{
    CodepointSet* codepoints;
    EdgeFlags specialFlags;
    immutable(SymbolInstance)[] recursiveSequence;
    enum invalid = DCharToken();
    /// Human-readable description of this edge label, for debug output.
    string toString() const
    {
        if (codepoints !is null)
            return codepointSetToStr(*cast(CodepointSet*) codepoints);
        else if (recursiveSequence.length > 0)
            return "RecursiveLexer";
        else
            return text(specialFlags);
    }
}
44 
/// Renders a codepoint set in character-class notation, e.g. "[a-z_]".
/// A set containing U+10FFFF is treated as inverted and printed with
/// a leading "^".
string codepointSetToStr(CodepointSet set)
{
    string result = "[";

    // Sets reaching the maximal codepoint are printed in negated form.
    if (0x10ffff in set)
    {
        result ~= "^";
        set = set.inverted;
    }

    foreach (interval; set.byInterval)
    {
        auto lo = interval[0];
        auto hi = interval[1] - 1;
        if (lo == hi)
            result ~= lo.escapeCodePoint(true);
        else
            result ~= text(lo.escapeCodePoint(true), "-", hi.escapeCodePoint(true));
    }
    return result ~ "]";
}
66 
/// Intersects `set` with `assumed`, then re-adds every gap of `assumed`
/// whose both neighbouring codepoints remain in the result. This keeps
/// generated range checks simple without changing behaviour for the
/// codepoints that can actually occur.
CodepointSet restrictToAssumedCodepoints(CodepointSet set, CodepointSet assumed)
{
    CodepointSet result = set & assumed;
    foreach (gap; assumed.inverted.byInterval)
    {
        // Bridge the gap only if the codepoints directly before and
        // after it are both contained in the (possibly grown) result.
        bool leftInside = gap[0] != 0 && gap[0] - 1 in result;
        if (leftInside && gap[1] in result)
            result |= CodepointSet(gap[0], gap[1]);
    }
    return result;
}
77 
/// Generates a D boolean expression testing `varCode` against `set`,
/// simplified under the assumption that only codepoints in `assumed`
/// can occur at this point.
string codepointSetToCode(string varCode, CodepointSet set,
        CodepointSet assumed = CodepointSet().inverted)
{
    // A set containing the maximal codepoint encodes an inverted class.
    bool inverted = 0x10ffff in set;
    if (inverted)
        set = set.inverted;
    return codepointSetToCode(varCode, restrictToAssumedCodepoints(set, assumed), inverted);
}
90 
/// Generates a D boolean expression that is true iff `varCode` is in
/// `set` (or outside of it, when `inverted` is set).
string codepointSetToCode(string varCode, CodepointSet set, bool inverted)
{
    // Empty set: the condition is constant.
    if (set.empty)
        return inverted.to!string;

    // Single codepoint: a plain (in)equality is enough.
    if (set.byInterval.length == 1 && set.byInterval.front[0] + 1 == set.byInterval.front[1])
        return text(varCode, inverted ? " != " : " == ",
                "'", set.byInterval.front[0].escapeCodePoint(false), "'");

    string r;
    if (inverted)
        r ~= "!(";

    // One comparison (or range check) per interval, joined with "||".
    CommaGen orCode = CommaGen(" || ");
    foreach (interval; set.byInterval)
    {
        r ~= orCode();
        if (interval[0] + 1 == interval[1])
            r ~= text(varCode, " == '", interval[0].escapeCodePoint(false), "'");
        else
            r ~= text("(", varCode, " >= '", interval[0].escapeCodePoint(false),
                    "' && ", varCode, " <= '", (interval[1] - 1).escapeCodePoint(false), "')");
    }
    if (inverted)
        r ~= ")";
    return r;
}
127 
/// Adds the graph fragment for a single terminal `symbol` between
/// `start` and `end`.
///
/// The symbol's name decides its form: a double-quoted string literal
/// creates one node per character, a bracketed character class creates
/// direct edges from `start` to `end`, and single-quoted names are not
/// implemented. `codepointSets` is a partition of codepoints; every
/// consuming edge references an element of it that overlaps the wanted
/// characters. `edgeFlags` is applied to the edge for the last character
/// of a string literal, or to every edge of a character class.
void generateTokenGraph(const EBNFGrammar lexerGrammar, Graph!(DCharToken, LexerAction) g,
        CodepointSet*[] codepointSets, Symbol symbol, Graph!(DCharToken, LexerAction).NodeID start,
        Graph!(DCharToken, LexerAction).NodeID end, EdgeFlags edgeFlags = EdgeFlags.none)
{
    alias G = Graph!(DCharToken, LexerAction);
    alias NodeID = G.NodeID;

    string name = lexerGrammar.getSymbolName(symbol);

    if (name.endsWith("\""))
    {
        // String literal token: one node per (unescaped) character.
        assert(name[$ - 1] == name[0]);
        name = name[1 .. $ - 1];
        NodeID current = start;
        UnescapedChar[] chars = name.byDchar.byUnescapedDchar.array;
        // Flags need a consuming edge to be attached to.
        if (edgeFlags != EdgeFlags.none)
            enforce(chars.length);
        foreach (k, d; chars)
        {
            NodeID next = g.addNode(text("x ", k));

            // Flags apply only to the edge for the last character.
            EdgeFlags edgeFlags2;
            if (k == chars.length - 1)
            {
                edgeFlags2 = edgeFlags;
            }

            CodepointSet set = CodepointSet(d.c, d.c + 1);
            foreach (codepointSet; codepointSets)
            {
                if (!(set & *codepointSet).empty)
                    g.addEdge(current, next, DCharToken(codepointSet), edgeFlags2);
            }
            current = next;
        }
        g.addEdge(current, end, DCharToken.invalid);
    }
    else if (name.endsWith("]"))
    {
        // Character class token: one edge per overlapping codepoint set.
        assert(name[0] == '[');
        name = name[1 .. $ - 1];
        CodepointSet set = codepointSetFromStr(name);
        foreach (codepointSet; codepointSets)
        {
            if (!(set & *codepointSet).empty)
                g.addEdge(start, end, DCharToken(codepointSet), edgeFlags);
        }
    }
    else if (name.endsWith("\'"))
    {
        assert(name[$ - 1] == name[0]);
        name = name[1 .. $ - 1];
        throw new Exception("not implemented");
    }
    else
        assert(false);
}
188 
/// Builds the (nondeterministic) lexer graph fragment for
/// `currentNonterminal` between `start` and `end` in `g`.
///
/// Productions are expanded recursively via the nested `genSubgraph`;
/// `graphByNonterminal` caches sub-DFAs built by the other
/// `genGraphForNonterminal` overload for "$tokenminus" rules and for
/// symbols annotated with @store/@compareTrue/@compareFalse.
/// Throws on cyclic symbol use that cannot be expressed as a regular
/// language (see the "cycle" / "not regular" exceptions below).
void genGraphForNonterminal(const EBNFGrammar lexerGrammar, Graph!(DCharToken, LexerAction) g,
        CodepointSet*[] codepointSets, NonterminalID currentNonterminal, Graph!(DCharToken, LexerAction)
        .NodeID start, Graph!(DCharToken, LexerAction).NodeID end,
        ref Graph!(DCharToken, LexerAction)[NonterminalID] graphByNonterminal)
{
    alias G = Graph!(DCharToken, LexerAction);
    alias NodeID = G.NodeID;

    // Symbols currently being expanded, for cycle detection.
    bool[Symbol] done;
    void genSubgraph(Symbol symbol, NodeID start, NodeID end,
            EdgeFlags firstEdgeFlags = EdgeFlags.none)
    {
        assert(!symbol.isToken);
        if (symbol in done && done[symbol])
            throw new Exception(text("cycle ", lexerGrammar.getSymbolName(symbol)));
        done[symbol] = true;
        scope (success)
            done[symbol] = false;

        string name = lexerGrammar.getSymbolName(symbol);

        // String literals, character classes and quoted names are
        // terminals; delegate to generateTokenGraph.
        if (name.endsWith("\"") || name.endsWith("]") || name.endsWith("\'"))
        {
            generateTokenGraph(lexerGrammar, g, codepointSets, symbol, start, end, firstEdgeFlags);
        }
        else if (name.startsWith("$tokenminus"))
        {
            // Token subtraction: expects exactly one production with two
            // symbols (the token and the part to subtract). A separate DFA
            // is built where matches of the subtracted symbol are marked
            // isIgnoreToken, then only states accepting the token but not
            // the subtracted part keep a result.
            size_t foundProductions;
            foreach (p; lexerGrammar.productions)
            {
                if (p.nonterminalID != symbol.toNonterminalID)
                    continue;
                foundProductions++;

                assert(p.symbols.length == 2);

                G dfa2 = new G();
                dfa2.start = dfa2.addNode("start");

                foreach (i, s; p.symbols)
                {
                    if (s.isToken)
                    {
                        auto endM = dfa2.addNode("end");
                        generateTokenGraph(lexerGrammar, dfa2, codepointSets,
                                s.toTokenID, dfa2.start, endM, EdgeFlags.none);
                        // i > 0: the second symbol is the subtracted one.
                        dfa2.get(endM).results = [
                            LexerAction(NonterminalID.invalid, i > 0)
                        ];
                    }
                    else
                    {
                        auto prevNodesLength = dfa2.nodes.length;
                        auto g1 = genGraphForNonterminal(lexerGrammar,
                                codepointSets, s.toNonterminalID, graphByNonterminal);

                        // Copy the cached sub-DFA and rewrite its results
                        // to plain accept/ignore markers.
                        auto startM = dfa2.addCopy(g1)[0];
                        foreach (j; prevNodesLength .. dfa2.nodes.length)
                        {
                            if (dfa2.nodes[j].results.length)
                                dfa2.nodes[j].results = [
                                    LexerAction(NonterminalID.invalid, i > 0)
                                ];
                        }
                        dfa2.addEdge(dfa2.start, startM, DCharToken.invalid);
                    }
                }

                dfa2 = dfa2.makeDeterministic(dfa2.start);
                dfa2 = dfa2.minimizeDFA;
                // Keep results only for states accepting the token but
                // not the subtracted part.
                foreach (j; 0 .. dfa2.nodes.length)
                {
                    if (dfa2.nodes[j].results.length)
                    {
                        bool hasNotIgnored, hasIgnored;
                        foreach (r; dfa2.nodes[j].results)
                            if (r.isIgnoreToken)
                                hasIgnored = true;
                            else
                                hasNotIgnored = true;
                        if (!hasIgnored && hasNotIgnored)
                            dfa2.nodes[j].results = [
                                LexerAction(NonterminalID.invalid)
                            ];
                        else
                            dfa2.nodes[j].results = [];
                    }
                }
                dfa2 = dfa2.makeDeterministic(dfa2.start);
                dfa2 = dfa2.removeDeadStates;
                dfa2 = dfa2.minimizeDFA;

                // Flags on the first edge are impossible if the DFA can
                // accept the empty string.
                enforce(firstEdgeFlags == EdgeFlags.none || dfa2.get(dfa2.start).results.length == 0);

                auto prevNodesLength = g.nodes.length;

                // Splice the subtraction DFA into g; accepting states
                // become epsilon edges to `end`.
                auto startM = g.addCopy(dfa2)[0];
                g.addEdge(start, startM, DCharToken.invalid);
                foreach (j; prevNodesLength .. g.nodes.length)
                {
                    if (g.nodes[j].results.length)
                    {
                        g.nodes[j].results = [];
                        g.addEdge(g.nodeID(j), end, DCharToken.invalid);
                    }
                }
                foreach (ref e; g.get(startM).edges)
                {
                    e.flags = firstEdgeFlags;
                }
            }
            assert(foundProductions == 1, text(name, " foundProductions=", foundProductions));
        }
        else
        {
            // Regular nonterminal: expand each of its productions.
            foreach (p; lexerGrammar.productions)
            {
                if (p.nonterminalID != symbol.toNonterminalID)
                    continue;

                enforce(p.symbols.length != 1 || p.symbols[0] != symbol, text("Error: Symbol ",
                        lexerGrammar.getSymbolName(symbol), " recursively uses just itself"));

                NodeID startP = g.addNode("start " ~ lexerGrammar.productionString(p));
                NodeID endP = g.addNode("end " ~ lexerGrammar.productionString(p));
                g.addEdge(endP, end, DCharToken.invalid);

                NodeID current = startP;
                // Self-references at the edges of the production replace
                // the normal start/end epsilon edges (see below).
                bool connectStart = true, connectEnd = true;

                if (p.annotations.contains!"recursiveLexer"())
                {
                    enforce(firstEdgeFlags == EdgeFlags.none);

                    // Tokens are expanded normally until the first
                    // nonterminal; the rest of the production is deferred
                    // to a recursive-lexer edge (DCharToken.recursiveSequence).
                    foreach (i, s; p.symbols)
                    {
                        NodeID next = g.addNode("x");
                        if (s.isToken)
                            generateTokenGraph(lexerGrammar, g, codepointSets, s, current, next);
                        else
                        {
                            g.addEdge(current, next, DCharToken(null,
                                    EdgeFlags.none, p.symbols[i .. $]));
                            current = next;
                            break;
                        }
                        current = next;
                    }
                    if (connectStart)
                        g.addEdge(start, startP, DCharToken.invalid);
                    if (connectEnd)
                        g.addEdge(current, endP, DCharToken.invalid);
                }
                else
                {
                    // Flags still pending for the next consuming edge.
                    EdgeFlags nextEdgeFlags = firstEdgeFlags;
                    foreach (i, s; p.symbols)
                    {
                        EdgeFlags nextEdgeFlags2 = nextEdgeFlags;
                        nextEdgeFlags = EdgeFlags.none;
                        if (s.annotations.contains!"store" || s.annotations.contains!"compareTrue"
                                || s.annotations.contains!"compareFalse")
                        {
                            enforce(i || nextEdgeFlags2 == EdgeFlags.none);
                            enforce(s != symbol);
                            //enforce(i + 1 < p.symbols.length);

                            // Build (or fetch) a DFA for the annotated
                            // symbol, then splice it in with store/compare
                            // flags on the entry, middle, and exit edges.
                            G dfa2;
                            if (s.isToken)
                            {
                                dfa2 = new G();
                                dfa2.start = dfa2.addNode("start");
                                auto endM = dfa2.addNode("end");
                                generateTokenGraph(lexerGrammar, dfa2, codepointSets,
                                        s.toTokenID, dfa2.start, endM, EdgeFlags.none);
                                dfa2.get(endM).results = [
                                    LexerAction(NonterminalID.invalid, true)
                                ];
                            }
                            else
                            {
                                dfa2 = genGraphForNonterminal(lexerGrammar,
                                        codepointSets, s.toNonterminalID, graphByNonterminal);
                            }

                            NodeID next = g.addNode("x");

                            auto prevNodesLength = g.nodes.length;
                            auto startM = g.addCopy(dfa2)[0];
                            auto newNodesLength = g.nodes.length;
                            // Inner edges of a compared region get the
                            // compareMiddle flag.
                            foreach (j; prevNodesLength .. newNodesLength)
                            {
                                foreach (ref e; g.nodes[j].edges)
                                {
                                    if (!s.annotations.contains!"store")
                                        e.flags |= EdgeFlags.compareMiddle;
                                }
                            }
                            // Entry edges: bypass the copied start node,
                            // marking the start of the stored/compared text.
                            foreach (ref e; g.get(startM).edges)
                            {
                                if (g.get(e.next).edges.length)
                                    g.addEdge(current, e.next, e.symbol, s.annotations.contains!"store"
                                            ? EdgeFlags.storeStart : EdgeFlags.compareStart);
                                if (g.get(e.next).results.length)
                                    g.addEdge(current, next, e.symbol, s.annotations.contains!"store"
                                            ? EdgeFlags.storeStart
                                            : (EdgeFlags.compareStart | (s.annotations.contains!"compareTrue"
                                                ? EdgeFlags.compareEndTrue
                                                : EdgeFlags.compareEndFalse)));
                            }
                            // Exit edges: accepting states of the copy
                            // lead to `next`, marking the end of the
                            // stored/compared text.
                            foreach (j; prevNodesLength .. newNodesLength)
                            {
                                foreach (e; g.nodes[j].edges)
                                {
                                    if (g.get(e.next).results.length)
                                    {
                                        g.addEdge(g.nodeID(j), next, e.symbol,
                                                s.annotations.contains!"store" ? EdgeFlags.none
                                                : s.annotations.contains!"compareTrue"
                                                ? EdgeFlags.compareEndTrue
                                                : EdgeFlags.compareEndFalse);
                                    }
                                }
                            }
                            // The copied results only marked accepting
                            // states; they are not real token results here.
                            foreach (j; prevNodesLength .. newNodesLength)
                            {
                                g.nodes[j].results = [];
                            }

                            if (s.annotations.contains!"store")
                            {
                                NodeID next2 = g.addNode("x");
                                g.addEdge(next, next2, DCharToken(null, EdgeFlags.storeEnd));
                                next = next2;
                            }

                            current = next;
                        }
                        else if (i == p.symbols.length - 1 && s == symbol
                                && firstEdgeFlags != EdgeFlags.none)
                        {
                            // Tail self-reference while flags are pending:
                            // expand once more inline (temporarily clear
                            // the cycle marker to allow it).
                            done[symbol] = false;
                            NodeID next = g.addNode("x");
                            if (s.isToken)
                                generateTokenGraph(lexerGrammar, g, codepointSets,
                                        s, current, next, EdgeFlags.none);
                            else
                                genSubgraph(s, current, next, EdgeFlags.none);
                            current = next;
                            done[symbol] = true;
                        }
                        else if (s == symbol)
                        {
                            // Self-reference is only regular at the first
                            // or last position; model it as a backwards
                            // epsilon edge instead of recursing.
                            if (i != 0 && i != p.symbols.length - 1)
                                throw new Exception("not regular");

                            if (i == 0)
                            {
                                g.addEdge(end, current, DCharToken.invalid, EdgeFlags.backwards);
                                connectStart = false;
                            }
                            if (i == p.symbols.length - 1)
                            {
                                g.addEdge(current, start, DCharToken.invalid, EdgeFlags.backwards);
                                connectEnd = false;
                            }
                        }
                        else
                        {
                            // Ordinary symbol: expand in sequence,
                            // passing any pending edge flags along.
                            NodeID next = g.addNode("x");
                            if (s.isToken)
                                generateTokenGraph(lexerGrammar, g, codepointSets,
                                        s, current, next, nextEdgeFlags2);
                            else
                                genSubgraph(s, current, next, nextEdgeFlags2);
                            current = next;
                        }
                    }
                    enforce(nextEdgeFlags == EdgeFlags.none,
                            text("Empty lexer rule used with flags ", nextEdgeFlags));

                    if (connectStart)
                        g.addEdge(start, startP, DCharToken.invalid);
                    if (connectEnd)
                        g.addEdge(current, endP, DCharToken.invalid);
                }

                // Negative lookaheads: matching them after endP yields a
                // result with isNegLookahead set, forbidding the token.
                foreach (x; p.negLookaheads)
                {
                    auto endNegLookahead = g.addNode(
                            "neglookahead " ~ lexerGrammar.productionString(p) ~ " " ~ text(x));
                    if (x.isToken)
                        generateTokenGraph(lexerGrammar, g, codepointSets, x,
                                endP, endNegLookahead);
                    else
                        genSubgraph(x, endP, endNegLookahead);
                    g.get(endNegLookahead).results = [
                        LexerAction(currentNonterminal, false, true)
                    ];
                }
            }
        }
    }

    genSubgraph(currentNonterminal, start, end);
}
495 
/// Returns a minimized DFA for `currentNonterminal`, building and
/// caching it in `graphByNonterminal` on first use. A `null` cache
/// entry marks a build that is still in progress, so recursive use of
/// the same nonterminal is rejected.
Graph!(DCharToken, LexerAction) genGraphForNonterminal(const EBNFGrammar lexerGrammar, CodepointSet*[] codepointSets,
        NonterminalID currentNonterminal, ref Graph!(DCharToken,
            LexerAction)[NonterminalID] graphByNonterminal)
{
    alias G = Graph!(DCharToken, LexerAction);
    alias NodeID = G.NodeID;

    if (auto cached = currentNonterminal in graphByNonterminal)
    {
        // A null entry means this nonterminal is currently being built.
        enforce(*cached !is null);
        return *cached;
    }

    // Mark as in progress to detect recursive construction.
    graphByNonterminal[currentNonterminal] = null;

    G nfa = new G();
    nfa.start = nfa.addNode("start");
    NodeID acceptNode = nfa.addNode("end");
    nfa.get(acceptNode).results = [LexerAction(currentNonterminal)];

    genGraphForNonterminal(lexerGrammar, nfa, codepointSets,
            currentNonterminal, nfa.start, acceptNode, graphByNonterminal);

    auto dfa = nfa.makeDeterministic(nfa.start).minimizeDFA;
    graphByNonterminal[currentNonterminal] = dfa;
    return dfa;
}
525 
/// A lexer sub-automaton used for recursive lexer rules.
class SubAutomaton
{
    // Symbol sequence this sub-automaton stands for; presumably the tail
    // of a @recursiveLexer production (see DCharToken.recursiveSequence)
    // — confirm against the code consuming subAutomatons.
    immutable(SymbolInstance)[] recursiveSequence;
    // The automaton matching this sequence.
    Graph!(DCharToken, LexerAction) dfa;
}
531 
532 void generateStateMachine(ref CodeWriter code, const EBNFGrammar lexerGrammar,
533         Graph!(DCharToken, LexerAction) dfa, bool inSubAutomaton, SubAutomaton[] subAutomatons)
534 {
535     alias G = Graph!(DCharToken, LexerAction);
536     alias NodeID = G.NodeID;
537 
538     class EdgeData
539     {
540         CodepointSet set;
541         const(SymbolInstance)[] recursiveSequence;
542         G.NodeID next, next2, next3;
543         LexerAction[] saveResult;
544         bool isEndEdge;
545         EdgeFlags flags;
546     }
547 
548     EdgeData[][G.NodeID] edgesPerNode;
549     CodepointSet[G.NodeID] allAllowedSets;
550     foreach (n; dfa.nodeIDs)
551     {
552         EdgeData[] edges;
553         CodepointSet allAllowedSet;
554         bool hasRecursiveLexer;
555         foreach (e; dfa.getEdges(n))
556         {
557             EdgeFlags flags = e.flags;
558             EdgeData x;
559             foreach (x2; edges)
560                 if (e.symbol.codepoints !is null && x2.set == *e.symbol.codepoints)
561                 {
562                     enforce((flags & ~EdgeFlags.compareFlags) == (x2.flags & ~EdgeFlags.compareFlags),
563                             text(n, flagsToString(flags), " ",
564                                 flagsToString(x2.flags), " ", x2.set));
565                     flags |= x2.flags;
566                     x = x2;
567                 }
568             if (x !is null)
569             {
570                 if (e.symbol.codepoints !is null)
571                     x.set |= *e.symbol.codepoints;
572                 x.recursiveSequence = e.symbol.recursiveSequence;
573                 x.flags |= flags;
574                 if (e.symbol.recursiveSequence.length)
575                     hasRecursiveLexer = true;
576                 allAllowedSet |= x.set;
577             }
578             else
579             {
580                 x = new EdgeData();
581                 if (e.symbol.codepoints !is null)
582                     x.set = *e.symbol.codepoints;
583                 x.recursiveSequence = e.symbol.recursiveSequence;
584                 if (e.symbol.recursiveSequence.length)
585                     hasRecursiveLexer = true;
586                 x.flags = flags;
587                 allAllowedSet |= x.set;
588                 edges ~= x;
589             }
590 
591             if (e.flags & EdgeFlags.compareEndTrue)
592             {
593                 x.next3 = e.next;
594             }
595             else if ((e.flags & EdgeFlags.compareFlags) && !(e.flags & EdgeFlags.compareEndFalse))
596             {
597                 x.next2 = e.next;
598             }
599             else
600             {
601                 x.next = e.next;
602             }
603         }
604         if (hasRecursiveLexer)
605         {
606             enforce(edges.length == 1);
607         }
608         else
609         {
610             edges ~= new EdgeData();
611             edges[$ - 1].set = allAllowedSet.inverted;
612             edges[$ - 1].isEndEdge = true;
613         }
614         edgesPerNode[n] = edges;
615         allAllowedSets[n] = allAllowedSet;
616 
617         /*foreach (e; edges)
618         {
619             writeln(n, e.next, e.next2, " ", codepointSetToStr(e.set), " ", e.recursiveSequence);
620         }*/
621     }
622 
623     // Merge similar edges
624     foreach (n; dfa.nodeIDs)
625     {
626         if (edgesPerNode[n].length == 1 && edgesPerNode[n][0].recursiveSequence.length)
627             continue;
628 
629         struct Key
630         {
631             G.NodeID next;
632             G.NodeID next2;
633             G.NodeID next3;
634             bool isEndEdge;
635             EdgeFlags flags;
636             int opCmp(const Key other) const
637             {
638                 if (isEndEdge != other.isEndEdge)
639                 {
640                     if (other.isEndEdge)
641                         return -1;
642                     if (isEndEdge)
643                         return 1;
644                 }
645                 if (flags < other.flags)
646                     return -1;
647                 if (flags > other.flags)
648                     return 1;
649                 int r = next.opCmp(other.next);
650                 if (r)
651                     return r;
652                 r = next2.opCmp(other.next2);
653                 if (r)
654                     return r;
655                 r = next3.opCmp(other.next3);
656                 return r;
657             }
658         }
659 
660         DCharToken[Key] edgesMap;
661         EdgeData[] edges;
662         foreach (e; edgesPerNode[n])
663         {
664             Key key = Key(e.next, e.next2, e.next3, e.isEndEdge, e.flags);
665             if (key !in edgesMap)
666             {
667                 edgesMap[key] = DCharToken(new CodepointSet());
668             }
669             *edgesMap[key].codepoints |= e.set;
670         }
671         foreach (key; edgesMap.sortedKeys)
672         {
673             auto e = edgesMap[key];
674             EdgeData e2 = new EdgeData();
675             e2.next = key.next;
676             e2.next2 = key.next2;
677             e2.next3 = key.next3;
678             e2.isEndEdge = key.isEndEdge;
679             e2.flags = key.flags;
680             e2.set = *e.codepoints;
681             edges ~= e2;
682         }
683         edgesPerNode[n] = edges;
684     }
685 
686     LexerAction[][G.NodeID] reachableResultsPerNode;
687     foreach (n; dfa.nodeIDs)
688     {
689         reachableResultsPerNode[n] = reachableResults(dfa, n);
690     }
691 
692     // Calculate, which results have to be saved, because it's
693     // not known if there will be a longer token.
694     foreach (n; dfa.nodeIDs)
695     {
696         foreach (e; edgesPerNode[n])
697         {
698             if (e.isEndEdge)
699                 continue;
700             bool[LexerAction] resNext;
701             bool[LexerAction] reachableNext;
702             if (e.next != NodeID.invalid)
703             {
704                 foreach (res; dfa.get(e.next).results)
705                 {
706                     resNext[res] = true;
707                 }
708                 foreach (res; reachableResultsPerNode[e.next])
709                 {
710                     reachableNext[res] = true;
711                 }
712             }
713             if (e.next2 != NodeID.invalid)
714             {
715                 bool[LexerAction] resNext2 = resNext;
716                 resNext = null;
717                 foreach (res; dfa.get(e.next2).results)
718                 {
719                     if (e.next == NodeID.invalid || res in resNext2)
720                         resNext[res] = true;
721                 }
722                 foreach (res; reachableResultsPerNode[e.next2])
723                 {
724                     reachableNext[res] = true;
725                 }
726             }
727             foreach (res; dfa.get(n).results)
728             {
729                 if (res.isNegLookahead)
730                     continue;
731                 if (LexerAction(res.nonterminal, res.isIgnoreToken, true) in reachableNext)
732                     continue;
733                 if (res !in resNext)
734                     e.saveResult.addOnce(res);
735             }
736         }
737     }
738 
739     CodepointSet[][G.NodeID] shortestPaths;
740     void findShortestPaths(CodepointSet[] path, G.NodeID node)
741     {
742         if (node !in shortestPaths || path.length < shortestPaths[node].length)
743         {
744             shortestPaths[node] = path;
745             CodepointSet[G.NodeID] edges;
746             foreach (e; edgesPerNode[node])
747             {
748                 if (e.isEndEdge)
749                     continue;
750                 if (e.next.id != size_t.max)
751                 {
752                     if (e.next !in edges)
753                         edges[e.next] = e.set;
754                     else
755                         edges[e.next] |= e.set;
756                 }
757                 if (e.next2.id != size_t.max)
758                 {
759                     if (e.next2 !in edges)
760                         edges[e.next2] = e.set;
761                     else
762                         edges[e.next2] |= e.set;
763                 }
764             }
765             foreach (n; edges.sortedKeys)
766             {
767                 findShortestPaths(path ~ edges[n], n);
768             }
769         }
770     }
771 
772     findShortestPaths([], dfa.start);
773 
774     bool[G.NodeID] nodesWithCompare;
775     foreach (n; dfa.nodeIDs)
776     {
777         foreach (e; dfa.getEdges(n))
778         {
779             if (e.flags & (EdgeFlags.compareStart | EdgeFlags.compareMiddle))
780                 nodesWithCompare[e.next] = true;
781         }
782     }
783 
    // Generate the lexer code for a single DFA state `n`: a comment header,
    // end-of-input handling, the character-dispatch chain (ASCII fast path
    // and UTF-8 decoding path) and, for accepting states, the "endstate"
    // code that records the matched token.
    // Note: inside the q{} templates below, $$-prefixed lines execute at
    // generator time while the remaining lines are written into the generated
    // lexer; no comments are added inside them, since they would leak into
    // the generated output.
    void genCodeForNode(G.NodeID n)
    {
        // Comment listing tokens accepted exactly here; tokens still reachable
        // from here (but not accepted yet) appear in parentheses.
        string resultsComment;
        bool[LexerAction] resultDone;
        foreach (res; dfa.get(n).results)
        {
            resultDone[res] = true;
            resultsComment ~= " " ~ (res.isNegLookahead
                    ? "!" : "") ~ lexerGrammar.getSymbolName(res.nonterminal);
        }
        auto tmpReachableResults = reachableResultsPerNode[n];
        sort!((a, b) => a.nonterminal.id < b.nonterminal.id)(tmpReachableResults);
        foreach (res; tmpReachableResults)
        {
            if (res in resultDone)
                continue;
            resultsComment ~= " (" ~ (res.isNegLookahead
                    ? "!" : "") ~ lexerGrammar.getSymbolName(res.nonterminal) ~ ")";
        }
        code.writeln("//", resultsComment);

        // Shortest example input reaching this state (from findShortestPaths).
        code.write("// path:");
        if (n in shortestPaths)
            foreach (s; shortestPaths[n])
            {
                code.write(" ", codepointSetToStr(s));
            }
        code.writeln();

        if (n == dfa.start)
            code.decIndent.writeln("start:").incIndent;

        auto edges = edgesPerNode[n];
        CodepointSet allAllowedSet = allAllowedSets[n];

        // Choose which edge to emit next among those still overlapping
        // codepointsStillPossible. Preference: fewer intervals after
        // restriction, then smaller set, then non-end edges, and finally a
        // lexicographic comparison of the intervals — keeping the generated
        // conditions simple and the output deterministic.
        EdgeData findBestNextEdge(CodepointSet codepointsStillPossible)
        {
            EdgeData best;
            foreach (e; edges)
            {
                CodepointSet eSet = e.set & codepointsStillPossible;
                CodepointSet eSet2 = restrictToAssumedCodepoints(e.set, codepointsStillPossible);
                if (eSet.empty)
                    continue;
                if (best is null)
                {
                    best = e;
                    continue;
                }
                CodepointSet bestSet = best.set & codepointsStillPossible;
                CodepointSet bestSet2 = restrictToAssumedCodepoints(best.set,
                        codepointsStillPossible);

                if (eSet2.byInterval.length < bestSet2.byInterval.length)
                    best = e;
                else if (eSet2.byInterval.length > bestSet2.byInterval.length)
                    continue;
                else if (eSet.length < bestSet.length)
                    best = e;
                else if (eSet.length > bestSet.length)
                    continue;
                else if (best.isEndEdge)
                    best = e;
                else
                {
                    // Final tie-break: compare interval lists element-wise.
                    auto eIntervals = eSet.byInterval;
                    auto bestIntervals = bestSet.byInterval;
                    while (!eIntervals.empty && !bestIntervals.empty)
                    {
                        if (eIntervals.front[0] < bestIntervals.front[0])
                        {
                            best = e;
                            break;
                        }
                        else if (eIntervals.front[0] > bestIntervals.front[0])
                        {
                            break;
                        }
                        else if (eIntervals.front[1] < bestIntervals.front[1])
                        {
                            best = e;
                            break;
                        }
                        else if (eIntervals.front[1] > bestIntervals.front[1])
                        {
                            break;
                        }
                        eIntervals.popFront;
                        bestIntervals.popFront;
                    }
                }
            }
            return best;
        }

        // Token priority for conflict resolution:
        // invalid (0) < @ignoreToken (1) < @lowPrio (2) < normal (3).
        int getNonterminalPriority(NonterminalID nonterminal)
        {
            if (nonterminal == NonterminalID.invalid)
                return 0;
            if (lexerGrammar.nonterminals[nonterminal].annotations.contains!"ignoreToken"())
                return 1;
            if (lexerGrammar.nonterminals[nonterminal].annotations.contains!"lowPrio"())
                return 2;
            return 3;
        }

        // Drop actions cancelled by a negative lookahead on the same
        // nonterminal, then keep only the highest-priority remaining actions.
        // A returned slice longer than one element is an unresolved conflict.
        const(LexerAction)[] filterActionConflicts(const(LexerAction)[] resultsAll)
        {
            LexerAction[] best;

            const(LexerAction)[] results;
            const(LexerAction)[] resultsNegLookahead;
            foreach (res; resultsAll)
                if (res.isNegLookahead)
                    resultsNegLookahead ~= res;

            foreach (res; resultsAll)
                if (!res.isNegLookahead)
                {
                    bool hasNegLookahead;
                    foreach (res2; resultsNegLookahead)
                        if (res.nonterminal == res2.nonterminal)
                            hasNegLookahead = true;
                    if (!hasNegLookahead)
                        results ~= res;
                }

            if (results.length <= 1)
                return results;

            foreach (res; results)
            {
                int resPriority = getNonterminalPriority(res.nonterminal);
                int bestPriority = (best.length > 0) ? getNonterminalPriority(best[0].nonterminal)
                    : 0;
                if (resPriority > bestPriority)
                    best = [res];
                else if (resPriority == bestPriority)
                    best ~= res; // conflict
            }
            if (best.length > 1
                    && lexerGrammar.nonterminals[best[0].nonterminal].annotations.contains!"ignoreToken"())
                best.length = 1;
            assert(best.length > 0);
            return best;
        }

        // Emit the if/else dispatch chain on the current character for this
        // state. `ascii` selects the single-byte fast path (currentChar) vs.
        // the decoded-dchar path. The q{} template is left byte-identical.
        void genLexCode(bool ascii)
        {
            mixin(genCode("code", q{
                $$CodepointSet currentAllowedCodepoints = ascii ? unicode.ASCII : (unicode.ASCII.inverted);
                $$if (!ascii /*&& !(allAllowedSet & currentAllowedCodepoints).empty*/) {
                    string inputCopyNext = inputCopy;
                    import std.utf;

                    dchar currentDchar = decodeFront!(Yes.useReplacementDchar)(inputCopyNext);
                $$}
                $$bool needsElse = false;
                $$bool elseStillAllowed = true;
                $$CodepointSet codepointsStillPossible = currentAllowedCodepoints;
                $$while (true) {
                    $$EdgeData e = findBestNextEdge(codepointsStillPossible);
                    $$if (e is null) break;
                    $$if ((e.set & codepointsStillPossible).empty) continue;
                    $$assert(elseStillAllowed);
                    $(needsElse?"else":"")  _
                    $$if ((codepointsStillPossible - e.set).empty) {
                        $("")
                        $$elseStillAllowed = false;
                    $$} else {
                        $(needsElse?" ":"")if ($(codepointSetToCode(ascii?"currentChar":"currentDchar", e.set, codepointsStillPossible)))
                    $$}
                    {
                        $$if (e.isEndEdge) {
                            $$if (dfa.get(n).results.length > 0) {
                                goto endstate$(n.id);
                            $$} else if ((allAllowedSet.inverted & currentAllowedCodepoints).empty) {
                                assert(false);
                            $$} else {
                                $$if (inSubAutomaton) {
                                    if (hasPreviousSymbol)
                                        return false;
                                $$} else {
                                    if (foundSymbol != SymbolID.max)
                                        goto lexerend;
                                $$}
                                else
                                    throw lexerException(text("Error unexpected \'", $(ascii?"currentChar":"currentDchar"), "\'"), "$(codepointSetToStr(allAllowedSet).escapeD)", inputCopy.ptr - input.ptr);
                            $$}
                        $$} else {
                            $$needsElse = true;
                            $$if (e.saveResult.length > 0) {
                                $$auto filteredActionConflicts2 = filterActionConflicts(e.saveResult);
                                $$if (filteredActionConflicts2.length > 1) {
                                    foundSymbol = SymbolID.max;
                                $$} else {
                                    assert(inputCopy.ptr >= input.ptr);
                                    foundSymbol = tokenID!"$(lexerGrammar.getSymbolName(filteredActionConflicts2[0].nonterminal).escapeD)";
                                    foundLength = inputCopy.ptr - input.ptr;
                                    foundIsIgnore = $(filteredActionConflicts2[0].isIgnoreToken);
                                $$}
                            $$}
                            $$if (e.flags & EdgeFlags.storeEnd) {
                                assert(storedStart != size_t.max);
                                storedString = input[storedStart .. inputCopy.ptr - input.ptr];
                                storedStart = size_t.max;
                            $$}
                            $$if (e.flags & (EdgeFlags.compareStart | EdgeFlags.storeStart)) {
                                assert(storedStart == size_t.max);
                                storedStart = inputCopy.ptr - input.ptr;
                            $$}
                            $$if (e.flags & EdgeFlags.compareFlags) {
                                assert(storedStart != size_t.max);
                                string compareString = input[storedStart .. $(ascii ? "inputCopy.ptr + 1": "inputCopyNext.ptr") - input.ptr];
                                bool currentCharCorrect = compareString.length <= storedString.length && $(ascii?"currentChar == storedString[compareString.length - 1]" : "inputCopy[0 .. inputCopyNext.ptr - inputCopy.ptr] == storedString[compareString.length - (inputCopyNext.ptr - inputCopy.ptr)..compareString.length]");
                            $$}
                            $$if (ascii) {
                                inputCopy = inputCopy[1 .. $];
                            $$} else {
                                inputCopy = inputCopyNext;
                            $$}
                            $$if (e.flags & (EdgeFlags.compareFlags)) {
                                if (compareString.length == storedString.length && currentCharCorrect)
                                {
                                    $$if (e.flags & (EdgeFlags.compareEndTrue | EdgeFlags.compareEndFalse)) {
                                        assert(compareString == storedString);
                                        storedStart = size_t.max;
                                    $$}
                                    $$if (e.next3.id == size_t.max) {
                                        throw lexerException(text("Error unexpected \'", $(ascii?"currentChar":"currentDchar"), "\'"), "$(codepointSetToStr(allAllowedSet).escapeD)", inputCopy.ptr - input.ptr);
                                    $$} else {
                                        goto state$(e.next3.id);
                                    $$}
                                }
                                else if (compareString.length < storedString.length && currentCharCorrect)
                                {
                                    $$if (e.next2.id == size_t.max) {
                                        throw lexerException(text("Error unexpected \'", $(ascii?"currentChar":"currentDchar"), "\'"), "$(codepointSetToStr(allAllowedSet).escapeD)", inputCopy.ptr - input.ptr);
                                    $$} else {
                                        goto state$(e.next2.id);
                                    $$}
                                }
                                else
                                {
                                    storedStart = size_t.max;
                                    $$if (e.next.id == size_t.max) {
                                        throw lexerException(text("Error unexpected \'", $(ascii?"currentChar":"currentDchar"), "\'"), "$(codepointSetToStr(allAllowedSet).escapeD)", inputCopy.ptr - input.ptr);
                                    $$} else {
                                        goto state$(e.next.id);
                                    $$}
                                }
                            $$} else {
                                $$if (n in nodesWithCompare) {
                                    storedStart = size_t.max;
                                $$}
                                goto state$(e.next.id);
                            $$}
                        $$}
                    }
                    $$codepointsStillPossible = codepointsStillPossible - e.set;
                $$}
                $$assert(!elseStillAllowed);
                }));
        }

        code.writeln("{").incIndent;
        if (edges.length == 1 && edges[0].recursiveSequence.length)
        {
            // A recursive sub-lexer edge: delegate to the matching lexPartN
            // function instead of matching characters here.
            code.writeln("// RecursiveLexer");
            SubAutomaton a;
            size_t i;
            foreach (i2, a2; subAutomatons)
                if (a2.recursiveSequence == edges[0].recursiveSequence)
                {
                    a = a2;
                    i = i2;
                    break;
                }
            code.writeln("if (lexPart", i, "(inputCopy, ", inSubAutomaton
                    ? "hasPreviousSymbol" : "foundSymbol != SymbolID.max", "))");
            code.writeln("    goto state", edges[0].next.id, ";");
            code.writeln("else");
            if (inSubAutomaton)
                code.writeln("    return false;");
            else
                code.writeln("    goto lexerend;");
        }
        else
        {
            mixin(genCode("code", q{
                if (inputCopy.length == 0)
                $$if (dfa.get(n).results.length >= 1) {
                        $$if (inSubAutomaton) {
                            $$enforce(dfa.getEdges(n).length == 0);
                            return true;
                        $$} else {
                            goto endstate$(n.id);
                        $$}
                $$} else {
                    {
                        $$if (inSubAutomaton) {
                            if (input.ptr == inputCopy.ptr)
                                return false;
                            else if (hasPreviousSymbol)
                                return false;
                        $$} else {
                            if (input.ptr == inputCopy.ptr)
                                goto lexerend;
                            else if (foundSymbol != SymbolID.max)
                                goto lexerend;
                        $$}
                        else
                            throw lexerException("EOF", "$(codepointSetToStr(allAllowedSet).escapeD)", inputCopy.ptr - input.ptr);
                    }
                $$}
                $$if (!allAllowedSet.empty) {
                    char currentChar = inputCopy[0];
                    if (currentChar < 0x80)
                    {
                        $$genLexCode(true);
                    }
                    else
                    {
                        $$genLexCode(false);
                    }
                $$} else {
                    $$if (dfa.get(n).results.length > 0) {
                        $$if (inSubAutomaton) {
                            return true;
                        $$} else {
                            goto endstate$(n.id);
                        $$}
                    $$} else {
                        $$if (inSubAutomaton) {
                            if (hasPreviousSymbol)
                                return false;
                        $$} else {
                            if (foundSymbol != SymbolID.max)
                                goto lexerend;
                        $$}
                        else
                            throw lexerException("Error", "$(codepointSetToStr(allAllowedSet).escapeD)", inputCopy.ptr - input.ptr);
                    $$}
                $$}
                }));
        }
        code.decIndent.writeln("}");

        if (dfa.get(n).results.length > 0)
            code.writeln("endstate", n.id, ":");

        // Emit the code for an accepting state once the longest match ends
        // here: record the found token, or report a conflict.
        void genEndState(const(LexerAction)[] results)
        {
            auto filteredActionConflicts = filterActionConflicts(results);

            if (filteredActionConflicts.length == 0)
            {
                mixin(genCode("code", q{
                    {
                        if (foundSymbol != SymbolID.max)
                            goto lexerend;
                        else
                            throw lexerException("Error", "", inputCopy.ptr - input.ptr);
                    }
                    }));
            }
            else if (filteredActionConflicts.length > 1)
            {
                auto conflictNames = filteredActionConflicts.map!(
                        res => lexerGrammar.getSymbolName(res.nonterminal).escapeD).join(' ');
                stderr.writeln("Warning: Lexer conflict for ", conflictNames);
                mixin(genCode("code", q{
                        throw lexerException("conflict $(conflictNames)", "", inputCopy.ptr - input.ptr);
                    }));
            }
            else if (filteredActionConflicts.length == 1)
            {
                mixin(genCode("code", q{
                    {
                        assert(inputCopy.ptr >= input.ptr);
                        foundSymbol = tokenID!"$(lexerGrammar.getSymbolName(filteredActionConflicts[0].nonterminal).escapeD)";
                        foundLength = inputCopy.ptr - input.ptr;
                        foundIsIgnore = $(filteredActionConflicts[0].isIgnoreToken);
                        goto lexerend;
                    }
                    }));
            }
        }

        // Wrap genEndState in runtime allowToken!T checks for every result
        // annotated @inContextOnly, producing one branch per combination.
        void genEndStateRek(const(LexerAction)[] resultsIn, const(LexerAction)[] results)
        {
            if (resultsIn.length == 0)
                genEndState(results);
            else if (
                lexerGrammar.nonterminals[resultsIn[0].nonterminal].annotations.contains!"inContextOnly"())
            {
                mixin(genCode("code", q{
                        if (allowToken!$(lexerGrammar.nonterminals[resultsIn[0].nonterminal].name.tokenDCode))
                        $$genEndStateRek(resultsIn[1 .. $], results ~ resultsIn[0]);
                        else
                        $$genEndStateRek(resultsIn[1 .. $], results);
                    }));
            }
            else
                genEndStateRek(resultsIn[1 .. $], results ~ resultsIn[0]);
        }

        if (!inSubAutomaton && dfa.get(n).results.length > 0)
            genEndStateRek(dfa.get(n).results, []);

        code.writeln();
    }
1196 
    // Emit one labelled "stateN:" section per DFA node.
    foreach (n; dfa.nodeIDs)
    {
        code.decIndent.writeln("state", n.id, ":").incIndent;
        genCodeForNode(n);
    }
1202 }
1203 
// Truncate matching at the shortest accepted input for @minimalMatch tokens:
// any accepting node whose results are all annotated minimalMatch loses its
// outgoing edges, so the lexer cannot continue past the first match.
void removeMinimalMatchEdges(Graph!(DCharToken, LexerAction) g, EBNFGrammar lexerGrammar)
{
    foreach (n; g.nodeIDs)
    {
        auto nodeResults = g.get(n).results;
        if (nodeResults.length == 0)
            continue;

        bool everyResultMinimal = nodeResults.all!(
                res => lexerGrammar.nonterminals[res.nonterminal]
                .annotations.contains!"minimalMatch");
        if (everyResultMinimal)
            g.get(n).edges = [];
    }
}
1220 
1221 const(char)[] createLexerCode(EBNFGrammar lexerGrammar, string modulename, string dfafilename)
1222 {
    // Output buffer for the generated lexer module.
    CodeWriter code;
    code.indentStr = "    ";

    string allTokensCode = createAllTokensCode(lexerGrammar);
    string allNonterminalsCode = createAllNonterminalsCode(lexerGrammar);

    alias G = Graph!(DCharToken, LexerAction);
    alias NodeID = G.NodeID;

    // NFA for all tokens combined; determinized and minimized further below.
    G graph = new G();
1233 
    // Human-readable name of a grammar symbol (for debug/graph output).
    string symbolName(Symbol s)
    {
        return lexerGrammar.getSymbolName(s);
    }
1238 
    // Label for a DCharToken edge symbol.
    // NOTE(review): dereferences s.codepoints unconditionally — assumes it is
    // never called for epsilon/recursive tokens, where codepoints is null.
    string tokenName(DCharToken s)
    {
        return codepointSetToStr(*s.codepoints);
    }
1243 
1244     string edgeText(G.Edge[] edges)
1245     {
1246         string r;
1247 
1248         CodepointSet set;
1249 
1250         bool hasEmpty;
1251 
1252         foreach (e; edges)
1253         {
1254             if (e.symbol.recursiveSequence.length)
1255                 r ~= text("RecursiveLexer");
1256             else if (e.symbol.codepoints is null)
1257                 hasEmpty = true;
1258             else
1259                 set |= *e.symbol.codepoints;
1260         }
1261 
1262         if (hasEmpty)
1263         {
1264             r ~= "ε";
1265         }
1266 
1267         if (!hasEmpty || !set.empty)
1268         {
1269             if (r.length > 0)
1270                 r ~= "\n";
1271 
1272             r ~= codepointSetToStr(set);
1273         }
1274         EdgeFlags combinedFlags;
1275         foreach (e; edges)
1276         {
1277             combinedFlags |= e.flags | e.symbol.specialFlags;
1278         }
1279         if (combinedFlags & EdgeFlags.storeStart)
1280             r ~= text("\nStoreStart");
1281         if (combinedFlags & EdgeFlags.storeEnd)
1282             r ~= text("\nStoreEnd");
1283         if (combinedFlags & EdgeFlags.compareStart)
1284             r ~= text("\nCompareStart");
1285         if (combinedFlags & EdgeFlags.compareEndTrue)
1286             r ~= text("\nCompareEndTrue");
1287         if (combinedFlags & EdgeFlags.compareEndFalse)
1288             r ~= text("\nCompareEndFalse");
1289         if (combinedFlags & EdgeFlags.compareMiddle)
1290             r ~= text("\nCompareMiddle");
1291         return r;
1292     }
1293 
1294     string resultsText(G graph, G.NodeID node)
1295     {
1296         string r;
1297 
1298         string resultName(LexerAction res)
1299         {
1300             string r;
1301             if (res.nonterminal != NonterminalID.invalid)
1302             {
1303                 if (res.isNegLookahead)
1304                     r ~= "!";
1305                 r ~= text(lexerGrammar.getSymbolName(res.nonterminal));
1306                 if (
1307                     lexerGrammar.nonterminals[res.nonterminal].annotations.contains!"ignoreToken"())
1308                     r ~= " IgnoreToken";
1309             }
1310             else
1311                 r ~= "invalid";
1312 
1313             if (res.isIgnoreToken)
1314                 r ~= " ignore";
1315             return r;
1316         }
1317 
1318         bool[LexerAction] done;
1319         foreach (res; graph.get(node).results)
1320         {
1321             done[res] = true;
1322             r ~= text(resultName(res), "\n");
1323         }
1324         foreach (res; reachableResults(graph, node))
1325         {
1326             if (res in done)
1327                 continue;
1328             r ~= text("(", resultName(res), ")\n");
1329         }
1330 
1331         return r;
1332     }
1333 
    // Disjoint partition of all codepoint sets used by the grammar: after all
    // calls to addCodepointSet, every pair of stored sets is disjoint
    // (verified by the sanity check further below).
    CodepointSet*[] codepointSets;
    void addCodepointSet(CodepointSet set)
    {
        // The loop also visits sets appended during this call, because
        // codepointSets.length is re-read on every iteration.
        for (size_t i = 0; i < codepointSets.length; i++)
        {
            // Already an exact element of the partition: nothing to do.
            if (set == *codepointSets[i])
                return;
            if (!(set & *codepointSets[i]).empty)
            {
                // Split the overlapping element into the common part and the
                // two exclusive remainders; the remainder of the incoming set
                // continues through the loop.
                CodepointSet both = set & *codepointSets[i];
                CodepointSet first = *codepointSets[i] - set;
                set = set - *codepointSets[i];
                *codepointSets[i] = both;
                if (!first.empty)
                    codepointSets ~= new CodepointSet(first);
            }
        }
        if (!set.empty)
        {
            codepointSets ~= new CodepointSet(set);
        }
    }
1356 
1357     void updateCodepointSets(string name)
1358     {
1359         if (name.endsWith("\""))
1360         {
1361             assert(name[$ - 1] == name[0], name);
1362             name = name[1 .. $ - 1];
1363             foreach (d; name.byDchar.byUnescapedDchar)
1364             {
1365                 addCodepointSet(CodepointSet(d.c, d.c + 1));
1366             }
1367         }
1368         else if (name.endsWith("]"))
1369         {
1370             assert(name[0] == '[', name);
1371             name = name[1 .. $ - 1];
1372             CodepointSet set = codepointSetFromStr(name);
1373             addCodepointSet(set);
1374         }
1375         else if (name.endsWith("\'"))
1376         {
1377             throw new Exception("not implemented");
1378         }
1379     }
1380 
    // Collect every codepoint set mentioned by the start nonterminals and by
    // all production symbols into the disjoint partition.
    foreach (n; lexerGrammar.startNonterminals)
    {
        string name = lexerGrammar.nonterminals[n.nonterminal].name;
        updateCodepointSets(name);
    }
    foreach (p; lexerGrammar.productions)
    {
        foreach (i, s; p.symbols)
        {
            string name = lexerGrammar.getSymbolName(s);
            updateCodepointSets(name);
        }
    }

    // Sanity check: the collected sets are pairwise disjoint.
    {
        CodepointSet combined;
        foreach (set; codepointSets)
        {
            assert((combined & *set).empty);
            combined |= *set;
        }
    }

    graph.start = graph.addNode("start");

    // Per-nonterminal graphs, filled by genGraphForNonterminal —
    // presumably a cache to avoid rebuilding shared nonterminals (TODO confirm).
    Graph!(DCharToken, LexerAction)[NonterminalID] graphByNonterminal;

    // Build an NFA branch for every @ignoreToken nonterminal; its end node
    // carries a LexerAction with isIgnoreToken = true.
    foreach (n; lexerGrammar.nonterminals.allIDs())
    {
        if (!lexerGrammar.nonterminals[n].annotations.contains!"ignoreToken"())
            continue;

        string name = lexerGrammar.nonterminals[n].name;
        NodeID startM = graph.addNode("start " ~ name);
        graph.addEdge(graph.start, startM, DCharToken.invalid);
        NodeID endM = graph.addNode("end");
        graph.get(endM).results = [LexerAction(n, true)];

        genGraphForNonterminal(lexerGrammar, graph, codepointSets, n, startM,
                endM, graphByNonterminal);
    }

    // And one NFA branch for every start nonterminal.
    foreach (n; lexerGrammar.startNonterminals)
    {
        string name = lexerGrammar.nonterminals[n.nonterminal].name;
        NodeID startM = graph.addNode("start " ~ name);
        graph.addEdge(graph.start, startM, DCharToken.invalid);
        NodeID endM = graph.addNode("end");
        graph.get(endM).results = [LexerAction(n.nonterminal)];

        genGraphForNonterminal(lexerGrammar, graph, codepointSets,
                n.nonterminal, startM, endM, graphByNonterminal);
    }
1434 
    // Rename all nodes to their numeric id for readable debug output.
    foreach (n; graph.nodeIDs)
    {
        graph.get(n).name = text(n.id);
    }

    graph = graph.makeDeterministic(graph.start);

    // Fold edges that carry only special flags (no codepoints, no recursive
    // sequence) into the edges of their target node, merging the flags onto
    // those edges; then determinize again.
    foreach (n; graph.nodeIDs)
    {
        G.Node node = graph.get(n);
        G.Edge[] edges = node.edges;
        node.edges = [];

        foreach (e; edges)
        {
            // Each edge symbol is exactly one of: codepoint set, special
            // flags, or recursive sequence.
            assert((e.symbol.codepoints !is null) + (
                    e.symbol.specialFlags != EdgeFlags.none) + (
                    e.symbol.recursiveSequence.length != 0) == 1);
            if (e.symbol.codepoints is null && e.symbol.recursiveSequence.length == 0)
            {
                assert(e.next != n);
                foreach (e2; graph.get(e.next).edges)
                {
                    assert(e2.symbol.codepoints !is null);
                    graph.addEdge(n, e2.next, e2.symbol,
                            e2.flags | e.symbol.specialFlags, e2.extra);
                }
            }
            else
            {
                node.edges ~= e;
            }
        }
    }
    graph = graph.makeDeterministic(graph.start);

    // A result on the start node means the token can match the empty string.
    foreach (res; graph.get(graph.start).results)
    {
        if (lexerGrammar.nonterminals[res.nonterminal].annotations.contains!"ignoreToken"())
            continue;
        stderr.writeln("Warning: Token ",
                lexerGrammar.getSymbolName(res.nonterminal), " could be empty");
    }

    // Make sure, no token can be empty.
    {
        auto oldStart = graph.start;
        graph.start = graph.addNode("start");
        graph.get(graph.start).edges = graph.get(oldStart).edges.dup;
        graph = graph.makeDeterministic(graph.start);
    }

    graph = graph.minimizeDFA;

    // Cut off continuation past @minimalMatch tokens, then minimize again.
    graph.removeMinimalMatchEdges(lexerGrammar);
    graph = graph.minimizeDFA;

    // Optionally dump the final DFA as a GraphViz file.
    if (dfafilename.length)
        graph.toGraphViz2!(edgeText, resultsText)(dfafilename);
1494 
1495     SubAutomaton[] subAutomatons;
1496     void addSubAutomatons(G graph)
1497     {
1498         foreach (n; graph.nodeIDs)
1499         {
1500             foreach (e; graph.getEdges(n))
1501             {
1502                 if (e.symbol.recursiveSequence.length)
1503                 {
1504                     size_t i;
1505                     for (i = 0; i < subAutomatons.length; i++)
1506                     {
1507                         if (subAutomatons[i].recursiveSequence == e.symbol.recursiveSequence)
1508                             break;
1509                     }
1510                     if (i >= subAutomatons.length)
1511                     {
1512                         SubAutomaton a = new SubAutomaton;
1513                         a.recursiveSequence = e.symbol.recursiveSequence;
1514                         subAutomatons ~= a;
1515                     }
1516                 }
1517             }
1518         }
1519     }
1520 
    addSubAutomatons(graph);

    // Extended grammar with one synthetic nonterminal "$m_i" per recursive
    // sub-lexer sequence; shares the original grammar's production data.
    EBNFGrammar grammar2 = new EBNFGrammar(null);
    grammar2.nonterminals = lexerGrammar.nonterminals.dup;
    grammar2.tokens = lexerGrammar.tokens.dup;
    grammar2.productionsData = lexerGrammar.productionsData;

    // The loop bound is re-read each iteration, so sub-automatons discovered
    // while building earlier ones are processed as well.
    for (size_t i = 0; i < subAutomatons.length; i++)
    {
        NonterminalID n = grammar2.nonterminals.id(text("$m_", i));
        grammar2.addProduction(new Production(n, subAutomatons[i].recursiveSequence));

        auto graph2 = genGraphForNonterminal(grammar2, codepointSets, n, graphByNonterminal);

        subAutomatons[i].dfa = graph2;
        addSubAutomatons(subAutomatons[i].dfa);
    }
1538 
    // Emit the generated lexer module source into `code`. The q{...} token
    // string below is a generation-time template: $(expr) splices the value
    // of expr into the output, and lines starting with $$ appear to execute
    // while generating rather than being emitted verbatim
    // (NOTE(review): inferred from usage; confirm against genCode in
    // dparsergen.generator.codewriter). The template text itself is part of
    // the program's output and must not be edited casually.
    mixin(genCode("code", q{
        // Generated with DParserGen.
        module $(modulename);
        import dparsergen.core.grammarinfo;
        import dparsergen.core.parseexception;
        import std.conv;
        import std.string;
        import std.typecons;

        enum SymbolID startTokenID = $(lexerGrammar.startTokenID);
        static assert(allNonterminalTokens.length < SymbolID.max - startTokenID);
        enum SymbolID endTokenID = startTokenID + allNonterminalTokens.length;

        SymbolID getTokenID(string tok)
        {
            foreach (i; 0 .. allNonterminalTokens.length)
                if (allNonterminalTokens[i].name == tok)
                    return cast(SymbolID)(startTokenID + i);
            return SymbolID.max;
        }

        struct Lexer(Location, bool includeIgnoredTokens = false)
        {
            alias LocationDiff = typeof(Location.init - Location.init);
            string input;
            this(string input, Location startLocation = Location.init)
            {
                this.input = input;
                this.front.currentLocation = startLocation;
                popFront;
            }

            enum tokenID(string tok) = getTokenID(tok);
            string tokenName(size_t id)
            {
                return allNonterminalTokens[id - startTokenID].name;
            }

            static struct Front
            {
                string content;
                SymbolID symbol;
                Location currentLocation;
                static if (includeIgnoredTokens)
                    bool isIgnoreToken;

                Location currentTokenEnd()
                {
                    return currentLocation + LocationDiff.fromStr(content);
                }
            }

            Front front;
            bool empty;

            $$foreach (t; lexerGrammar.nonterminals.allIDs) {
                $$if (lexerGrammar.nonterminals[t].annotations.contains!"inContextOnly"()) {
                    // InContextOnly: $(lexerGrammar.nonterminals[t].name)
                    bool allowTokenId$(t.id);
                    bool allowToken(string tok)() if (tok == $(lexerGrammar.nonterminals[t].name.tokenDCode))
                    {
                        return allowTokenId$(t.id);
                    }
                    void allowToken(string tok)(bool b) if (tok == $(lexerGrammar.nonterminals[t].name.tokenDCode))
                    {
                        allowTokenId$(t.id) = b;
                    }

                $$}
            $$}
            void popFront()
            {
                input = input[front.content.length .. $];
                front.currentLocation = front.currentLocation + LocationDiff.fromStr(front.content);

                popFrontImpl();
            }

            void popFrontImpl()
            {
                size_t foundLength;
                SymbolID foundSymbol = SymbolID.max;
                bool foundIsIgnore;

                string inputCopy = input;

                size_t storedStart = size_t.max;
                string storedString;

                goto start;

                $$generateStateMachine(code, lexerGrammar, graph, false, subAutomatons);
                lexerend:

                if (foundSymbol != SymbolID.max)
                {
                    if (foundLength == 0)
                    {
                        if (!inputCopy.empty)
                            throw lexerException("no token", null, inputCopy.ptr + 1 - input.ptr);
                        else
                        {
                            front.content = "";
                            front.symbol = SymbolID(0);
                            static if (includeIgnoredTokens)
                                front.isIgnoreToken = false;
                            empty = true;
                            return;
                        }
                    }
                    static if (!includeIgnoredTokens)
                    {
                        if (foundIsIgnore)
                        {
                            front.currentLocation = front.currentLocation + LocationDiff.fromStr(input[0 .. foundLength]);
                            input = input[foundLength .. $];
                            inputCopy = input;
                            foundLength = 0;
                            foundIsIgnore = false;
                            foundSymbol = SymbolID.max;
                            storedStart = size_t.max;
                            storedString = null;
                            goto start;
                        }
                    }
                    front.content = input[0 .. foundLength];
                    front.symbol = foundSymbol;
                    static if (includeIgnoredTokens)
                        front.isIgnoreToken = foundIsIgnore;
                    empty = false;
                    return;
                }
                else if (input.length == 0)
                {
                    front.content = "";
                    front.symbol = SymbolID(0);
                    static if (includeIgnoredTokens)
                        front.isIgnoreToken = false;
                    empty = true;
                    return;
                }
                else
                    throw lexerException("no token", null, 1);
            }

            $$foreach (i, a; subAutomatons) {
                // $(lexerGrammar.symbolInstanceToString(a.recursiveSequence))
                private bool lexPart$(i)(ref string inputCopy, bool hasPreviousSymbol)
                {
                    $$generateStateMachine(code, grammar2, subAutomatons[i].dfa, true, subAutomatons);
                    assert(false);
                }

            $$}
            SingleParseException!Location lexerException(string errorText, string expected, size_t len,
                    string file = __FILE__, size_t line = __LINE__)
            {
                string str = errorText;
                if (expected.length)
                    str ~= ", expected " ~ expected;
                return new SingleParseException!Location(str, front.currentLocation, front.currentLocation, file, line);
            }
        }

        immutable allNonterminalTokens = [
        $(allNonterminalsCode)];
        }));

    // Hand back the full generated module text accumulated in `code`.
    return code.data;
}