1 
2 //          Copyright Tim Schendekehl 2023.
3 // Distributed under the Boost Software License, Version 1.0.
4 //    (See accompanying file LICENSE_1_0.txt or copy at
5 //          https://www.boost.org/LICENSE_1_0.txt)
6 
7 module dparsergen.generator.regexlookahead;
8 import dparsergen.core.utils;
9 import dparsergen.generator.codewriter;
10 import dparsergen.generator.grammar;
11 import dparsergen.generator.nfa;
12 import dparsergen.generator.parser;
13 import dparsergen.generator.parsercodegencommon;
14 import dparsergen.generator.production;
15 import std.algorithm;
16 import std.conv;
17 import std.exception;
18 import std.range;
19 import std.stdio;
20 import std.typecons;
21 
22 NonterminalID normalizeNonterminalID(EBNFGrammar grammar, NonterminalID startNonterminal)
23 {
24     NonterminalID origStartNonterminal = startNonterminal;
25     while (true)
26     {
27         auto productions = grammar.getProductions(startNonterminal);
28         if (productions.length != 1)
29             break;
30         if (productions[0].symbols.length != 1)
31             break;
32         if (productions[0].symbols[0].isToken)
33             break;
34         startNonterminal = productions[0].symbols[0].toNonterminalID;
35         if (startNonterminal == origStartNonterminal)
36             break;
37     }
38 
39     return startNonterminal;
40 }
41 
42 struct RegexLookaheadEdgeExtra
43 {
44     immutable(SymbolInstance)[][] sequences;
45 
46     void opOpAssign(string op)(RegexLookaheadEdgeExtra rhs) if (op == "~")
47     {
48         sequences.addOnce(rhs.sequences);
49     }
50 }
51 
52 class RegexLookaheadGraph
53 {
54     size_t id;
55     bool isInnerGraph;
56     Tuple!(immutable(SymbolInstance)[], size_t)[] sequences;
57     Graph!(Symbol, size_t, RegexLookaheadEdgeExtra) dfa;
58     bool frozen;
59 }
60 
61 class RegexLookahead
62 {
63     alias G = Graph!(Symbol, size_t, RegexLookaheadEdgeExtra);
64     alias NodeID = G.NodeID;
65     EBNFGrammar grammar;
66     const bool[ProductionID] reduceConflictProductions;
67     EBNFGrammar grammar2;
68     bool useRegexlookahead;
69 
70     RegexLookaheadGraph[] graphs;
71     RegexLookaheadGraph[immutable(SymbolInstance[])[]] innerGraphs;
72 
73     bool grammarFinished;
74 
75     immutable(SymbolInstance)[][TokenID] subGraphTokens;
76     TokenID subGraphToken;
77 
78     TokenID[][TokenID] matchingTokensMap;
79     bool[TokenID] matchingTokensAll;
80 
81     this(EBNFGrammar grammar, const bool[ProductionID] reduceConflictProductions,
82             bool useRegexlookahead)
83     {
84         this.grammar = grammar;
85         this.reduceConflictProductions = reduceConflictProductions;
86         this.useRegexlookahead = useRegexlookahead;
87 
88         foreach (m; grammar.matchingTokens)
89         {
90             matchingTokensMap[m[0]] ~= m[1];
91             matchingTokensAll[m[0]] = true;
92             matchingTokensAll[m[1]] = true;
93         }
94 
95         grammar2 = new EBNFGrammar(null);
96         grammar2.nonterminals = grammar.nonterminals.dup;
97         grammar2.tokens = grammar.tokens.dup;
98         grammar2.productionsData = grammar.productionsData;
99         grammar2.matchingTokens = grammar.matchingTokens.dup;
100 
101         subGraphToken = grammar2.tokens.id("$subGraph");
102         grammar2.tokens.id("$anything");
103 
104         const(Production)*[] productionsData;
105         productionsData.length = grammar.productionsData.length;
106         foreach (i, p; grammar.productionsData)
107         {
108             if (p is null)
109                 continue;
110             immutable(SymbolInstance)[] symbols = p.symbols;
111             for (size_t j = 0; j < symbols.length; j++)
112             {
113                 if (j && symbols[j - 1].isToken)
114                 {
115                     size_t k = j;
116                     while (k < symbols.length && !symbols[k].isToken)
117                         k++;
118                     if (k >= symbols.length)
119                         break;
120                     assert(symbols[k].isToken);
121 
122                     TokenID startToken = symbols[j - 1].toTokenID;
123                     bool foundMatching;
124                     for (; k < symbols.length; k++)
125                     {
126                         if (!symbols[k].isToken)
127                             continue;
128                         TokenID endToken = symbols[k].toTokenID;
129                         foreach (matching; grammar.matchingTokens)
130                         {
131                             if (startToken == matching[0] && endToken == matching[1])
132                                 foundMatching = true;
133                         }
134                         if (foundMatching)
135                             break;
136                     }
137                     if (!foundMatching)
138                         continue;
139 
140                     string subSymbolsName = "$subSymbols(";
141                     immutable(SymbolInstance)[] subSymbols;
142                     foreach (s; symbols[j .. k])
143                     {
144                         if (!s.isToken)
145                             subSymbols ~= SymbolInstance(normalizeNonterminalID(grammar,
146                                     s.symbol.toNonterminalID));
147                         else
148                             subSymbols ~= SymbolInstance(s.symbol);
149                         subSymbolsName ~= text(grammar.getSymbolName(s), ", ");
150                     }
151                     subSymbolsName ~= ")";
152 
153                     TokenID t = grammar2.tokens.id(subSymbolsName);
154                     subGraphTokens[t] = subSymbols;
155                     symbols = symbols[0 .. j] ~ SymbolInstance(t) ~ symbols[k .. $];
156                 }
157             }
158 
159             if (symbols is p.symbols)
160                 productionsData[i] = p;
161             else
162             {
163                 auto p2 = new Production(p.nonterminalID, symbols);
164                 p2.productionID = cast(ProductionID) i;
165                 productionsData[i] = p2;
166             }
167         }
168         grammar2.productionsData = productionsData;
169 
170         matchGraph = new Graph!(TokenID[2], size_t);
171     }
172 
173     NonterminalID[ElementID] lookaheadNonterminals;
174     NonterminalID getLookaheadNonterminal(LRGraph graph, size_t state, size_t elementNr)
175     {
176         auto entry = ElementID(state, elementNr) in lookaheadNonterminals;
177         if (entry)
178             return *entry;
179         assert(!grammarFinished);
180 
181         auto node = graph.states[state];
182         auto element = node.elements[elementNr];
183 
184         NonterminalID l1 = grammar2.nonterminals.id(text("$l_", state, "_", elementNr));
185         lookaheadNonterminals[ElementID(state, elementNr)] = l1;
186 
187         if (element.dotPos == 0 && element.isStartElement)
188             grammar2.addProduction(new Production(l1,
189                     [SymbolInstance(grammar2.tokens.id("$end"))]));
190         if (element.dotPos == 0 && !element.isStartElement)
191         {
192             foreach (k2, element2; node.elements)
193             {
194                 bool elementPossible;
195                 foreach (n; element2.nextNonterminals(grammar, graph.globalOptions.directUnwrap))
196                 {
197                     if (n.nonterminalID == element.production.nonterminalID)
198                     {
199                         elementPossible = true;
200                     }
201                 }
202                 if (elementPossible)
203                 {
204                     auto afterNext = element2.afterNext(grammar);
205                     if (useRegexlookahead || element2.production.symbols.length == 1
206                             || element.production.productionID in reduceConflictProductions
207                             || element2.production.annotations.contains!"lookahead"
208                             || afterNext.length == 0)
209                     {
210                         NonterminalID l2 = getLookaheadNonterminal(graph, state, k2);
211                         grammar2.addProduction(new Production(l1,
212                                 afterNext ~ [immutable(SymbolInstance)(l2)]));
213                     }
214                     else if (afterNext.length > 0)
215                     {
216                         grammar2.addProduction(new Production(l1,
217                                 afterNext ~ [
218                                     immutable(SymbolInstance)(grammar2.tokens.id("$anything"))
219                                 ]));
220                     }
221                     else
222                     {
223                         grammar2.addProduction(new Production(l1,
224                                 [
225                                     SymbolInstance(grammar2.tokens.id("$anything"))
226                                 ]));
227                     }
228                 }
229             }
230         }
231 
232         foreach (prev; element.prevElements)
233         {
234             if (prev.state == size_t.max)
235             {
236                 continue;
237             }
238 
239             NonterminalID l2 = getLookaheadNonterminal(graph, prev.state, prev.elementNr);
240             grammar2.addProduction(new Production(l1, [SymbolInstance(l2)]));
241         }
242 
243         return l1;
244     }
245 
246     NonterminalID[ElementID] lookaheadNonterminalsSimple;
247     NonterminalID getLookaheadNonterminalSimple(LRGraph graph, size_t state, size_t elementNr)
248     {
249         auto entry = ElementID(state, elementNr) in lookaheadNonterminalsSimple;
250         if (entry)
251             return *entry;
252         assert(!grammarFinished);
253 
254         auto node = graph.states[state];
255         auto element = node.elements[elementNr];
256 
257         NonterminalID l1 = grammar2.nonterminals.id(text("$lsimple_", state, "_", elementNr));
258         lookaheadNonterminalsSimple[ElementID(state, elementNr)] = l1;
259 
260         foreach (t; element.lookahead.bitsSet)
261         {
262             immutable(SymbolInstance)[] dummySymbols = [
263                 immutable(SymbolInstance)(t)
264             ];
265             foreach (matchingTokens; grammar.matchingTokens)
266             {
267                 if (t == matchingTokens[0])
268                 {
269                     dummySymbols ~= immutable(SymbolInstance)(matchingTokens[1]);
270                     break;
271                 }
272             }
273             grammar2.addProduction(new Production(l1, dummySymbols));
274         }
275         return l1;
276     }
277 
278     bool isSimpleNonterminal(NonterminalID n)
279     {
280         foreach (p; grammar2.getProductions(n))
281         {
282             foreach (s; p.symbols)
283                 if (!s.isToken)
284                     return false;
285         }
286         return true;
287     }
288 
289     G[NonterminalID] nonterminalGraphCache;
290     G genNonterminalGraph(NonterminalID startNonterminal)
291     {
292         startNonterminal = normalizeNonterminalID(grammar2, startNonterminal);
293 
294         if (startNonterminal in nonterminalGraphCache)
295             return nonterminalGraphCache[startNonterminal];
296 
297         immutable endTok = grammar.tokens.getID("$end");
298 
299         G g = new G();
300         g.start = g.addNode("start");
301 
302         immutable(size_t)[] results = [0];
303         NodeID end = g.addNode("end");
304         NodeID recNode1 = g.addNode("r");
305         NodeID recNode2 = g.addNode("r");
306         g.get(end).results = results;
307         NodeID[2][NonterminalID] done;
308         NodeID[2] genSubgraph(NonterminalID nonterminal)
309         {
310             if (nonterminal in done)
311                 return done[nonterminal];
312             NodeID[2] nonterminalNodes = [
313                 g.addNode("start " ~ grammar2.getSymbolName(nonterminal)),
314                 g.addNode("end " ~ grammar2.getSymbolName(nonterminal))
315             ];
316             done[nonterminal] = nonterminalNodes;
317             scope (success)
318                 if (isSimpleNonterminal(nonterminal))
319                     done.remove(nonterminal);
320             foreach (p; grammar2.getProductions(nonterminal))
321             {
322                 NodeID current = nonterminalNodes[0];
323                 foreach (s; p.symbols)
324                 {
325                     NodeID n = g.addNode("x");
326                     if (s.isToken)
327                     {
328                         if (s.toTokenID in subGraphTokens)
329                         {
330                             g.addEdge(current, n, subGraphToken, EdgeFlags.none,
331                                     RegexLookaheadEdgeExtra([
332                                         subGraphTokens[s.toTokenID]
333                                     ]));
334                         }
335                         else
336                             g.addEdge(current, n, s);
337                     }
338                     else
339                     {
340                         auto subNodes = genSubgraph(s.toNonterminalID);
341                         g.addEdge(current, subNodes[0], Symbol.invalid);
342                         g.addEdge(subNodes[1], n, Symbol.invalid);
343                     }
344                     current = n;
345                 }
346                 g.addEdge(current, nonterminalNodes[1], Symbol.invalid);
347             }
348             return nonterminalNodes;
349         }
350 
351         auto nonterminalNodes = genSubgraph(startNonterminal);
352         g.addEdge(g.start, nonterminalNodes[0], Symbol.invalid);
353         g.addEdge(nonterminalNodes[1], end, Symbol.invalid);
354 
355         string symbolName(Symbol s)
356         {
357             return grammar2.getSymbolName(s);
358         }
359 
360         g = g.makeDeterministic!(typeof(g))(g.start).minimizeDFA;
361         nonterminalGraphCache[startNonterminal] = g;
362 
363         return g;
364     }
365 
366     Symbol[][][NonterminalID] nonterminalGraphSimpleCache;
367     Symbol[][] genNonterminalGraphSimple(NonterminalID startNonterminal)
368     {
369         startNonterminal = normalizeNonterminalID(grammar2, startNonterminal);
370 
371         if (startNonterminal in nonterminalGraphSimpleCache)
372             return nonterminalGraphSimpleCache[startNonterminal];
373 
374         Appender!(Symbol[][]) sequences;
375 
376         immutable endTok = grammar.tokens.getID("$end");
377 
378         bool[NonterminalID] done;
379         void genSubgraph(NonterminalID nonterminal)
380         {
381             if (nonterminal in done)
382                 return;
383             done[nonterminal] = true;
384             foreach (p; grammar2.getProductions(nonterminal))
385             {
386                 Symbol[] sequence;
387                 foreach (i, s; p.symbols)
388                 {
389                     if (s.isToken)
390                     {
391                         if (sequence.length)
392                         {
393                             bool subGraphNear;
394                             if (i)
395                             {
396                                 auto s2 = p.symbols[i - 1];
397                                 if (s2.isToken && s2.toTokenID in subGraphTokens)
398                                 {
399                                     subGraphNear = true;
400                                 }
401                             }
402                             foreach (matching; grammar.matchingTokens)
403                             {
404                                 if (s.toTokenID == matching[1])
405                                 {
406                                     subGraphNear = true;
407                                 }
408                             }
409                             foreach (j, s2; p.symbols[i .. $])
410                             {
411                                 if (!s2.isToken || j > 0)
412                                     break;
413                                 if (s2.toTokenID in subGraphTokens)
414                                 {
415                                     subGraphNear = true;
416                                     break;
417                                 }
418                             }
419                             if (!subGraphNear)
420                             {
421                                 sequences.put(sequence);
422                                 sequence = [];
423                             }
424                         }
425 
426                         sequence ~= s;
427                     }
428                     else
429                     {
430                         if (sequence.length)
431                         {
432                             sequences.put(sequence);
433                             sequence = [];
434                         }
435                         genSubgraph(s.toNonterminalID);
436                     }
437                 }
438                 if (sequence.length)
439                 {
440                     sequences.put(sequence);
441                     sequence = [];
442                 }
443             }
444         }
445 
446         genSubgraph(startNonterminal);
447 
448         nonterminalGraphSimpleCache[startNonterminal] = sequences.data;
449 
450         return sequences.data;
451     }
452 
453     static void clusterResults(G g)
454     {
455         immutable(size_t)[][] clusters;
456         size_t findCluster(size_t r)
457         {
458             foreach (i, c; clusters)
459             {
460                 if (c.canFind(r))
461                     return i;
462             }
463             return size_t.max;
464         }
465 
466         foreach (n; g.nodeIDs)
467         {
468             immutable(size_t)[] newResults;
469             foreach (r; g.get(n).results)
470             {
471                 size_t c = findCluster(r);
472                 if (c == size_t.max)
473                     newResults.addOnce(r);
474                 else
475                 {
476                     newResults.addOnce(clusters[c]);
477                     clusters = clusters[0 .. c] ~ clusters[c + 1 .. $];
478                 }
479             }
480             if (newResults.length)
481                 clusters ~= newResults;
482         }
483 
484         foreach (n; g.nodeIDs)
485         {
486             if (g.get(n).results.length)
487             {
488                 auto c = findCluster(g.get(n).results[0]);
489                 g.get(n).results = clusters[c];
490             }
491         }
492     }
493 
494     void genGraph(RegexLookaheadGraph regexLookaheadGraph)
495     {
496         regexLookaheadGraph.frozen = true;
497         G g = new G();
498         g.start = g.addNode("start");
499 
500         foreach (sequence; regexLookaheadGraph.sequences)
501         {
502             g.get(g.start).results.addOnce(sequence[1]);
503 
504             immutable(size_t)[] results = [sequence[1]];
505 
506             if (regexLookaheadGraph.isInnerGraph)
507             {
508                 bool[immutable(Symbol)[]] sequencesDone;
509                 foreach (j, s; sequence[0])
510                 {
511                     if (s.isToken)
512                     {
513                         g.addEdge(g.start, g.start, s);
514                     }
515                     else
516                     {
517                         foreach (sequence2; genNonterminalGraphSimple(s.toNonterminalID))
518                         {
519                             if (sequence2 in sequencesDone)
520                                 continue;
521                             sequencesDone[sequence2.idup] = true;
522                             NodeID current = g.start;
523                             foreach (s2; sequence2)
524                             {
525                                 NodeID n = g.addNode("x");
526                                 g.addEdge(current, n, s2);
527                                 current = n;
528                             }
529                             g.addEdge(current, g.start, Symbol.invalid);
530                         }
531                     }
532                 }
533                 g.get(g.start).results = results;
534             }
535             else
536             {
537                 NodeID current = g.start;
538 
539                 bool[immutable(Symbol)[]] firstSequencesDone;
540                 foreach (j, s; sequence[0])
541                 {
542                     NodeID end = g.addNode("x");
543                     g.get(end).results = results;
544                     if (s.isToken)
545                     {
546                         g.addEdge(current, end, s);
547                     }
548                     else
549                     {
550                         G g2 = genNonterminalGraph(s.toNonterminalID);
551 
552                         NodeID[NodeID] nodeMap;
553                         foreach (i; g2.nodeIDs)
554                         {
555                             NodeID n = g.addNode("x");
556                             nodeMap[i] = n;
557                             g.get(n).results = results;
558                             if (g2.get(i).results.length)
559                                 g.addEdge(n, end, Symbol.invalid);
560                         }
561                         foreach (i; g2.nodeIDs)
562                         {
563                             foreach (edge; g2.get(i).edges)
564                             {
565                                 g.addEdge(nodeMap[i], nodeMap[edge.next],
566                                         edge.symbol, EdgeFlags.none, edge.extra);
567                             }
568                         }
569                         g.addEdge(current, nodeMap[g2.start], Symbol.invalid);
570                     }
571                     current = end;
572                 }
573             }
574         }
575 
576         string symbolsName(G.Edge[] edges)
577         {
578             string r;
579             foreach (i, e; edges)
580             {
581                 if (i)
582                     r ~= "\n";
583                 r ~= grammar2.getSymbolName(e.symbol);
584                 foreach (sequence; e.extra.sequences)
585                 {
586                     r ~= "\n -";
587                     foreach (s; sequence)
588                         r ~= " " ~ grammar2.getSymbolName(s);
589                 }
590             }
591             return r;
592         }
593 
594         g = g.makeDeterministic!(typeof(g))(g.start, !regexLookaheadGraph.isInnerGraph);
595         g = g.minimizeDFA();
596 
597         if (regexLookaheadGraph.isInnerGraph)
598         {
599             clusterResults(g);
600             g = g.minimizeDFA();
601 
602             immutable size_t[] simpleResults = [0];
603             foreach (n; g.nodeIDs)
604             {
605                 if (g.get(n).results.length)
606                     g.get(n).results = simpleResults;
607             }
608             g = g.minimizeDFA();
609         }
610 
611         regexLookaheadGraph.dfa = g;
612 
613         foreach (n; g.nodeIDs)
614         {
615             foreach (e; g.getEdges(n))
616             {
617                 if (e.symbol.isToken && e.symbol.toTokenID in matchingTokensMap)
618                 {
619                     immutable(SymbolInstance)[][] sequences;
620                     TokenID[] endTokens;
621                     bool hasNonMatchingNext;
622                     foreach (e2; g.getEdges(e.next))
623                     {
624                         if (e2.symbol.isToken && e2.symbol.toTokenID == subGraphToken)
625                         {
626                             sequences.addOnce(e2.extra.sequences);
627                             foreach (e3; g.getEdges(e2.next))
628                             {
629                                 endTokens.addOnce(e3.symbol.toTokenID);
630                             }
631                         }
632                         else
633                             hasNonMatchingNext = true;
634                     }
635                     sequences.sort();
636                     enforce(!hasNonMatchingNext || sequences.length == 0);
637 
638                     if (sequences.length)
639                     {
640                         auto matchNode = buildMatchGraph(sequences);
641                         matchGraph.otherStarts ~= matchNode;
642                         matchGraphStartIndex[sequences.idup] = matchGraph.otherStarts.length - 1;
643                         foreach (matchEdge; matchGraph.getEdges(matchNode))
644                         {
645                             if (matchEdge.symbol[1] == TokenID.invalid)
646                                 enforce(!endTokens.canFind(matchEdge.symbol[0]));
647                         }
648                     }
649                 }
650             }
651         }
652     }
653 
654     Graph!(TokenID[2], size_t) matchGraph;
655     Graph!(TokenID[2], size_t).NodeID[immutable(SymbolInstance[])[]] matchGraphStates;
656     size_t[immutable(SymbolInstance[])[]] matchGraphStartIndex;
657 
658     Graph!(TokenID[2], size_t).NodeID buildMatchGraph(immutable(SymbolInstance)[][] sequences)
659     {
660         if (sequences in matchGraphStates)
661         {
662             return matchGraphStates[sequences];
663         }
664         else
665         {
666             string nodeText;
667             foreach (i, sequence; sequences)
668             {
669                 if (i)
670                     nodeText ~= "\n";
671                 foreach (s; sequence)
672                     nodeText ~= " " ~ grammar2.getSymbolName(s);
673             }
674 
675             auto n = matchGraph.addNode(nodeText);
676             matchGraphStates[sequences.idup] = n;
677 
678             bool[TokenID] usedAlone;
679             immutable(SymbolInstance)[][][TokenID[2]] usedMatching;
680 
681             bool[Symbol] done;
682             void genSubgraph(Symbol s)
683             {
684                 if (s in done)
685                     return;
686                 done[s] = true;
687                 if (s.isToken)
688                 {
689                     if (s.toTokenID in matchingTokensAll)
690                     {
691                         if (s.toTokenID in usedAlone)
692                             enforce(usedAlone[s.toTokenID]);
693                         else
694                         {
695                             usedAlone[s.toTokenID] = true;
696                             matchGraph.addEdge(n, n, [
697                                     s.toTokenID, TokenID.invalid
698                                     ]);
699                         }
700                     }
701                 }
702                 else
703                 {
704                     foreach (p; grammar2.getProductions(s.toNonterminalID))
705                     {
706                         for (size_t i = 0; i < p.symbols.length; i++)
707                         {
708                             if (i + 2 < p.symbols.length && p.symbols[i + 1].isToken
709                                     && p.symbols[i + 1].toTokenID in subGraphTokens)
710                             {
711                                 auto t1 = p.symbols[i].toTokenID;
712                                 auto t2 = p.symbols[i + 2].toTokenID;
713 
714                                 if (t1 in usedAlone)
715                                     enforce(!usedAlone[t1]);
716                                 else
717                                     usedAlone[t1] = false;
718 
719                                 if (t2 in usedAlone)
720                                     enforce(!usedAlone[t2]);
721                                 else
722                                     usedAlone[t2] = false;
723 
724                                 immutable(SymbolInstance)[][] sequences2 = usedMatching.get([
725                                     t1, t2
726                                 ], []);
727                                 sequences2.addOnce(subGraphTokens[p.symbols[i + 1].toTokenID]);
728                                 usedMatching[[t1, t2]] = sequences2;
729 
730                                 i += 2;
731                             }
732                             else
733                                 genSubgraph(p.symbols[i]);
734                         }
735                     }
736                 }
737             }
738 
739             foreach (sequence; sequences)
740             {
741                 foreach (s; sequence)
742                     genSubgraph(s);
743             }
744 
745             foreach (pair, sequences2; usedMatching)
746             {
747                 sequences2.sort();
748                 auto n2 = buildMatchGraph(sequences2);
749                 matchGraph.addEdge(n, n2, pair);
750             }
751 
752             return n;
753         }
754     }
755 
756     void genGraphs()
757     {
758         grammar2.calcNonterminalCanBeEmpty();
759         grammar2.fillProductionsForNonterminal();
760 
761         size_t lastLength = graphs.length;
762         for (size_t i = 0; i < graphs.length; i++)
763         {
764             if (i >= lastLength)
765             {
766                 lastLength = graphs.length;
767             }
768 
769             if (graphs[i] is null)
770                 continue;
771 
772             if (graphs[i].isInnerGraph)
773             {
774                 immutable(SymbolInstance)[][] sequences;
775                 foreach (sequence; graphs[i].sequences)
776                     sequences.addOnce(sequence[0]);
777                 bool[immutable(SymbolInstance)[]] sequenceMap;
778                 foreach (sequence; sequences)
779                     sequenceMap[sequence] = true;
780 
781                 bool changed = true;
782                 while (changed)
783                 {
784                     changed = false;
785 
786                     foreach (j; 0 .. graphs.length)
787                     {
788                         if (i == j)
789                             continue;
790                         if (graphs[j] is null || !graphs[j].isInnerGraph)
791                             continue;
792                         size_t commonSequences;
793                         foreach (sequence; graphs[j].sequences)
794                             if (sequence[0] in sequenceMap)
795                                 commonSequences++;
796                         if (commonSequences)
797                         {
798                             foreach (sequence; graphs[j].sequences)
799                             {
800                                 if (sequence[0]!in sequenceMap)
801                                 {
802                                     sequences ~= sequence[0];
803                                     sequenceMap[sequence[0]] = true;
804                                 }
805                             }
806                             graphs[j] = null;
807                             changed = true;
808                         }
809                     }
810                 }
811                 sequences.sort();
812                 graphs[i].sequences = [];
813                 foreach (k, sequence; sequences)
814                     graphs[i].sequences ~= tuple!(immutable(SymbolInstance)[], size_t)(sequence, k);
815             }
816 
817             genGraph(graphs[i]);
818         }
819 
820         matchGraph = matchGraph.minimizeDFA();
821 
822         string symbolPairName(TokenID[2] pair)
823         {
824             return grammar2.getSymbolName(pair[0]) ~ "..." ~ grammar2.getSymbolName(pair[1]);
825         }
826 
827         grammarFinished = true;
828     }
829 
830     void genMatchFuncs(ref CodeWriter code)
831     {
832         foreach (n; matchGraph.nodeIDs)
833         {
834             size_t[TokenID] actions;
835             foreach (m; grammar.matchingTokens)
836             {
837                 actions[m[1]] = size_t.max;
838             }
839             foreach (e; matchGraph.getEdges(n))
840             {
841                 if (e.symbol[1] == TokenID.invalid)
842                 {
843                     actions[e.symbol[0]] = size_t.max - 1;
844                 }
845                 else
846                 {
847                     actions[e.symbol[0]] = e.next.id;
848                 }
849             }
850 
851             mixin(genCode("code", q{
852                 int skipMatchingTokens$(n.id)(ref Lexer tmpLexer)
853                 {
854                     while (!tmpLexer.empty)
855                     {
856                         switch (tmpLexer.front.symbol)
857                         {
858                         $$foreach (t; actions.sortedKeys) {
859                             case Lexer.tokenID!$(tokenDCode(grammar.tokens[t])):
860                             $$if (actions[t] == size_t.max) {
861                                     return 0;
862                             $$} else if (actions[t] == size_t.max - 1) {
863                                     break;
864                             $$} else {
865                                     tmpLexer.popFront;
866                                     if (skipMatchingTokens$(actions[t])(tmpLexer) < 0)
867                                         return -1;
868                                     if (tmpLexer.empty)
869                                     {
870                                         lastError = new SingleParseException!Location("Error for lookahead", lexer.front.currentLocation, tmpLexer.front.currentLocation);
871                                         return -1;
872                                     }
873                                     if ($(grammar.matchingTokens.filter!(m=>m[0] == t).map!(m=>text("tmpLexer.front.symbol != Lexer.tokenID!", tokenDCode(grammar.tokens[m[1]]))).joiner(" && ")))
874                                     {
875                                         lastError = new SingleParseException!Location("Error for lookahead", lexer.front.currentLocation, tmpLexer.front.currentLocation);
876                                         return -1;
877                                     }
878                                     break;
879                             $$}
880                         $$}
881                         default:
882                             break;
883                         }
884                         tmpLexer.popFront;
885                     }
886                     lastError = new SingleParseException!Location("Error for lookahead", lexer.front.currentLocation, tmpLexer.front.currentLocation);
887                     return -1;
888                 }
889             }));
890         }
891     }
892 
893     void genGraph(ref CodeWriter code, RegexLookaheadGraph regexLookaheadGraph)
894     {
895         if (regexLookaheadGraph is null)
896             return;
897         assert(grammarFinished);
898 
899         code.writeln("SymbolID checkRegexLookahead", regexLookaheadGraph.id, "()");
900         code.writeln("{").incIndent;
901 
902         foreach (sequence; regexLookaheadGraph.sequences)
903         {
904             code.write("// ", sequence[1], ":");
905             foreach (s; sequence[0])
906                 code.write(" ", grammar2.getSymbolName(s));
907             code.writeln();
908         }
909 
910         immutable endTok = grammar.tokens.getID("$end");
911         immutable anythingTok = grammar2.tokens.getID("$anything");
912 
913         auto dfa = regexLookaheadGraph.dfa;
914 
915         code.writeln("Lexer tmpLexer = *lexer;");
916         assert(dfa.get(dfa.start).results.length > 1);
917         code.writeln("goto state", dfa.start.id, "b;");
918 
919         foreach (n; dfa.nodeIDs)
920         {
921             code.writeln("state", n.id, ":");
922             code.incIndent;
923 
924             string resultsComment;
925             bool[size_t] resultDone;
926             string resultString(size_t res)
927             {
928                 return text(res);
929             }
930 
931             foreach (res; dfa.get(n).results)
932             {
933                 resultDone[res] = true;
934                 resultsComment ~= " " ~ resultString(res);
935             }
936             auto tmpReachableResults = reachableResults(dfa, n);
937             sort!((a, b) => a < b)(tmpReachableResults);
938             foreach (res; tmpReachableResults)
939             {
940                 if (res in resultDone)
941                     continue;
942                 resultsComment ~= " (" ~ resultString(res) ~ ")";
943             }
944             code.writeln("//", resultsComment);
945             if (dfa.get(n).results.length == 1 && dfa.get(n).results[0] != size_t.max)
946             {
947                 mixin(genCode("code", q{
948                     return $(dfa.get(n).results[0]);
949                 }));
950             }
951             else if (dfa.get(n).results.length == 1 && dfa.get(n).results[0] == size_t.max)
952             {
953                 code.writeln(q{    lastError = new SingleParseException!Location("Error for lookahead", lexer.front.currentLocation, tmpLexer.front.currentLocation);});
954                 code.writeln(q{    return -1;});
955             }
956             else
957             {
958                 immutable(SymbolInstance)[][] sequences;
959                 foreach (e; dfa.getEdges(n))
960                 {
961                     if (e.symbol.isToken && e.symbol.toTokenID == subGraphToken)
962                     {
963                         sequences.addOnce(e.extra.sequences);
964                     }
965                 }
966 
967                 code.writeln("tmpLexer.popFront();");
968                 code.decIndent.writeln("state", n.id, "b:").incIndent;
969 
970                 if (sequences.length)
971                 {
972                     sequences.sort();
973                     auto matchNode = matchGraph.otherStarts[matchGraphStartIndex[sequences]];
974                     code.writeln("if (skipMatchingTokens", matchNode.id, "(tmpLexer) < 0)");
975                     code.writeln("    return SymbolID.max;");
976                     foreach (e; dfa.getEdges(n))
977                     {
978                         code.writeln("goto state", e.next.id, "b;");
979                     }
980                 }
981                 else
982                 {
983                     G.Edge[][] edges;
984                     size_t[G.NodeID] edgesMap;
985                     G.Edge edgeEmpty;
986                     bool hasAnythingEdge;
987                     foreach (e; dfa.getEdges(n))
988                     {
989                         if (e.symbol == endTok)
990                         {
991                             edgeEmpty = e;
992                             continue;
993                         }
994                         if (e.symbol == anythingTok)
995                         {
996                             hasAnythingEdge = true;
997                             continue;
998                         }
999                         if (e.next !in edgesMap)
1000                         {
1001                             edgesMap[e.next] = edges.length;
1002                             edges.length++;
1003                         }
1004                         edges[edgesMap[e.next]] ~= e;
1005                     }
1006                     sort(edges);
1007                     stdout.flush();
1008                     enforce(!hasAnythingEdge
1009                             || (edgeEmpty.next == G.NodeID.invalid && edges.length == 0));
1010 
1011                     size_t maxEdge = size_t.max;
1012                     size_t maxEdgeN = 0;
1013 
1014                     code.writeln("if (tmpLexer.empty)");
1015                     if (edgeEmpty.next == G.NodeID.invalid)
1016                     {
1017                         code.writeln("{");
1018                         code.writeln(q{    lastError = new SingleParseException!Location("EOF for lookahead", lexer.front.currentLocation, tmpLexer.front.currentLocation);});
1019                         code.writeln(q{    return SymbolID.max;});
1020                         code.writeln("}");
1021                     }
1022                     else
1023                         code.writeln("    goto state", edgeEmpty.next.id, ";");
1024 
1025                     foreach (i, e; edges)
1026                     {
1027                         if (i != maxEdge)
1028                         {
1029                             code.write("else ");
1030                             code.writeln("if (tmpLexer.front.symbol.among(", e.map!(x => text("Lexer.tokenID!",
1031                                     grammar.tokens[x.symbol.toTokenID].tokenDCode)).joiner(", "),
1032                                     "))");
1033                             code.writeln("    goto state", e[0].next.id, ";");
1034                         }
1035                     }
1036                     code.write("else //");
1037                     if (edgeEmpty.next != G.NodeID.invalid)
1038                     {
1039                         code.writeln();
1040                         code.writeln("    goto state", edgeEmpty.next.id, "; //like empty");
1041                     }
1042                     else if (maxEdge == size_t.max)
1043                     {
1044                         code.writeln();
1045                         code.writeln("{");
1046                         code.writeln(q{    lastError = new SingleParseException!Location("Error for lookahead", lexer.front.currentLocation, tmpLexer.front.currentLocation);});
1047                         code.writeln(q{    return SymbolID.max;});
1048                         code.writeln("}");
1049                     }
1050                     else
1051                     {
1052                         foreach (x; edges[maxEdge])
1053                             code.write(" ", grammar.getSymbolName(x.symbol));
1054                         code.writeln();
1055                         code.writeln("    goto state", edges[maxEdge][0].next.id, ";");
1056                     }
1057                 }
1058             }
1059             code.decIndent;
1060         }
1061         if (dfa.nodes.length == 0)
1062         {
1063             code.writeln(q{lastError = new SingleParseException!Location("Error for lookahead", lexer.front.currentLocation, tmpLexer.front.currentLocation);});
1064             code.writeln("return SymbolID.max;");
1065         }
1066         code.decIndent.writeln("}");
1067     }
1068 }