1 2 // Copyright Tim Schendekehl 2023. 3 // Distributed under the Boost Software License, Version 1.0. 4 // (See accompanying file LICENSE_1_0.txt or copy at 5 // https://www.boost.org/LICENSE_1_0.txt) 6 7 module dparsergen.generator.regexlookahead; 8 import dparsergen.core.utils; 9 import dparsergen.generator.codewriter; 10 import dparsergen.generator.grammar; 11 import dparsergen.generator.nfa; 12 import dparsergen.generator.parser; 13 import dparsergen.generator.parsercodegencommon; 14 import dparsergen.generator.production; 15 import std.algorithm; 16 import std.conv; 17 import std.exception; 18 import std.range; 19 import std.stdio; 20 import std.typecons; 21 22 NonterminalID normalizeNonterminalID(EBNFGrammar grammar, NonterminalID startNonterminal) 23 { 24 NonterminalID origStartNonterminal = startNonterminal; 25 while (true) 26 { 27 auto productions = grammar.getProductions(startNonterminal); 28 if (productions.length != 1) 29 break; 30 if (productions[0].symbols.length != 1) 31 break; 32 if (productions[0].symbols[0].isToken) 33 break; 34 startNonterminal = productions[0].symbols[0].toNonterminalID; 35 if (startNonterminal == origStartNonterminal) 36 break; 37 } 38 39 return startNonterminal; 40 } 41 42 struct RegexLookaheadEdgeExtra 43 { 44 immutable(SymbolInstance)[][] sequences; 45 46 void opOpAssign(string op)(RegexLookaheadEdgeExtra rhs) if (op == "~") 47 { 48 sequences.addOnce(rhs.sequences); 49 } 50 } 51 52 class RegexLookaheadGraph 53 { 54 size_t id; 55 bool isInnerGraph; 56 Tuple!(immutable(SymbolInstance)[], size_t)[] sequences; 57 Graph!(Symbol, size_t, RegexLookaheadEdgeExtra) dfa; 58 bool frozen; 59 } 60 61 class RegexLookahead 62 { 63 alias G = Graph!(Symbol, size_t, RegexLookaheadEdgeExtra); 64 alias NodeID = G.NodeID; 65 EBNFGrammar grammar; 66 const bool[ProductionID] reduceConflictProductions; 67 EBNFGrammar grammar2; 68 bool useRegexlookahead; 69 70 RegexLookaheadGraph[] graphs; 71 RegexLookaheadGraph[immutable(SymbolInstance[])[]] innerGraphs; 72 73 bool grammarFinished; 74 75 immutable(SymbolInstance)[][TokenID] subGraphTokens; 76 TokenID subGraphToken; 77 78 TokenID[][TokenID] matchingTokensMap; 79 bool[TokenID] matchingTokensAll; 80 81 this(EBNFGrammar grammar, const bool[ProductionID] reduceConflictProductions, 82 bool useRegexlookahead) 83 { 84 this.grammar = grammar; 85 this.reduceConflictProductions = reduceConflictProductions; 86 this.useRegexlookahead = useRegexlookahead; 87 88 foreach (m; grammar.matchingTokens) 89 { 90 matchingTokensMap[m[0]] ~= m[1]; 91 matchingTokensAll[m[0]] = true; 92 matchingTokensAll[m[1]] = true; 93 } 94 95 grammar2 = new EBNFGrammar(null); 96 grammar2.nonterminals = grammar.nonterminals.dup; 97 grammar2.tokens = grammar.tokens.dup; 98 grammar2.productionsData = grammar.productionsData; 99 grammar2.matchingTokens = grammar.matchingTokens.dup; 100 101 subGraphToken = grammar2.tokens.id("$subGraph"); 102 grammar2.tokens.id("$anything"); 103 104 const(Production)*[] productionsData; 105 productionsData.length = grammar.productionsData.length; 106 foreach (i, p; grammar.productionsData) 107 { 108 if (p is null) 109 continue; 110 immutable(SymbolInstance)[] symbols = p.symbols; 111 for (size_t j = 0; j < symbols.length; j++) 112 { 113 if (j && symbols[j - 1].isToken) 114 { 115 size_t k = j; 116 while (k < symbols.length && !symbols[k].isToken) 117 k++; 118 if (k >= symbols.length) 119 break; 120 assert(symbols[k].isToken); 121 122 TokenID startToken = symbols[j - 1].toTokenID; 123 bool foundMatching; 124 for (; k < symbols.length; k++) 125 { 126 if (!symbols[k].isToken) 127 continue; 128 TokenID endToken = symbols[k].toTokenID; 129 foreach (matching; grammar.matchingTokens) 130 { 131 if (startToken == matching[0] && endToken == matching[1]) 132 foundMatching = true; 133 } 134 if (foundMatching) 135 break; 136 } 137 if (!foundMatching) 138 continue; 139 140 string subSymbolsName = "$subSymbols("; 141 immutable(SymbolInstance)[] subSymbols; 142 foreach (s; symbols[j .. k]) 143 { 144 if (!s.isToken) 145 subSymbols ~= SymbolInstance(normalizeNonterminalID(grammar, 146 s.symbol.toNonterminalID)); 147 else 148 subSymbols ~= SymbolInstance(s.symbol); 149 subSymbolsName ~= text(grammar.getSymbolName(s), ", "); 150 } 151 subSymbolsName ~= ")"; 152 153 TokenID t = grammar2.tokens.id(subSymbolsName); 154 subGraphTokens[t] = subSymbols; 155 symbols = symbols[0 .. j] ~ SymbolInstance(t) ~ symbols[k .. $]; 156 } 157 } 158 159 if (symbols is p.symbols) 160 productionsData[i] = p; 161 else 162 { 163 auto p2 = new Production(p.nonterminalID, symbols); 164 p2.productionID = cast(ProductionID) i; 165 productionsData[i] = p2; 166 } 167 } 168 grammar2.productionsData = productionsData; 169 170 matchGraph = new Graph!(TokenID[2], size_t); 171 } 172 173 NonterminalID[ElementID] lookaheadNonterminals; 174 NonterminalID getLookaheadNonterminal(LRGraph graph, size_t state, size_t elementNr) 175 { 176 auto entry = ElementID(state, elementNr) in lookaheadNonterminals; 177 if (entry) 178 return *entry; 179 assert(!grammarFinished); 180 181 auto node = graph.states[state]; 182 auto element = node.elements[elementNr]; 183 184 NonterminalID l1 = grammar2.nonterminals.id(text("$l_", state, "_", elementNr)); 185 lookaheadNonterminals[ElementID(state, elementNr)] = l1; 186 187 if (element.dotPos == 0 && element.isStartElement) 188 grammar2.addProduction(new Production(l1, 189 [SymbolInstance(grammar2.tokens.id("$end"))])); 190 if (element.dotPos == 0 && !element.isStartElement) 191 { 192 foreach (k2, element2; node.elements) 193 { 194 bool elementPossible; 195 foreach (n; element2.nextNonterminals(grammar, graph.globalOptions.directUnwrap)) 196 { 197 if (n.nonterminalID == element.production.nonterminalID) 198 { 199 elementPossible = true; 200 } 201 } 202 if (elementPossible) 203 { 204 auto afterNext = element2.afterNext(grammar); 205 if (useRegexlookahead || element2.production.symbols.length == 1 206 || element.production.productionID in reduceConflictProductions 207 || element2.production.annotations.contains!"lookahead" 208 || afterNext.length == 0) 209 { 210 NonterminalID l2 = getLookaheadNonterminal(graph, state, k2); 211 grammar2.addProduction(new Production(l1, 212 afterNext ~ [immutable(SymbolInstance)(l2)])); 213 } 214 else if (afterNext.length > 0) 215 { 216 grammar2.addProduction(new Production(l1, 217 afterNext ~ [ 218 immutable(SymbolInstance)(grammar2.tokens.id("$anything")) 219 ])); 220 } 221 else 222 { 223 grammar2.addProduction(new Production(l1, 224 [ 225 SymbolInstance(grammar2.tokens.id("$anything")) 226 ])); 227 } 228 } 229 } 230 } 231 232 foreach (prev; element.prevElements) 233 { 234 if (prev.state == size_t.max) 235 { 236 continue; 237 } 238 239 NonterminalID l2 = getLookaheadNonterminal(graph, prev.state, prev.elementNr); 240 grammar2.addProduction(new Production(l1, [SymbolInstance(l2)])); 241 } 242 243 return l1; 244 } 245 246 NonterminalID[ElementID] lookaheadNonterminalsSimple; 247 NonterminalID getLookaheadNonterminalSimple(LRGraph graph, size_t state, size_t elementNr) 248 { 249 auto entry = ElementID(state, elementNr) in lookaheadNonterminalsSimple; 250 if (entry) 251 return *entry; 252 assert(!grammarFinished); 253 254 auto node = graph.states[state]; 255 auto element = node.elements[elementNr]; 256 257 NonterminalID l1 = grammar2.nonterminals.id(text("$lsimple_", state, "_", elementNr)); 258 lookaheadNonterminalsSimple[ElementID(state, elementNr)] = l1; 259 260 foreach (t; element.lookahead.bitsSet) 261 { 262 immutable(SymbolInstance)[] dummySymbols = [ 263 immutable(SymbolInstance)(t) 264 ]; 265 foreach (matchingTokens; grammar.matchingTokens) 266 { 267 if (t == matchingTokens[0]) 268 { 269 dummySymbols ~= immutable(SymbolInstance)(matchingTokens[1]); 270 break; 271 } 272 } 273 grammar2.addProduction(new Production(l1, dummySymbols)); 274 } 275 return l1; 276 } 277 278 bool isSimpleNonterminal(NonterminalID n) 279 { 280 foreach (p; grammar2.getProductions(n)) 281 { 282 foreach (s; p.symbols) 283 if (!s.isToken) 284 return false; 285 } 286 return true; 287 } 288 289 G[NonterminalID] nonterminalGraphCache; 290 G genNonterminalGraph(NonterminalID startNonterminal) 291 { 292 startNonterminal = normalizeNonterminalID(grammar2, startNonterminal); 293 294 if (startNonterminal in nonterminalGraphCache) 295 return nonterminalGraphCache[startNonterminal]; 296 297 immutable endTok = grammar.tokens.getID("$end"); 298 299 G g = new G(); 300 g.start = g.addNode("start"); 301 302 immutable(size_t)[] results = [0]; 303 NodeID end = g.addNode("end"); 304 NodeID recNode1 = g.addNode("r"); 305 NodeID recNode2 = g.addNode("r"); 306 g.get(end).results = results; 307 NodeID[2][NonterminalID] done; 308 NodeID[2] genSubgraph(NonterminalID nonterminal) 309 { 310 if (nonterminal in done) 311 return done[nonterminal]; 312 NodeID[2] nonterminalNodes = [ 313 g.addNode("start " ~ grammar2.getSymbolName(nonterminal)), 314 g.addNode("end " ~ grammar2.getSymbolName(nonterminal)) 315 ]; 316 done[nonterminal] = nonterminalNodes; 317 scope (success) 318 if (isSimpleNonterminal(nonterminal)) 319 done.remove(nonterminal); 320 foreach (p; grammar2.getProductions(nonterminal)) 321 { 322 NodeID current = nonterminalNodes[0]; 323 foreach (s; p.symbols) 324 { 325 NodeID n = g.addNode("x"); 326 if (s.isToken) 327 { 328 if (s.toTokenID in subGraphTokens) 329 { 330 g.addEdge(current, n, subGraphToken, EdgeFlags.none, 331 RegexLookaheadEdgeExtra([ 332 subGraphTokens[s.toTokenID] 333 ])); 334 } 335 else 336 g.addEdge(current, n, s); 337 } 338 else 339 { 340 auto subNodes = genSubgraph(s.toNonterminalID); 341 g.addEdge(current, subNodes[0], Symbol.invalid); 342 g.addEdge(subNodes[1], n, Symbol.invalid); 343 } 344 current = n; 345 } 346 g.addEdge(current, nonterminalNodes[1], Symbol.invalid); 347 } 348 return nonterminalNodes; 349 } 350 351 auto nonterminalNodes = genSubgraph(startNonterminal); 352 g.addEdge(g.start, nonterminalNodes[0], Symbol.invalid); 353 g.addEdge(nonterminalNodes[1], end, Symbol.invalid); 354 355 string symbolName(Symbol s) 356 { 357 return grammar2.getSymbolName(s); 358 } 359 360 g = g.makeDeterministic!(typeof(g))(g.start).minimizeDFA; 361 nonterminalGraphCache[startNonterminal] = g; 362 363 return g; 364 } 365 366 Symbol[][][NonterminalID] nonterminalGraphSimpleCache; 367 Symbol[][] genNonterminalGraphSimple(NonterminalID startNonterminal) 368 { 369 startNonterminal = normalizeNonterminalID(grammar2, startNonterminal); 370 371 if (startNonterminal in nonterminalGraphSimpleCache) 372 return nonterminalGraphSimpleCache[startNonterminal]; 373 374 Appender!(Symbol[][]) sequences; 375 376 immutable endTok = grammar.tokens.getID("$end"); 377 378 bool[NonterminalID] done; 379 void genSubgraph(NonterminalID nonterminal) 380 { 381 if (nonterminal in done) 382 return; 383 done[nonterminal] = true; 384 foreach (p; grammar2.getProductions(nonterminal)) 385 { 386 Symbol[] sequence; 387 foreach (i, s; p.symbols) 388 { 389 if (s.isToken) 390 { 391 if (sequence.length) 392 { 393 bool subGraphNear; 394 if (i) 395 { 396 auto s2 = p.symbols[i - 1]; 397 if (s2.isToken && s2.toTokenID in subGraphTokens) 398 { 399 subGraphNear = true; 400 } 401 } 402 foreach (matching; grammar.matchingTokens) 403 { 404 if (s.toTokenID == matching[1]) 405 { 406 subGraphNear = true; 407 } 408 } 409 foreach (j, s2; p.symbols[i .. $]) 410 { 411 if (!s2.isToken || j > 0) 412 break; 413 if (s2.toTokenID in subGraphTokens) 414 { 415 subGraphNear = true; 416 break; 417 } 418 } 419 if (!subGraphNear) 420 { 421 sequences.put(sequence); 422 sequence = []; 423 } 424 } 425 426 sequence ~= s; 427 } 428 else 429 { 430 if (sequence.length) 431 { 432 sequences.put(sequence); 433 sequence = []; 434 } 435 genSubgraph(s.toNonterminalID); 436 } 437 } 438 if (sequence.length) 439 { 440 sequences.put(sequence); 441 sequence = []; 442 } 443 } 444 } 445 446 genSubgraph(startNonterminal); 447 448 nonterminalGraphSimpleCache[startNonterminal] = sequences.data; 449 450 return sequences.data; 451 } 452 453 static void clusterResults(G g) 454 { 455 immutable(size_t)[][] clusters; 456 size_t findCluster(size_t r) 457 { 458 foreach (i, c; clusters) 459 { 460 if (c.canFind(r)) 461 return i; 462 } 463 return size_t.max; 464 } 465 466 foreach (n; g.nodeIDs) 467 { 468 immutable(size_t)[] newResults; 469 foreach (r; g.get(n).results) 470 { 471 size_t c = findCluster(r); 472 if (c == size_t.max) 473 newResults.addOnce(r); 474 else 475 { 476 newResults.addOnce(clusters[c]); 477 clusters = clusters[0 .. c] ~ clusters[c + 1 .. $]; 478 } 479 } 480 if (newResults.length) 481 clusters ~= newResults; 482 } 483 484 foreach (n; g.nodeIDs) 485 { 486 if (g.get(n).results.length) 487 { 488 auto c = findCluster(g.get(n).results[0]); 489 g.get(n).results = clusters[c]; 490 } 491 } 492 } 493 494 void genGraph(RegexLookaheadGraph regexLookaheadGraph) 495 { 496 regexLookaheadGraph.frozen = true; 497 G g = new G(); 498 g.start = g.addNode("start"); 499 500 foreach (sequence; regexLookaheadGraph.sequences) 501 { 502 g.get(g.start).results.addOnce(sequence[1]); 503 504 immutable(size_t)[] results = [sequence[1]]; 505 506 if (regexLookaheadGraph.isInnerGraph) 507 { 508 bool[immutable(Symbol)[]] sequencesDone; 509 foreach (j, s; sequence[0]) 510 { 511 if (s.isToken) 512 { 513 g.addEdge(g.start, g.start, s); 514 } 515 else 516 { 517 foreach (sequence2; genNonterminalGraphSimple(s.toNonterminalID)) 518 { 519 if (sequence2 in sequencesDone) 520 continue; 521 sequencesDone[sequence2.idup] = true; 522 NodeID current = g.start; 523 foreach (s2; sequence2) 524 { 525 NodeID n = g.addNode("x"); 526 g.addEdge(current, n, s2); 527 current = n; 528 } 529 g.addEdge(current, g.start, Symbol.invalid); 530 } 531 } 532 } 533 g.get(g.start).results = results; 534 } 535 else 536 { 537 NodeID current = g.start; 538 539 bool[immutable(Symbol)[]] firstSequencesDone; 540 foreach (j, s; sequence[0]) 541 { 542 NodeID end = g.addNode("x"); 543 g.get(end).results = results; 544 if (s.isToken) 545 { 546 g.addEdge(current, end, s); 547 } 548 else 549 { 550 G g2 = genNonterminalGraph(s.toNonterminalID); 551 552 NodeID[NodeID] nodeMap; 553 foreach (i; g2.nodeIDs) 554 { 555 NodeID n = g.addNode("x"); 556 nodeMap[i] = n; 557 g.get(n).results = results; 558 if (g2.get(i).results.length) 559 g.addEdge(n, end, Symbol.invalid); 560 } 561 foreach (i; g2.nodeIDs) 562 { 563 foreach (edge; g2.get(i).edges) 564 { 565 g.addEdge(nodeMap[i], nodeMap[edge.next], 566 edge.symbol, EdgeFlags.none, edge.extra); 567 } 568 } 569 g.addEdge(current, nodeMap[g2.start], Symbol.invalid); 570 } 571 current = end; 572 } 573 } 574 } 575 576 string symbolsName(G.Edge[] edges) 577 { 578 string r; 579 foreach (i, e; edges) 580 { 581 if (i) 582 r ~= "\n"; 583 r ~= grammar2.getSymbolName(e.symbol); 584 foreach (sequence; e.extra.sequences) 585 { 586 r ~= "\n -"; 587 foreach (s; sequence) 588 r ~= " " ~ grammar2.getSymbolName(s); 589 } 590 } 591 return r; 592 } 593 594 g = g.makeDeterministic!(typeof(g))(g.start, !regexLookaheadGraph.isInnerGraph); 595 g = g.minimizeDFA(); 596 597 if (regexLookaheadGraph.isInnerGraph) 598 { 599 clusterResults(g); 600 g = g.minimizeDFA(); 601 602 immutable size_t[] simpleResults = [0]; 603 foreach (n; g.nodeIDs) 604 { 605 if (g.get(n).results.length) 606 g.get(n).results = simpleResults; 607 } 608 g = g.minimizeDFA(); 609 } 610 611 regexLookaheadGraph.dfa = g; 612 613 foreach (n; g.nodeIDs) 614 { 615 foreach (e; g.getEdges(n)) 616 { 617 if (e.symbol.isToken && e.symbol.toTokenID in matchingTokensMap) 618 { 619 immutable(SymbolInstance)[][] sequences; 620 TokenID[] endTokens; 621 bool hasNonMatchingNext; 622 foreach (e2; g.getEdges(e.next)) 623 { 624 if (e2.symbol.isToken && e2.symbol.toTokenID == subGraphToken) 625 { 626 sequences.addOnce(e2.extra.sequences); 627 foreach (e3; g.getEdges(e2.next)) 628 { 629 endTokens.addOnce(e3.symbol.toTokenID); 630 } 631 } 632 else 633 hasNonMatchingNext = true; 634 } 635 sequences.sort(); 636 enforce(!hasNonMatchingNext || sequences.length == 0); 637 638 if (sequences.length) 639 { 640 auto matchNode = buildMatchGraph(sequences); 641 matchGraph.otherStarts ~= matchNode; 642 matchGraphStartIndex[sequences.idup] = matchGraph.otherStarts.length - 1; 643 foreach (matchEdge; matchGraph.getEdges(matchNode)) 644 { 645 if (matchEdge.symbol[1] == TokenID.invalid) 646 enforce(!endTokens.canFind(matchEdge.symbol[0])); 647 } 648 } 649 } 650 } 651 } 652 } 653 654 Graph!(TokenID[2], size_t) matchGraph; 655 Graph!(TokenID[2], size_t).NodeID[immutable(SymbolInstance[])[]] matchGraphStates; 656 size_t[immutable(SymbolInstance[])[]] matchGraphStartIndex; 657 658 Graph!(TokenID[2], size_t).NodeID buildMatchGraph(immutable(SymbolInstance)[][] sequences) 659 { 660 if (sequences in matchGraphStates) 661 { 662 return matchGraphStates[sequences]; 663 } 664 else 665 { 666 string nodeText; 667 foreach (i, sequence; sequences) 668 { 669 if (i) 670 nodeText ~= "\n"; 671 foreach (s; sequence) 672 nodeText ~= " " ~ grammar2.getSymbolName(s); 673 } 674 675 auto n = matchGraph.addNode(nodeText); 676 matchGraphStates[sequences.idup] = n; 677 678 bool[TokenID] usedAlone; 679 immutable(SymbolInstance)[][][TokenID[2]] usedMatching; 680 681 bool[Symbol] done; 682 void genSubgraph(Symbol s) 683 { 684 if (s in done) 685 return; 686 done[s] = true; 687 if (s.isToken) 688 { 689 if (s.toTokenID in matchingTokensAll) 690 { 691 if (s.toTokenID in usedAlone) 692 enforce(usedAlone[s.toTokenID]); 693 else 694 { 695 usedAlone[s.toTokenID] = true; 696 matchGraph.addEdge(n, n, [ 697 s.toTokenID, TokenID.invalid 698 ]); 699 } 700 } 701 } 702 else 703 { 704 foreach (p; grammar2.getProductions(s.toNonterminalID)) 705 { 706 for (size_t i = 0; i < p.symbols.length; i++) 707 { 708 if (i + 2 < p.symbols.length && p.symbols[i + 1].isToken 709 && p.symbols[i + 1].toTokenID in subGraphTokens) 710 { 711 auto t1 = p.symbols[i].toTokenID; 712 auto t2 = p.symbols[i + 2].toTokenID; 713 714 if (t1 in usedAlone) 715 enforce(!usedAlone[t1]); 716 else 717 usedAlone[t1] = false; 718 719 if (t2 in usedAlone) 720 enforce(!usedAlone[t2]); 721 else 722 usedAlone[t2] = false; 723 724 immutable(SymbolInstance)[][] sequences2 = usedMatching.get([ 725 t1, t2 726 ], []); 727 sequences2.addOnce(subGraphTokens[p.symbols[i + 1].toTokenID]); 728 usedMatching[[t1, t2]] = sequences2; 729 730 i += 2; 731 } 732 else 733 genSubgraph(p.symbols[i]); 734 } 735 } 736 } 737 } 738 739 foreach (sequence; sequences) 740 { 741 foreach (s; sequence) 742 genSubgraph(s); 743 } 744 745 foreach (pair, sequences2; usedMatching) 746 { 747 sequences2.sort(); 748 auto n2 = buildMatchGraph(sequences2); 749 matchGraph.addEdge(n, n2, pair); 750 } 751 752 return n; 753 } 754 } 755 756 void genGraphs() 757 { 758 grammar2.calcNonterminalCanBeEmpty(); 759 grammar2.fillProductionsForNonterminal(); 760 761 size_t lastLength = graphs.length; 762 for (size_t i = 0; i < graphs.length; i++) 763 { 764 if (i >= lastLength) 765 { 766 lastLength = graphs.length; 767 } 768 769 if (graphs[i] is null) 770 continue; 771 772 if (graphs[i].isInnerGraph) 773 { 774 immutable(SymbolInstance)[][] sequences; 775 foreach (sequence; graphs[i].sequences) 776 sequences.addOnce(sequence[0]); 777 bool[immutable(SymbolInstance)[]] sequenceMap; 778 foreach (sequence; sequences) 779 sequenceMap[sequence] = true; 780 781 bool changed = true; 782 while (changed) 783 { 784 changed = false; 785 786 foreach (j; 0 .. graphs.length) 787 { 788 if (i == j) 789 continue; 790 if (graphs[j] is null || !graphs[j].isInnerGraph) 791 continue; 792 size_t commonSequences; 793 foreach (sequence; graphs[j].sequences) 794 if (sequence[0] in sequenceMap) 795 commonSequences++; 796 if (commonSequences) 797 { 798 foreach (sequence; graphs[j].sequences) 799 { 800 if (sequence[0]!in sequenceMap) 801 { 802 sequences ~= sequence[0]; 803 sequenceMap[sequence[0]] = true; 804 } 805 } 806 graphs[j] = null; 807 changed = true; 808 } 809 } 810 } 811 sequences.sort(); 812 graphs[i].sequences = []; 813 foreach (k, sequence; sequences) 814 graphs[i].sequences ~= tuple!(immutable(SymbolInstance)[], size_t)(sequence, k); 815 } 816 817 genGraph(graphs[i]); 818 } 819 820 matchGraph = matchGraph.minimizeDFA(); 821 822 string symbolPairName(TokenID[2] pair) 823 { 824 return grammar2.getSymbolName(pair[0]) ~ "..." ~ grammar2.getSymbolName(pair[1]); 825 } 826 827 grammarFinished = true; 828 } 829 830 void genMatchFuncs(ref CodeWriter code) 831 { 832 foreach (n; matchGraph.nodeIDs) 833 { 834 size_t[TokenID] actions; 835 foreach (m; grammar.matchingTokens) 836 { 837 actions[m[1]] = size_t.max; 838 } 839 foreach (e; matchGraph.getEdges(n)) 840 { 841 if (e.symbol[1] == TokenID.invalid) 842 { 843 actions[e.symbol[0]] = size_t.max - 1; 844 } 845 else 846 { 847 actions[e.symbol[0]] = e.next.id; 848 } 849 } 850 851 mixin(genCode("code", q{ 852 int skipMatchingTokens$(n.id)(ref Lexer tmpLexer) 853 { 854 while (!tmpLexer.empty) 855 { 856 switch (tmpLexer.front.symbol) 857 { 858 $$foreach (t; actions.sortedKeys) { 859 case Lexer.tokenID!$(tokenDCode(grammar.tokens[t])): 860 $$if (actions[t] == size_t.max) { 861 return 0; 862 $$} else if (actions[t] == size_t.max - 1) { 863 break; 864 $$} else { 865 tmpLexer.popFront; 866 if (skipMatchingTokens$(actions[t])(tmpLexer) < 0) 867 return -1; 868 if (tmpLexer.empty) 869 { 870 lastError = new SingleParseException!Location("Error for lookahead", lexer.front.currentLocation, tmpLexer.front.currentLocation); 871 return -1; 872 } 873 if ($(grammar.matchingTokens.filter!(m=>m[0] == t).map!(m=>text("tmpLexer.front.symbol != Lexer.tokenID!", tokenDCode(grammar.tokens[m[1]]))).joiner(" && "))) 874 { 875 lastError = new SingleParseException!Location("Error for lookahead", lexer.front.currentLocation, tmpLexer.front.currentLocation); 876 return -1; 877 } 878 break; 879 $$} 880 $$} 881 default: 882 break; 883 } 884 tmpLexer.popFront; 885 } 886 lastError = new SingleParseException!Location("Error for lookahead", lexer.front.currentLocation, tmpLexer.front.currentLocation); 887 return -1; 888 } 889 })); 890 } 891 } 892 893 void genGraph(ref CodeWriter code, RegexLookaheadGraph regexLookaheadGraph) 894 { 895 if (regexLookaheadGraph is null) 896 return; 897 assert(grammarFinished); 898 899 code.writeln("SymbolID checkRegexLookahead", regexLookaheadGraph.id, "()"); 900 code.writeln("{").incIndent; 901 902 foreach (sequence; regexLookaheadGraph.sequences) 903 { 904 code.write("// ", sequence[1], ":"); 905 foreach (s; sequence[0]) 906 code.write(" ", grammar2.getSymbolName(s)); 907 code.writeln(); 908 } 909 910 immutable endTok = grammar.tokens.getID("$end"); 911 immutable anythingTok = grammar2.tokens.getID("$anything"); 912 913 auto dfa = regexLookaheadGraph.dfa; 914 915 code.writeln("Lexer tmpLexer = *lexer;"); 916 assert(dfa.get(dfa.start).results.length > 1); 917 code.writeln("goto state", dfa.start.id, "b;"); 918 919 foreach (n; dfa.nodeIDs) 920 { 921 code.writeln("state", n.id, ":"); 922 code.incIndent; 923 924 string resultsComment; 925 bool[size_t] resultDone; 926 string resultString(size_t res) 927 { 928 return text(res); 929 } 930 931 foreach (res; dfa.get(n).results) 932 { 933 resultDone[res] = true; 934 resultsComment ~= " " ~ resultString(res); 935 } 936 auto tmpReachableResults = reachableResults(dfa, n); 937 sort!((a, b) => a < b)(tmpReachableResults); 938 foreach (res; tmpReachableResults) 939 { 940 if (res in resultDone) 941 continue; 942 resultsComment ~= " (" ~ resultString(res) ~ ")"; 943 } 944 code.writeln("//", resultsComment); 945 if (dfa.get(n).results.length == 1 && dfa.get(n).results[0] != size_t.max) 946 { 947 mixin(genCode("code", q{ 948 return $(dfa.get(n).results[0]); 949 })); 950 } 951 else if (dfa.get(n).results.length == 1 && dfa.get(n).results[0] == size_t.max) 952 { 953 code.writeln(q{ lastError = new SingleParseException!Location("Error for lookahead", lexer.front.currentLocation, tmpLexer.front.currentLocation);}); 954 code.writeln(q{ return -1;}); 955 } 956 else 957 { 958 immutable(SymbolInstance)[][] sequences; 959 foreach (e; dfa.getEdges(n)) 960 { 961 if (e.symbol.isToken && e.symbol.toTokenID == subGraphToken) 962 { 963 sequences.addOnce(e.extra.sequences); 964 } 965 } 966 967 code.writeln("tmpLexer.popFront();"); 968 code.decIndent.writeln("state", n.id, "b:").incIndent; 969 970 if (sequences.length) 971 { 972 sequences.sort(); 973 auto matchNode = matchGraph.otherStarts[matchGraphStartIndex[sequences]]; 974 code.writeln("if (skipMatchingTokens", matchNode.id, "(tmpLexer) < 0)"); 975 code.writeln(" return SymbolID.max;"); 976 foreach (e; dfa.getEdges(n)) 977 { 978 code.writeln("goto state", e.next.id, "b;"); 979 } 980 } 981 else 982 { 983 G.Edge[][] edges; 984 size_t[G.NodeID] edgesMap; 985 G.Edge edgeEmpty; 986 bool hasAnythingEdge; 987 foreach (e; dfa.getEdges(n)) 988 { 989 if (e.symbol == endTok) 990 { 991 edgeEmpty = e; 992 continue; 993 } 994 if (e.symbol == anythingTok) 995 { 996 hasAnythingEdge = true; 997 continue; 998 } 999 if (e.next !in edgesMap) 1000 { 1001 edgesMap[e.next] = edges.length; 1002 edges.length++; 1003 } 1004 edges[edgesMap[e.next]] ~= e; 1005 } 1006 sort(edges); 1007 stdout.flush(); 1008 enforce(!hasAnythingEdge 1009 || (edgeEmpty.next == G.NodeID.invalid && edges.length == 0)); 1010 1011 size_t maxEdge = size_t.max; 1012 size_t maxEdgeN = 0; 1013 1014 code.writeln("if (tmpLexer.empty)"); 1015 if (edgeEmpty.next == G.NodeID.invalid) 1016 { 1017 code.writeln("{"); 1018 code.writeln(q{ lastError = new SingleParseException!Location("EOF for lookahead", lexer.front.currentLocation, tmpLexer.front.currentLocation);}); 1019 code.writeln(q{ return SymbolID.max;}); 1020 code.writeln("}"); 1021 } 1022 else 1023 code.writeln(" goto state", edgeEmpty.next.id, ";"); 1024 1025 foreach (i, e; edges) 1026 { 1027 if (i != maxEdge) 1028 { 1029 code.write("else "); 1030 code.writeln("if (tmpLexer.front.symbol.among(", e.map!(x => text("Lexer.tokenID!", 1031 grammar.tokens[x.symbol.toTokenID].tokenDCode)).joiner(", "), 1032 "))"); 1033 code.writeln(" goto state", e[0].next.id, ";"); 1034 } 1035 } 1036 code.write("else //"); 1037 if (edgeEmpty.next != G.NodeID.invalid) 1038 { 1039 code.writeln(); 1040 code.writeln(" goto state", edgeEmpty.next.id, "; //like empty"); 1041 } 1042 else if (maxEdge == size_t.max) 1043 { 1044 code.writeln(); 1045 code.writeln("{"); 1046 code.writeln(q{ lastError = new SingleParseException!Location("Error for lookahead", lexer.front.currentLocation, tmpLexer.front.currentLocation);}); 1047 code.writeln(q{ return SymbolID.max;}); 1048 code.writeln("}"); 1049 } 1050 else 1051 { 1052 foreach (x; edges[maxEdge]) 1053 code.write(" ", grammar.getSymbolName(x.symbol)); 1054 code.writeln(); 1055 code.writeln(" goto state", edges[maxEdge][0].next.id, ";"); 1056 } 1057 } 1058 } 1059 code.decIndent; 1060 } 1061 if (dfa.nodes.length == 0) 1062 { 1063 code.writeln(q{lastError = new SingleParseException!Location("Error for lookahead", lexer.front.currentLocation, tmpLexer.front.currentLocation);}); 1064 code.writeln("return SymbolID.max;"); 1065 } 1066 code.decIndent.writeln("}"); 1067 } 1068 }