// Copyright Tim Schendekehl 2023.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// https://www.boost.org/LICENSE_1_0.txt)

module dparsergen.generator.lexergenerator;
import dparsergen.core.utils;
import dparsergen.generator.codewriter;
import dparsergen.generator.grammar;
import dparsergen.generator.nfa;
import dparsergen.generator.parsercodegencommon;
import dparsergen.generator.production;
import std.algorithm;
import std.conv;
import std.exception;
import std.range;
import std.stdio;
import std.uni;
import std.utf;

/// Result attached to an accepting node of the lexer automaton.
struct LexerAction
{
    /// Token nonterminal recognized at this node; `invalid` for helper
    /// results used while building intermediate automata.
    NonterminalID nonterminal = NonterminalID.invalid;
    /// Marker used for matches that should not produce a token by themselves
    /// (e.g. the subtracted operand in `$tokenminus`, stored/compared subtokens).
    bool isIgnoreToken;
    /// True if this result represents a matched negative lookahead.
    bool isNegLookahead;
}

/// Edge label in the lexer automaton: a set of codepoints, special flags,
/// or a marker entering a recursive sub-lexer.
struct DCharToken
{
    // Points at one entry of the generator's codepoint-set list;
    // null for epsilon/special edges.
    CodepointSet* codepoints;
    EdgeFlags specialFlags;
    // Non-empty for edges entering a recursive sub-lexer; holds the
    // remaining symbols of the production (see `recursiveLexer` handling).
    immutable(SymbolInstance)[] recursiveSequence;
    enum invalid = DCharToken();
    string toString() const
    {
        if (codepoints)
            // The cast drops const so codepointSetToStr can take the set by value.
            return codepointSetToStr(*cast(CodepointSet*) codepoints);
        if (recursiveSequence.length)
            return "RecursiveLexer";
        return text(specialFlags);
    }
}

/// Renders a codepoint set as a regex-like character class, e.g. `[a-z0-9]`.
string codepointSetToStr(CodepointSet set)
{
    string r = "[";

    // A set built with `.inverted` contains the maximal codepoint U+10FFFF,
    // so print such sets in the shorter negated form `[^...]`.
    if (0x10ffff in set)
    {
        r ~= "^";
        set = set.inverted;
    }

    foreach (interval; set.byInterval)
    {
        // Intervals are half-open: [interval[0], interval[1]).
        if (interval[0] + 1 == interval[1])
            r ~= interval[0].escapeCodePoint(true);
        else
            r ~= text(interval[0].escapeCodePoint(true), "-",
                    escapeCodePoint(interval[1] - 1, true));
    }
    r ~= "]";
    return r;
}

/// Intersects `set` with `assumed` and then re-bridges any gap of `assumed`
/// whose immediate neighbours on both sides are still in the result.
/// This keeps the number of intervals low for codepoints that cannot occur here.
CodepointSet restrictToAssumedCodepoints(CodepointSet set, CodepointSet assumed)
{
    set = set & assumed;
    foreach (interval; assumed.inverted.byInterval)
    {
        // `interval` is a maximal range outside `assumed`; merge it in when the
        // codepoints directly before and after it are both in `set`.
        if ((interval[0] != 0 && interval[0] - 1 in set) && interval[1] in set)
            set = set | CodepointSet(interval[0], interval[1]);
    }
    return set;
}

/// Builds a D boolean expression (as source text) testing whether the
/// character denoted by `varCode` is in `set`, assuming the character is
/// already known to lie within `assumed`.
string codepointSetToCode(string varCode, CodepointSet set,
        CodepointSet assumed = 
CodepointSet().inverted)
{
    bool inverted;
    // Inverted sets contain the maximal codepoint U+10FFFF; generate the
    // complement test for them.
    if (0x10ffff in set)
    {
        inverted = true;
        set = set.inverted;
    }
    // Drop codepoints that cannot occur here anyway, merging intervals.
    set = restrictToAssumedCodepoints(set, assumed);
    return codepointSetToCode(varCode, set, inverted);
}

/// Builds a D boolean expression testing whether `varCode` is in `set`;
/// `inverted` negates the whole test.
string codepointSetToCode(string varCode, CodepointSet set, bool inverted)
{
    if (set.empty)
    {
        // Nothing can match: the expression is the constant "true"/"false".
        return inverted.to!string;
    }

    string r;

    if (set.byInterval.length == 1 && set.byInterval.front[0] + 1 == set.byInterval.front[1])
    {
        // Single codepoint: plain equality/inequality test.
        r ~= varCode;
        r ~= (inverted) ? " != " : " == ";
        r ~= text("'", set.byInterval.front[0].escapeCodePoint(false), "'");
    }
    else
    {
        if (inverted)
            r ~= "!(";

        CommaGen orCode = CommaGen(" || ");

        // One `==` or range check per interval, joined with `||`.
        foreach (interval; set.byInterval)
        {
            r ~= orCode();
            if (interval[0] + 1 == interval[1])
                r ~= text(varCode, " == '", interval[0].escapeCodePoint(false), "'");
            else
                r ~= text("(", varCode, " >= '", interval[0].escapeCodePoint(false),
                        "' && ", varCode, " <= '", (interval[1] - 1).escapeCodePoint(false), "')");
        }
        if (inverted)
            r ~= ")";
    }
    return r;
}

/// Adds the automaton fragment for terminal `symbol` between `start` and
/// `end` to graph `g`. The symbol name decides the kind: a double-quoted
/// name is a literal string, `[...]` is a character class.
/// Edges are labeled with the entries of `codepointSets` that intersect the
/// matched codepoints; `edgeFlags` is applied to the last consuming edge.
void generateTokenGraph(const EBNFGrammar lexerGrammar, Graph!(DCharToken, LexerAction) g,
        CodepointSet*[] codepointSets, Symbol symbol, Graph!(DCharToken, LexerAction).NodeID start,
        Graph!(DCharToken, LexerAction).NodeID end, EdgeFlags edgeFlags = EdgeFlags.none)
{
    alias G = Graph!(DCharToken, LexerAction);
    alias NodeID = G.NodeID;

    string name = lexerGrammar.getSymbolName(symbol);

    if (name.endsWith("\""))
    {
        // Literal string token, e.g. "while": one edge per character.
        assert(name[$ - 1] == name[0]);
        name = name[1 .. 
$ - 1];
        NodeID current = start;
        size_t i;
        UnescapedChar[] chars = name.byDchar.byUnescapedDchar.array;
        if (edgeFlags != EdgeFlags.none)
            enforce(chars.length);
        foreach (k, d; chars)
        {
            scope (success)
                i++;
            NodeID next = g.addNode(text("x ", i));

            EdgeFlags edgeFlags2;
            // Only the last character of the literal carries the caller's flags.
            if (k == chars.length - 1)
            {
                edgeFlags2 = edgeFlags;
            }

            CodepointSet set = CodepointSet(d.c, d.c + 1);
            // Split the single-codepoint transition over the generator's
            // codepoint sets; only overlapping sets get an edge.
            foreach (codepointSet; codepointSets)
            {
                if (!(set & *codepointSet).empty)
                    g.addEdge(current, next, DCharToken(codepointSet), edgeFlags2);
            }
            current = next;
        }
        g.addEdge(current, end, DCharToken.invalid);
    }
    else if (name.endsWith("]"))
    {
        // Character class token, e.g. [a-z]: one edge per overlapping set.
        assert(name[0] == '[');
        name = name[1 .. $ - 1];
        CodepointSet set = codepointSetFromStr(name);
        foreach (codepointSet; codepointSets)
        {
            if (!(set & *codepointSet).empty)
                g.addEdge(start, end, DCharToken(codepointSet), edgeFlags);
        }
    }
    else if (name.endsWith("\'"))
    {
        assert(name[$ - 1] == name[0]);
        name = name[1 .. 
$ - 1];
        // Single-quoted tokens are not supported by the lexer generator.
        throw new Exception("not implemented");
    }
    else
        assert(false);
}

/// Recursively adds the automaton fragment for `currentNonterminal` between
/// `start` and `end` to `g`, expanding its productions.
/// Handles literal/character-class symbols, `$tokenminus` differences,
/// `recursiveLexer` productions, `store`/`compareTrue`/`compareFalse`
/// annotations and self-recursive (left-/right-regular) rules.
void genGraphForNonterminal(const EBNFGrammar lexerGrammar, Graph!(DCharToken, LexerAction) g,
        CodepointSet*[] codepointSets, NonterminalID currentNonterminal, Graph!(DCharToken, LexerAction)
        .NodeID start, Graph!(DCharToken, LexerAction).NodeID end,
        ref Graph!(DCharToken, LexerAction)[NonterminalID] graphByNonterminal)
{
    alias G = Graph!(DCharToken, LexerAction);
    alias NodeID = G.NodeID;

    // Tracks nonterminals currently being expanded, to reject cycles.
    bool[Symbol] done;
    void genSubgraph(Symbol symbol, NodeID start, NodeID end,
            EdgeFlags firstEdgeFlags = EdgeFlags.none)
    {
        assert(!symbol.isToken);
        if (symbol in done && done[symbol])
            throw new Exception(text("cycle ", lexerGrammar.getSymbolName(symbol)));
        done[symbol] = true;
        scope (success)
            done[symbol] = false;

        string name = lexerGrammar.getSymbolName(symbol);

        if (name.endsWith("\"") || name.endsWith("]") || name.endsWith("\'"))
        {
            // Terminal-like name: delegate to the token fragment builder.
            generateTokenGraph(lexerGrammar, g, codepointSets, symbol, start, end, firstEdgeFlags);
        }
        else if (name.startsWith("$tokenminus"))
        {
            // Token difference A - B: build one DFA containing both operands,
            // marking matches of the second operand as "ignore", then keep
            // only end states that match A but not B.
            size_t foundProductions;
            foreach (p; lexerGrammar.productions)
            {
                if (p.nonterminalID != symbol.toNonterminalID)
                    continue;
                foundProductions++;

                assert(p.symbols.length == 2);

                G dfa2 = new G();
                dfa2.start = dfa2.addNode("start");

                foreach (i, s; p.symbols)
                {
                    if (s.isToken)
                    {
                        auto endM = dfa2.addNode("end");
                        generateTokenGraph(lexerGrammar, dfa2, codepointSets,
                                s.toTokenID, dfa2.start, endM, EdgeFlags.none);
                        // i > 0: the second operand's results become "ignore" markers.
                        dfa2.get(endM).results = [
                            LexerAction(NonterminalID.invalid, i > 0)
                        ];
                    }
                    else
                    {
                        auto prevNodesLength = dfa2.nodes.length;
                        auto g1 = genGraphForNonterminal(lexerGrammar,
                                codepointSets, s.toNonterminalID, graphByNonterminal);

                        auto startM = dfa2.addCopy(g1)[0];
                        foreach (j; prevNodesLength .. 
dfa2.nodes.length)
                        {
                            if (dfa2.nodes[j].results.length)
                                dfa2.nodes[j].results = [
                                    LexerAction(NonterminalID.invalid, i > 0)
                                ];
                        }
                        dfa2.addEdge(dfa2.start, startM, DCharToken.invalid);
                    }
                }

                dfa2 = dfa2.makeDeterministic(dfa2.start);
                dfa2 = dfa2.minimizeDFA;
                // Keep only end states that match the first operand and not
                // the second ("ignore") operand.
                foreach (j; 0 .. dfa2.nodes.length)
                {
                    if (dfa2.nodes[j].results.length)
                    {
                        bool hasNotIgnored, hasIgnored;
                        foreach (r; dfa2.nodes[j].results)
                            if (r.isIgnoreToken)
                                hasIgnored = true;
                            else
                                hasNotIgnored = true;
                        if (!hasIgnored && hasNotIgnored)
                            dfa2.nodes[j].results = [
                                LexerAction(NonterminalID.invalid)
                            ];
                        else
                            dfa2.nodes[j].results = [];
                    }
                }
                dfa2 = dfa2.makeDeterministic(dfa2.start);
                dfa2 = dfa2.removeDeadStates;
                dfa2 = dfa2.minimizeDFA;

                enforce(firstEdgeFlags == EdgeFlags.none || dfa2.get(dfa2.start).results.length == 0);

                // Splice the difference DFA into the main graph.
                auto prevNodesLength = g.nodes.length;

                auto startM = g.addCopy(dfa2)[0];
                g.addEdge(start, startM, DCharToken.invalid);
                foreach (j; prevNodesLength .. 
g.nodes.length)
                {
                    if (g.nodes[j].results.length)
                    {
                        // Accepting states of the sub-DFA connect to `end`
                        // instead of carrying their own results.
                        g.nodes[j].results = [];
                        g.addEdge(g.nodeID(j), end, DCharToken.invalid);
                    }
                }
                foreach (ref e; g.get(startM).edges)
                {
                    e.flags = firstEdgeFlags;
                }
            }
            assert(foundProductions == 1, text(name, " foundProductions=", foundProductions));
        }
        else
        {
            // Ordinary nonterminal: expand every production for it.
            foreach (p; lexerGrammar.productions)
            {
                if (p.nonterminalID != symbol.toNonterminalID)
                    continue;

                enforce(p.symbols.length != 1 || p.symbols[0] != symbol, text("Error: Symbol ",
                        lexerGrammar.getSymbolName(symbol), " recursively uses just itself"));

                NodeID startP = g.addNode("start " ~ lexerGrammar.productionString(p));
                NodeID endP = g.addNode("end " ~ lexerGrammar.productionString(p));
                g.addEdge(endP, end, DCharToken.invalid);

                NodeID current = startP;
                bool connectStart = true, connectEnd = true;

                if (p.annotations.contains!"recursiveLexer"())
                {
                    enforce(firstEdgeFlags == EdgeFlags.none);

                    foreach (i, s; p.symbols)
                    {
                        NodeID next = g.addNode("x");
                        if (s.isToken)
                            generateTokenGraph(lexerGrammar, g, codepointSets, s, current, next);
                        else
                        {
                            // First nonterminal starts the recursive sub-lexer;
                            // the remaining symbols are handled by it at runtime.
                            g.addEdge(current, next, DCharToken(null,
                                    EdgeFlags.none, p.symbols[i .. 
$]));
                            current = next;
                            break;
                        }
                        current = next;
                    }
                    if (connectStart)
                        g.addEdge(start, startP, DCharToken.invalid);
                    if (connectEnd)
                        g.addEdge(current, endP, DCharToken.invalid);
                }
                else
                {
                    // `firstEdgeFlags` must end up on the first consuming edge of
                    // the production; forward it until a symbol consumes it.
                    EdgeFlags nextEdgeFlags = firstEdgeFlags;
                    foreach (i, s; p.symbols)
                    {
                        EdgeFlags nextEdgeFlags2 = nextEdgeFlags;
                        nextEdgeFlags = EdgeFlags.none;
                        if (s.annotations.contains!"store" || s.annotations.contains!"compareTrue"
                                || s.annotations.contains!"compareFalse")
                        {
                            // Stored/compared subtoken: build its DFA separately and
                            // splice it in with store/compare edge flags.
                            enforce(i || nextEdgeFlags2 == EdgeFlags.none);
                            enforce(s != symbol);
                            //enforce(i + 1 < p.symbols.length);

                            G dfa2;
                            if (s.isToken)
                            {
                                dfa2 = new G();
                                dfa2.start = dfa2.addNode("start");
                                auto endM = dfa2.addNode("end");
                                generateTokenGraph(lexerGrammar, dfa2, codepointSets,
                                        s.toTokenID, dfa2.start, endM, EdgeFlags.none);
                                dfa2.get(endM).results = [
                                    LexerAction(NonterminalID.invalid, true)
                                ];
                            }
                            else
                            {
                                dfa2 = genGraphForNonterminal(lexerGrammar,
                                        codepointSets, s.toNonterminalID, graphByNonterminal);
                            }

                            NodeID next = g.addNode("x");

                            auto prevNodesLength = g.nodes.length;
                            auto startM = g.addCopy(dfa2)[0];
                            auto newNodesLength = g.nodes.length;
                            // Inner edges of a compared subtoken get compareMiddle.
                            foreach (j; prevNodesLength .. newNodesLength)
                            {
                                foreach (ref e; g.nodes[j].edges)
                                {
                                    if (!s.annotations.contains!"store")
                                        e.flags |= EdgeFlags.compareMiddle;
                                }
                            }
                            // Re-create the sub-DFA's start edges from `current`,
                            // marking them as store/compare start (and, for subtokens
                            // matched by a single char, as compare end too).
                            foreach (ref e; g.get(startM).edges)
                            {
                                if (g.get(e.next).edges.length)
                                    g.addEdge(current, e.next, e.symbol, s.annotations.contains!"store"
                                            ? EdgeFlags.storeStart : EdgeFlags.compareStart);
                                if (g.get(e.next).results.length)
                                    g.addEdge(current, next, e.symbol, s.annotations.contains!"store"
                                            ? EdgeFlags.storeStart
                                            : (EdgeFlags.compareStart | (s.annotations.contains!"compareTrue"
                                            ? EdgeFlags.compareEndTrue
                                            : EdgeFlags.compareEndFalse)));
                            }
                            foreach (j; prevNodesLength .. 
newNodesLength)
                            {
                                foreach (e; g.nodes[j].edges)
                                {
                                    if (g.get(e.next).results.length)
                                    {
                                        g.addEdge(g.nodeID(j), next, e.symbol,
                                                s.annotations.contains!"store" ? EdgeFlags.none
                                                : s.annotations.contains!"compareTrue"
                                                ? EdgeFlags.compareEndTrue
                                                : EdgeFlags.compareEndFalse);
                                    }
                                }
                            }
                            foreach (j; prevNodesLength .. newNodesLength)
                            {
                                g.nodes[j].results = [];
                            }

                            if (s.annotations.contains!"store")
                            {
                                // Terminate the stored range with an explicit storeEnd edge.
                                NodeID next2 = g.addNode("x");
                                g.addEdge(next, next2, DCharToken(null, EdgeFlags.storeEnd));
                                next = next2;
                            }

                            current = next;
                        }
                        else if (i == p.symbols.length - 1 && s == symbol
                                && firstEdgeFlags != EdgeFlags.none)
                        {
                            // Tail self-recursion while flags are pending: expand the
                            // symbol once more instead of adding a back edge.
                            done[symbol] = false;
                            NodeID next = g.addNode("x");
                            if (s.isToken)
                                generateTokenGraph(lexerGrammar, g, codepointSets,
                                        s, current, next, EdgeFlags.none);
                            else
                                genSubgraph(s, current, next, EdgeFlags.none);
                            current = next;
                            done[symbol] = true;
                        }
                        else if (s == symbol)
                        {
                            // Self-recursion is only allowed at the edges of the
                            // production (left- or right-regular rules); represent
                            // it with backwards epsilon edges.
                            if (i != 0 && i != p.symbols.length - 1)
                                throw new Exception("not regular");

                            if (i == 0)
                            {
                                g.addEdge(end, current, DCharToken.invalid, EdgeFlags.backwards);
                                connectStart = false;
                            }
                            if (i == p.symbols.length - 1)
                            {
                                g.addEdge(current, start, DCharToken.invalid, EdgeFlags.backwards);
                                connectEnd = false;
                            }
                        }
                        else
                        {
                            NodeID next = g.addNode("x");
                            if (s.isToken)
                                generateTokenGraph(lexerGrammar, g, codepointSets,
                                        s, current, next, nextEdgeFlags2);
                            else
                                genSubgraph(s, current, next, nextEdgeFlags2);
                            current = next;
                        }
                    }
                    enforce(nextEdgeFlags == EdgeFlags.none,
                            text("Empty lexer rule used with flags ", nextEdgeFlags));

                    if (connectStart)
                        g.addEdge(start, startP, DCharToken.invalid);
                    if (connectEnd)
                        g.addEdge(current, endP, DCharToken.invalid);
                }

                // Negative lookaheads: matching one of them after `endP` adds a
                // neg-lookahead result that can cancel the token.
                foreach (x; p.negLookaheads)
                {
                    auto endNegLookahead = g.addNode(
"neglookahead " ~ lexerGrammar.productionString(p) ~ " " ~ text(x));
                    if (x.isToken)
                        generateTokenGraph(lexerGrammar, g, codepointSets, x,
                                endP, endNegLookahead);
                    else
                        genSubgraph(x, endP, endNegLookahead);
                    g.get(endNegLookahead).results = [
                        LexerAction(currentNonterminal, false, true)
                    ];
                }
            }
        }
    }

    genSubgraph(currentNonterminal, start, end);
}

/// Returns the minimized DFA for `currentNonterminal`, building and caching
/// it in `graphByNonterminal`. A null cache entry marks a nonterminal whose
/// construction is currently in progress, so cycles are rejected.
Graph!(DCharToken, LexerAction) genGraphForNonterminal(const EBNFGrammar lexerGrammar, CodepointSet*[] codepointSets,
        NonterminalID currentNonterminal, ref Graph!(DCharToken,
        LexerAction)[NonterminalID] graphByNonterminal)
{
    alias G = Graph!(DCharToken, LexerAction);
    alias NodeID = G.NodeID;

    if (currentNonterminal in graphByNonterminal)
    {
        // A null entry means we are already inside this nonterminal's construction.
        enforce(graphByNonterminal[currentNonterminal]!is null);
        return graphByNonterminal[currentNonterminal];
    }

    graphByNonterminal[currentNonterminal] = null;

    G dfa2 = new G();
    dfa2.start = dfa2.addNode("start");

    NodeID endM = dfa2.addNode("end");
    dfa2.get(endM).results = [LexerAction(currentNonterminal)];

    genGraphForNonterminal(lexerGrammar, dfa2, codepointSets,
            currentNonterminal, dfa2.start, endM, graphByNonterminal);

    dfa2 = dfa2.makeDeterministic(dfa2.start);
    dfa2 = dfa2.minimizeDFA;
    graphByNonterminal[currentNonterminal] = dfa2;
    return dfa2;
}

/// Automaton for the tail of a `recursiveLexer` production; emitted as a
/// separate `lexPartN` helper in the generated lexer.
class SubAutomaton
{
    // Symbols still to match when this sub-automaton is entered.
    immutable(SymbolInstance)[] recursiveSequence;
    Graph!(DCharToken, LexerAction) dfa;
}

/// Writes the lexer state machine for `dfa` into `code`. When `inSubAutomaton`
/// is set, the generated code is the body of a bool-returning `lexPart` helper;
/// otherwise it is the main lexer using goto labels and `lexerend`.
void generateStateMachine(ref CodeWriter code, const EBNFGrammar lexerGrammar,
        Graph!(DCharToken, LexerAction) dfa, bool inSubAutomaton, SubAutomaton[] subAutomatons)
{
    alias G = Graph!(DCharToken, LexerAction);
    alias NodeID = G.NodeID;

    // Per-node edge description collected from the DFA, with codepoint sets
    // merged and the compare-related successor nodes split out.
    class EdgeData
    {
        CodepointSet set;
        const(SymbolInstance)[] recursiveSequence;
        // next: default successor; next2: successor for compare edges that are
        // neither compareEndTrue nor compareEndFalse; next3: successor for
        // compareEndTrue edges.
        G.NodeID next, next2, next3;
        // Results that must be recorded before leaving the node via this edge.
        LexerAction[] saveResult;
        bool isEndEdge;
EdgeFlags flags;
    }

    // Collect and normalize the outgoing edges of every node: edges with the
    // same codepoint set are merged onto one EdgeData, and the successor is
    // classified by its compare flags.
    EdgeData[][G.NodeID] edgesPerNode;
    CodepointSet[G.NodeID] allAllowedSets;
    foreach (n; dfa.nodeIDs)
    {
        EdgeData[] edges;
        CodepointSet allAllowedSet;
        bool hasRecursiveLexer;
        foreach (e; dfa.getEdges(n))
        {
            EdgeFlags flags = e.flags;
            EdgeData x;
            foreach (x2; edges)
                if (e.symbol.codepoints !is null && x2.set == *e.symbol.codepoints)
                {
                    // Only the compare-related flags may differ between merged edges.
                    enforce((flags & ~EdgeFlags.compareFlags) == (x2.flags & ~EdgeFlags.compareFlags),
                            text(n, flagsToString(flags), " ",
                                flagsToString(x2.flags), " ", x2.set));
                    flags |= x2.flags;
                    x = x2;
                }
            if (x !is null)
            {
                if (e.symbol.codepoints !is null)
                    x.set |= *e.symbol.codepoints;
                x.recursiveSequence = e.symbol.recursiveSequence;
                x.flags |= flags;
                if (e.symbol.recursiveSequence.length)
                    hasRecursiveLexer = true;
                allAllowedSet |= x.set;
            }
            else
            {
                x = new EdgeData();
                if (e.symbol.codepoints !is null)
                    x.set = *e.symbol.codepoints;
                x.recursiveSequence = e.symbol.recursiveSequence;
                if (e.symbol.recursiveSequence.length)
                    hasRecursiveLexer = true;
                x.flags = flags;
                allAllowedSet |= x.set;
                edges ~= x;
            }

            // Classify the successor by the edge's compare flags (see EdgeData).
            if (e.flags & EdgeFlags.compareEndTrue)
            {
                x.next3 = e.next;
            }
            else if ((e.flags & EdgeFlags.compareFlags) && !(e.flags & EdgeFlags.compareEndFalse))
            {
                x.next2 = e.next;
            }
            else
            {
                x.next = e.next;
            }
        }
        if (hasRecursiveLexer)
        {
            enforce(edges.length == 1);
        }
        else
        {
            // Catch-all edge for codepoints with no transition: it ends the
            // token (or reports an error) in the generated code.
            edges ~= new EdgeData();
            edges[$ - 1].set = allAllowedSet.inverted;
            edges[$ - 1].isEndEdge = true;
        }
        edgesPerNode[n] = edges;
        allAllowedSets[n] = allAllowedSet;

        /*foreach (e; edges)
        {
            writeln(n, e.next, e.next2, " ", codepointSetToStr(e.set), " ", e.recursiveSequence);
        }*/
    }

    // Merge similar edges
    foreach (n; dfa.nodeIDs)
    {
        if (edgesPerNode[n].length == 1 && edgesPerNode[n][0].recursiveSequence.length)
            continue;

        // Edges with identical successors and flags are combined into one edge
        // labeled with the union of their codepoint sets.
        struct Key
        {
            G.NodeID next;
            G.NodeID next2;
            G.NodeID next3;
            bool isEndEdge;
            EdgeFlags flags;
            int opCmp(const Key other) const
            {
                if (isEndEdge != other.isEndEdge)
                {
                    if (other.isEndEdge)
                        return -1;
                    if (isEndEdge)
                        return 1;
                }
                if (flags < other.flags)
                    return -1;
                if (flags > other.flags)
                    return 1;
                int r = next.opCmp(other.next);
                if (r)
                    return r;
                r = next2.opCmp(other.next2);
                if (r)
                    return r;
                r = next3.opCmp(other.next3);
                return r;
            }
        }

        DCharToken[Key] edgesMap;
        EdgeData[] edges;
        foreach (e; edgesPerNode[n])
        {
            Key key = Key(e.next, e.next2, e.next3, e.isEndEdge, e.flags);
            if (key !in edgesMap)
            {
                edgesMap[key] = DCharToken(new CodepointSet());
            }
            *edgesMap[key].codepoints |= e.set;
        }
        foreach (key; edgesMap.sortedKeys)
        {
            auto e = edgesMap[key];
            EdgeData e2 = new EdgeData();
            e2.next = key.next;
            e2.next2 = key.next2;
            e2.next3 = key.next3;
            e2.isEndEdge = key.isEndEdge;
            e2.flags = key.flags;
            e2.set = *e.codepoints;
            edges ~= e2;
        }
        edgesPerNode[n] = edges;
    }

    LexerAction[][G.NodeID] reachableResultsPerNode;
    foreach (n; dfa.nodeIDs)
    {
        reachableResultsPerNode[n] = reachableResults(dfa, n);
    }

    // Calculate, which results have to be saved, because it's
    // not known if there will be a longer token.
    foreach (n; dfa.nodeIDs)
    {
        foreach (e; edgesPerNode[n])
        {
            if (e.isEndEdge)
                continue;
            bool[LexerAction] resNext;
            bool[LexerAction] reachableNext;
            if (e.next != NodeID.invalid)
            {
                foreach (res; dfa.get(e.next).results)
                {
                    resNext[res] = true;
                }
                foreach (res; reachableResultsPerNode[e.next])
                {
                    reachableNext[res] = true;
                }
            }
            if (e.next2 != NodeID.invalid)
            {
                // With two possible successors a result only survives if it is
                // present for both of them.
                bool[LexerAction] resNext2 = resNext;
                resNext = null;
                foreach (res; dfa.get(e.next2).results)
                {
                    if (e.next == NodeID.invalid || res in resNext2)
                        resNext[res] = true;
                }
                foreach (res; reachableResultsPerNode[e.next2])
                {
                    reachableNext[res] = true;
                }
            }
            foreach (res; dfa.get(n).results)
            {
                if (res.isNegLookahead)
                    continue;
                // Skip results that may still be cancelled by a negative
                // lookahead reachable from here.
                if (LexerAction(res.nonterminal, res.isIgnoreToken, true) in reachableNext)
                    continue;
                if (res !in resNext)
                    e.saveResult.addOnce(res);
            }
        }
    }

    // Shortest input prefix (as a sequence of codepoint sets) leading to each
    // node; only used for comments in the generated code.
    CodepointSet[][G.NodeID] shortestPaths;
    void findShortestPaths(CodepointSet[] path, G.NodeID node)
    {
        if (node !in shortestPaths || path.length < shortestPaths[node].length)
        {
            shortestPaths[node] = path;
            CodepointSet[G.NodeID] edges;
            foreach (e; edgesPerNode[node])
            {
                if (e.isEndEdge)
                    continue;
                if (e.next.id != size_t.max)
                {
                    if (e.next !in edges)
                        edges[e.next] = e.set;
                    else
                        edges[e.next] |= e.set;
                }
                if (e.next2.id != size_t.max)
                {
                    if (e.next2 !in edges)
                        edges[e.next2] = e.set;
                    else
                        edges[e.next2] |= e.set;
                }
            }
            foreach (n; edges.sortedKeys)
            {
                findShortestPaths(path ~ edges[n], n);
            }
        }
    }

    findShortestPaths([], dfa.start);

    // Nodes entered via compare edges; the generated code resets storedStart
    // when such a node is left without finishing the compare.
    bool[G.NodeID] nodesWithCompare;
    foreach (n; dfa.nodeIDs)
    {
        foreach (e; dfa.getEdges(n))
        {
            if (e.flags & (EdgeFlags.compareStart | EdgeFlags.compareMiddle))
                nodesWithCompare[e.next] = true;
        }
    }

    void genCodeForNode(G.NodeID n)
    {
        // Comment listing the results of this node; results that are only
        // reachable (not immediate) are shown in parentheses.
        string resultsComment;
        bool[LexerAction] resultDone;
        foreach (res; dfa.get(n).results)
        {
            resultDone[res] = true;
            resultsComment ~= " " ~ (res.isNegLookahead
                    ? "!" : "") ~ lexerGrammar.getSymbolName(res.nonterminal);
        }
        auto tmpReachableResults = reachableResultsPerNode[n];
        sort!((a, b) => a.nonterminal.id < b.nonterminal.id)(tmpReachableResults);
        foreach (res; tmpReachableResults)
        {
            if (res in resultDone)
                continue;
            resultsComment ~= " (" ~ (res.isNegLookahead
                    ? "!" : "") ~ lexerGrammar.getSymbolName(res.nonterminal) ~ ")";
        }
        code.writeln("//", resultsComment);

        code.write("// path:");
        if (n in shortestPaths)
            foreach (s; shortestPaths[n])
            {
                code.write(" ", codepointSetToStr(s));
            }
        code.writeln();

        if (n == dfa.start)
            code.decIndent.writeln("start:").incIndent;

        auto edges = edgesPerNode[n];
        CodepointSet allAllowedSet = allAllowedSets[n];

        // Chooses the edge whose condition should be generated next, preferring
        // tests with few intervals and few codepoints; ties are broken
        // deterministically by the first differing interval.
        EdgeData findBestNextEdge(CodepointSet codepointsStillPossible)
        {
            EdgeData best;
            foreach (e; edges)
            {
                CodepointSet eSet = e.set & codepointsStillPossible;
                CodepointSet eSet2 = restrictToAssumedCodepoints(e.set, codepointsStillPossible);
                if (eSet.empty)
                    continue;
                if (best is null)
                {
                    best = e;
                    continue;
                }
                CodepointSet bestSet = best.set & codepointsStillPossible;
                CodepointSet bestSet2 = restrictToAssumedCodepoints(best.set,
                        codepointsStillPossible);

                if (eSet2.byInterval.length < bestSet2.byInterval.length)
                    best = e;
                else if (eSet2.byInterval.length > bestSet2.byInterval.length)
                    continue;
                else if (eSet.length < bestSet.length)
                    best = e;
                else if (eSet.length > bestSet.length)
                    continue;
                else if (best.isEndEdge)
                    best = e;
                else
                {
                    auto eIntervals = eSet.byInterval;
                    auto bestIntervals = bestSet.byInterval;
                    while (!eIntervals.empty && !bestIntervals.empty)
                    {
                        if (eIntervals.front[0] < bestIntervals.front[0])
                        {
                            best = e;
                            break;
                        }
                        else if (eIntervals.front[0] > bestIntervals.front[0])
                        {
                            break;
                        }
                        else if (eIntervals.front[1] < bestIntervals.front[1])
                        {
                            best = e;
                            break;
                        }
                        else if (eIntervals.front[1] > bestIntervals.front[1])
                        {
                            break;
                        }
                        eIntervals.popFront;
                        bestIntervals.popFront;
                    }
                }
            }
            return best;
        }

        // Priority used to resolve conflicts between overlapping token matches.
        int getNonterminalPriority(NonterminalID nonterminal)
        {
            if (nonterminal == NonterminalID.invalid)
                return 0;
            if (lexerGrammar.nonterminals[nonterminal].annotations.contains!"ignoreToken"())
                return 1;
            if (lexerGrammar.nonterminals[nonterminal].annotations.contains!"lowPrio"())
                return 2;
            return 3;
        }

        // Removes results cancelled by negative lookaheads and keeps only the
        // highest-priority remaining results.
        const(LexerAction)[] filterActionConflicts(const(LexerAction)[] resultsAll)
        {
            LexerAction[] best;

            const(LexerAction)[] results;
            const(LexerAction)[] resultsNegLookahead;
            foreach (res; resultsAll)
                if (res.isNegLookahead)
                    resultsNegLookahead ~= res;

            foreach (res; resultsAll)
                if (!res.isNegLookahead)
                {
                    bool hasNegLookahead;
                    foreach (res2; resultsNegLookahead)
                        if (res.nonterminal == res2.nonterminal)
                            hasNegLookahead = true;
                    if (!hasNegLookahead)
                        results ~= res;
                }

            if (results.length <= 1)
                return results;

            foreach (res; results)
            {
                int resPriority = getNonterminalPriority(res.nonterminal);
                int bestPriority = (best.length > 0) ? 
getNonterminalPriority(best[0].nonterminal) 918 : 0; 919 if (resPriority > bestPriority) 920 best = [res]; 921 else if (resPriority == bestPriority) 922 best ~= res; // conflict 923 } 924 if (best.length > 1 925 && lexerGrammar.nonterminals[best[0].nonterminal].annotations.contains!"ignoreToken"()) 926 best.length = 1; 927 assert(best.length > 0); 928 return best; 929 } 930 931 void genLexCode(bool ascii) 932 { 933 mixin(genCode("code", q{ 934 $$CodepointSet currentAllowedCodepoints = ascii ? unicode.ASCII : (unicode.ASCII.inverted); 935 $$if (!ascii /*&& !(allAllowedSet & currentAllowedCodepoints).empty*/) { 936 string inputCopyNext = inputCopy; 937 import std.utf; 938 939 dchar currentDchar = decodeFront!(Yes.useReplacementDchar)(inputCopyNext); 940 $$} 941 $$bool needsElse = false; 942 $$bool elseStillAllowed = true; 943 $$CodepointSet codepointsStillPossible = currentAllowedCodepoints; 944 $$while (true) { 945 $$EdgeData e = findBestNextEdge(codepointsStillPossible); 946 $$if (e is null) break; 947 $$if ((e.set & codepointsStillPossible).empty) continue; 948 $$assert(elseStillAllowed); 949 $(needsElse?"else":"") _ 950 $$if ((codepointsStillPossible - e.set).empty) { 951 $("") 952 $$elseStillAllowed = false; 953 $$} else { 954 $(needsElse?" 
":"")if ($(codepointSetToCode(ascii?"currentChar":"currentDchar", e.set, codepointsStillPossible))) 955 $$} 956 { 957 $$if (e.isEndEdge) { 958 $$if (dfa.get(n).results.length > 0) { 959 goto endstate$(n.id); 960 $$} else if ((allAllowedSet.inverted & currentAllowedCodepoints).empty) { 961 assert(false); 962 $$} else { 963 $$if (inSubAutomaton) { 964 if (hasPreviousSymbol) 965 return false; 966 $$} else { 967 if (foundSymbol != SymbolID.max) 968 goto lexerend; 969 $$} 970 else 971 throw lexerException(text("Error unexpected \'", $(ascii?"currentChar":"currentDchar"), "\'"), "$(codepointSetToStr(allAllowedSet).escapeD)", inputCopy.ptr - input.ptr); 972 $$} 973 $$} else { 974 $$needsElse = true; 975 $$if (e.saveResult.length > 0) { 976 $$auto filteredActionConflicts2 = filterActionConflicts(e.saveResult); 977 $$if (filteredActionConflicts2.length > 1) { 978 foundSymbol = SymbolID.max; 979 $$} else { 980 assert(inputCopy.ptr >= input.ptr); 981 foundSymbol = tokenID!"$(lexerGrammar.getSymbolName(filteredActionConflicts2[0].nonterminal).escapeD)"; 982 foundLength = inputCopy.ptr - input.ptr; 983 foundIsIgnore = $(filteredActionConflicts2[0].isIgnoreToken); 984 $$} 985 $$} 986 $$if (e.flags & EdgeFlags.storeEnd) { 987 assert(storedStart != size_t.max); 988 storedString = input[storedStart .. inputCopy.ptr - input.ptr]; 989 storedStart = size_t.max; 990 $$} 991 $$if (e.flags & (EdgeFlags.compareStart | EdgeFlags.storeStart)) { 992 assert(storedStart == size_t.max); 993 storedStart = inputCopy.ptr - input.ptr; 994 $$} 995 $$if (e.flags & EdgeFlags.compareFlags) { 996 assert(storedStart != size_t.max); 997 string compareString = input[storedStart .. $(ascii ? "inputCopy.ptr + 1": "inputCopyNext.ptr") - input.ptr]; 998 bool currentCharCorrect = compareString.length <= storedString.length && $(ascii?"currentChar == storedString[compareString.length - 1]" : "inputCopy[0 .. 
inputCopyNext.ptr - inputCopy.ptr] == storedString[compareString.length - (inputCopyNext.ptr - inputCopy.ptr)..compareString.length]"); 999 $$} 1000 $$if (ascii) { 1001 inputCopy = inputCopy[1 .. $]; 1002 $$} else { 1003 inputCopy = inputCopyNext; 1004 $$} 1005 $$if (e.flags & (EdgeFlags.compareFlags)) { 1006 if (compareString.length == storedString.length && currentCharCorrect) 1007 { 1008 $$if (e.flags & (EdgeFlags.compareEndTrue | EdgeFlags.compareEndFalse)) { 1009 assert(compareString == storedString); 1010 storedStart = size_t.max; 1011 $$} 1012 $$if (e.next3.id == size_t.max) { 1013 throw lexerException(text("Error unexpected \'", $(ascii?"currentChar":"currentDchar"), "\'"), "$(codepointSetToStr(allAllowedSet).escapeD)", inputCopy.ptr - input.ptr); 1014 $$} else { 1015 goto state$(e.next3.id); 1016 $$} 1017 } 1018 else if (compareString.length < storedString.length && currentCharCorrect) 1019 { 1020 $$if (e.next2.id == size_t.max) { 1021 throw lexerException(text("Error unexpected \'", $(ascii?"currentChar":"currentDchar"), "\'"), "$(codepointSetToStr(allAllowedSet).escapeD)", inputCopy.ptr - input.ptr); 1022 $$} else { 1023 goto state$(e.next2.id); 1024 $$} 1025 } 1026 else 1027 { 1028 storedStart = size_t.max; 1029 $$if (e.next.id == size_t.max) { 1030 throw lexerException(text("Error unexpected \'", $(ascii?"currentChar":"currentDchar"), "\'"), "$(codepointSetToStr(allAllowedSet).escapeD)", inputCopy.ptr - input.ptr); 1031 $$} else { 1032 goto state$(e.next.id); 1033 $$} 1034 } 1035 $$} else { 1036 $$if (n in nodesWithCompare) { 1037 storedStart = size_t.max; 1038 $$} 1039 goto state$(e.next.id); 1040 $$} 1041 $$} 1042 } 1043 $$codepointsStillPossible = codepointsStillPossible - e.set; 1044 $$} 1045 $$assert(!elseStillAllowed); 1046 })); 1047 } 1048 1049 code.writeln("{").incIndent; 1050 if (edges.length == 1 && edges[0].recursiveSequence.length) 1051 { 1052 code.writeln("// RecursiveLexer"); 1053 SubAutomaton a; 1054 size_t i; 1055 foreach (i2, a2; 
subAutomatons) 1056 if (a2.recursiveSequence == edges[0].recursiveSequence) 1057 { 1058 a = a2; 1059 i = i2; 1060 break; 1061 } 1062 code.writeln("if (lexPart", i, "(inputCopy, ", inSubAutomaton 1063 ? "hasPreviousSymbol" : "foundSymbol != SymbolID.max", "))"); 1064 code.writeln(" goto state", edges[0].next.id, ";"); 1065 code.writeln("else"); 1066 if (inSubAutomaton) 1067 code.writeln(" return false;"); 1068 else 1069 code.writeln(" goto lexerend;"); 1070 } 1071 else 1072 { 1073 mixin(genCode("code", q{ 1074 if (inputCopy.length == 0) 1075 $$if (dfa.get(n).results.length >= 1) { 1076 $$if (inSubAutomaton) { 1077 $$enforce(dfa.getEdges(n).length == 0); 1078 return true; 1079 $$} else { 1080 goto endstate$(n.id); 1081 $$} 1082 $$} else { 1083 { 1084 $$if (inSubAutomaton) { 1085 if (input.ptr == inputCopy.ptr) 1086 return false; 1087 else if (hasPreviousSymbol) 1088 return false; 1089 $$} else { 1090 if (input.ptr == inputCopy.ptr) 1091 goto lexerend; 1092 else if (foundSymbol != SymbolID.max) 1093 goto lexerend; 1094 $$} 1095 else 1096 throw lexerException("EOF", "$(codepointSetToStr(allAllowedSet).escapeD)", inputCopy.ptr - input.ptr); 1097 } 1098 $$} 1099 $$if (!allAllowedSet.empty) { 1100 char currentChar = inputCopy[0]; 1101 if (currentChar < 0x80) 1102 { 1103 $$genLexCode(true); 1104 } 1105 else 1106 { 1107 $$genLexCode(false); 1108 } 1109 $$} else { 1110 $$if (dfa.get(n).results.length > 0) { 1111 $$if (inSubAutomaton) { 1112 return true; 1113 $$} else { 1114 goto endstate$(n.id); 1115 $$} 1116 $$} else { 1117 $$if (inSubAutomaton) { 1118 if (hasPreviousSymbol) 1119 return false; 1120 $$} else { 1121 if (foundSymbol != SymbolID.max) 1122 goto lexerend; 1123 $$} 1124 else 1125 throw lexerException("Error", "$(codepointSetToStr(allAllowedSet).escapeD)", inputCopy.ptr - input.ptr); 1126 $$} 1127 $$} 1128 })); 1129 } 1130 code.decIndent.writeln("}"); 1131 1132 if (dfa.get(n).results.length > 0) 1133 code.writeln("endstate", n.id, ":"); 1134 1135 void 
genEndState(const(LexerAction)[] results) 1136 { 1137 auto filteredActionConflicts = filterActionConflicts(results); 1138 1139 if (filteredActionConflicts.length == 0) 1140 { 1141 mixin(genCode("code", q{ 1142 { 1143 if (foundSymbol != SymbolID.max) 1144 goto lexerend; 1145 else 1146 throw lexerException("Error", "", inputCopy.ptr - input.ptr); 1147 } 1148 })); 1149 } 1150 else if (filteredActionConflicts.length > 1) 1151 { 1152 auto conflictNames = filteredActionConflicts.map!( 1153 res => lexerGrammar.getSymbolName(res.nonterminal).escapeD).join(' '); 1154 stderr.writeln("Warning: Lexer conflict for ", conflictNames); 1155 mixin(genCode("code", q{ 1156 throw lexerException("conflict $(conflictNames)", "", inputCopy.ptr - input.ptr); 1157 })); 1158 } 1159 else if (filteredActionConflicts.length == 1) 1160 { 1161 mixin(genCode("code", q{ 1162 { 1163 assert(inputCopy.ptr >= input.ptr); 1164 foundSymbol = tokenID!"$(lexerGrammar.getSymbolName(filteredActionConflicts[0].nonterminal).escapeD)"; 1165 foundLength = inputCopy.ptr - input.ptr; 1166 foundIsIgnore = $(filteredActionConflicts[0].isIgnoreToken); 1167 goto lexerend; 1168 } 1169 })); 1170 } 1171 } 1172 1173 void genEndStateRek(const(LexerAction)[] resultsIn, const(LexerAction)[] results) 1174 { 1175 if (resultsIn.length == 0) 1176 genEndState(results); 1177 else if ( 1178 lexerGrammar.nonterminals[resultsIn[0].nonterminal].annotations.contains!"inContextOnly"()) 1179 { 1180 mixin(genCode("code", q{ 1181 if (allowToken!$(lexerGrammar.nonterminals[resultsIn[0].nonterminal].name.tokenDCode)) 1182 $$genEndStateRek(resultsIn[1 .. $], results ~ resultsIn[0]); 1183 else 1184 $$genEndStateRek(resultsIn[1 .. $], results); 1185 })); 1186 } 1187 else 1188 genEndStateRek(resultsIn[1 .. 
$], results ~ resultsIn[0]);
            }

            // Only the main automaton selects tokens here; sub-automatons
            // signal success/failure through their bool return value instead.
            if (!inSubAutomaton && dfa.get(n).results.length > 0)
                genEndStateRek(dfa.get(n).results, []);

            code.writeln();
        }

        // Emit one labeled code block per DFA state.
        foreach (n; dfa.nodeIDs)
        {
            code.decIndent.writeln("state", n.id, ":").incIndent;
            genCodeForNode(n);
        }
    }

/**
Drop all outgoing edges of DFA states whose results consist only of tokens
annotated with @minimalMatch, so the generated lexer stops at the first
(shortest) possible match for those tokens instead of consuming more input.
*/
void removeMinimalMatchEdges(Graph!(DCharToken, LexerAction) g, EBNFGrammar lexerGrammar)
{
    foreach (n; g.nodeIDs)
    {
        if (g.get(n).results.length == 0)
            continue;
        bool allMinimalMatch = true;
        foreach (res; g.get(n).results)
            if (!lexerGrammar.nonterminals[res.nonterminal].annotations.contains!"minimalMatch")
                allMinimalMatch = false;
        if (allMinimalMatch)
        {
            g.get(n).edges = [];
        }
    }
}

/**
Generate the complete D source code of a lexer module for `lexerGrammar`.

Params:
    lexerGrammar = grammar describing the tokens to lex
    modulename = module name written into the generated code
    dfafilename = if non-empty, the lexer DFA is also dumped as GraphViz to this file

Returns: the source text of the generated module
*/
const(char)[] createLexerCode(EBNFGrammar lexerGrammar, string modulename, string dfafilename)
{
    CodeWriter code;
    code.indentStr = " ";

    // NOTE(review): allTokensCode appears to be unused in this function;
    // confirm before removing.
    string allTokensCode = createAllTokensCode(lexerGrammar);
    string allNonterminalsCode = createAllNonterminalsCode(lexerGrammar);

    alias G = Graph!(DCharToken, LexerAction);
    alias NodeID = G.NodeID;

    G graph = new G();

    string symbolName(Symbol s)
    {
        return lexerGrammar.getSymbolName(s);
    }

    string tokenName(DCharToken s)
    {
        return codepointSetToStr(*s.codepoints);
    }

    // Builds the edge label for the GraphViz dump of the DFA.
    string edgeText(G.Edge[] edges)
    {
        string r;

        CodepointSet set;

        bool hasEmpty;

        foreach (e; edges)
        {
            if (e.symbol.recursiveSequence.length)
                r ~= text("RecursiveLexer");
            else if (e.symbol.codepoints is null)
                hasEmpty = true;
            else
                set |= *e.symbol.codepoints;
        }

        if (hasEmpty)
        {
            r ~= "ε";
        }

        if (!hasEmpty || !set.empty)
        {
            if (r.length > 0)
                r ~= "\n";

            r ~= codepointSetToStr(set);
        }
        EdgeFlags combinedFlags;
        foreach (e; edges)
        {
            combinedFlags |= e.flags | e.symbol.specialFlags;
        }
        if (combinedFlags & EdgeFlags.storeStart)
            r ~= text("\nStoreStart");
        if (combinedFlags & EdgeFlags.storeEnd)
            r ~= text("\nStoreEnd");
        if (combinedFlags & EdgeFlags.compareStart)
            r ~= text("\nCompareStart");
        if (combinedFlags & EdgeFlags.compareEndTrue)
            r ~= text("\nCompareEndTrue");
        if (combinedFlags & EdgeFlags.compareEndFalse)
            r ~= text("\nCompareEndFalse");
        if (combinedFlags & EdgeFlags.compareMiddle)
            r ~= text("\nCompareMiddle");
        return r;
    }

    // Builds the node label for the GraphViz dump: the node's own results
    // first, then results only reachable from this node in parentheses.
    string resultsText(G graph, G.NodeID node)
    {
        string r;

        string resultName(LexerAction res)
        {
            string r;
            if (res.nonterminal != NonterminalID.invalid)
            {
                if (res.isNegLookahead)
                    r ~= "!";
                r ~= text(lexerGrammar.getSymbolName(res.nonterminal));
                if (
                    lexerGrammar.nonterminals[res.nonterminal].annotations.contains!"ignoreToken"())
                    r ~= " IgnoreToken";
            }
            else
                r ~= "invalid";

            if (res.isIgnoreToken)
                r ~= " ignore";
            return r;
        }

        bool[LexerAction] done;
        foreach (res; graph.get(node).results)
        {
            done[res] = true;
            r ~= text(resultName(res), "\n");
        }
        foreach (res; reachableResults(graph, node))
        {
            if (res in done)
                continue;
            r ~= text("(", resultName(res), ")\n");
        }

        return r;
    }

    // codepointSets is maintained as a partition: all stored sets stay
    // pairwise disjoint (verified by the assert block further below).
    CodepointSet*[] codepointSets;

    // Insert `set` into the partition, splitting any existing set that
    // partially overlaps it into intersection and difference parts.
    void addCodepointSet(CodepointSet set)
    {
        for (size_t i = 0; i < codepointSets.length; i++)
        {
            if (set == *codepointSets[i])
                return;
            if (!(set & *codepointSets[i]).empty)
            {
                CodepointSet both = set & *codepointSets[i];
                CodepointSet first = *codepointSets[i] - set;
                set = set - *codepointSets[i];
                *codepointSets[i] = both;
                if (!first.empty)
                    codepointSets ~= new CodepointSet(first);
            }
        }
        if (!set.empty)
        {
            codepointSets ~= new CodepointSet(set);
        }
    }

    // Derive codepoint sets from a symbol name: "..." string literals add
    // one single-codepoint set per character, [...] character classes add
    // the parsed set; '...' names are not supported.
    void updateCodepointSets(string name)
    {
        if (name.endsWith("\""))
        {
            assert(name[$ - 1] == name[0], name);
            name = name[1 .. $ - 1];
            foreach (d; name.byDchar.byUnescapedDchar)
            {
                addCodepointSet(CodepointSet(d.c, d.c + 1));
            }
        }
        else if (name.endsWith("]"))
        {
            assert(name[0] == '[', name);
            name = name[1 .. $ - 1];
            CodepointSet set = codepointSetFromStr(name);
            addCodepointSet(set);
        }
        else if (name.endsWith("\'"))
        {
            throw new Exception("not implemented");
        }
    }

    foreach (n; lexerGrammar.startNonterminals)
    {
        string name = lexerGrammar.nonterminals[n.nonterminal].name;
        updateCodepointSets(name);
    }
    foreach (p; lexerGrammar.productions)
    {
        foreach (i, s; p.symbols)
        {
            string name = lexerGrammar.getSymbolName(s);
            updateCodepointSets(name);
        }
    }

    // Sanity check: the collected codepoint sets must be pairwise disjoint.
    {
        CodepointSet combined;
        foreach (set; codepointSets)
        {
            assert((combined & *set).empty);
            combined |= *set;
        }
    }

    graph.start = graph.addNode("start");

    Graph!(DCharToken, LexerAction)[NonterminalID] graphByNonterminal;

    // Add NFA fragments for all @ignoreToken nonterminals.
    foreach (n; lexerGrammar.nonterminals.allIDs())
    {
        if (!lexerGrammar.nonterminals[n].annotations.contains!"ignoreToken"())
            continue;

        string name = lexerGrammar.nonterminals[n].name;
        NodeID startM = graph.addNode("start " ~ name);
        graph.addEdge(graph.start, startM, DCharToken.invalid);
        NodeID endM = graph.addNode("end");
        graph.get(endM).results = [LexerAction(n, true)];

        genGraphForNonterminal(lexerGrammar, graph, codepointSets, n, startM,
                endM, graphByNonterminal);
    }

    // Add NFA fragments for the start nonterminals (the regular tokens).
    foreach (n; lexerGrammar.startNonterminals)
    {
        string name = lexerGrammar.nonterminals[n.nonterminal].name;
        NodeID startM = graph.addNode("start " ~ name);
        graph.addEdge(graph.start, startM, DCharToken.invalid);
        NodeID endM = graph.addNode("end");
        graph.get(endM).results = [LexerAction(n.nonterminal)];

        genGraphForNonterminal(lexerGrammar, graph, codepointSets,
                n.nonterminal, startM, endM, graphByNonterminal);
    }

    foreach (n; graph.nodeIDs)
    {
        graph.get(n).name = text(n.id);
    }

    graph = graph.makeDeterministic(graph.start);

    // Fold edges that carry only special flags (no codepoints, no recursive
    // sequence) into their successor's codepoint edges, merging the flags.
    foreach (n; graph.nodeIDs)
    {
        G.Node node = graph.get(n);
        G.Edge[] edges = node.edges;
        node.edges = [];

        foreach (e; edges)
        {
            // Every edge carries exactly one of: codepoints, special flags,
            // or a recursive sequence.
            assert((e.symbol.codepoints !is null) + (
                e.symbol.specialFlags != EdgeFlags.none) + (
                e.symbol.recursiveSequence.length != 0) == 1);
            if (e.symbol.codepoints is null && e.symbol.recursiveSequence.length == 0)
            {
                assert(e.next != n);
                foreach (e2; graph.get(e.next).edges)
                {
                    assert(e2.symbol.codepoints !is null);
                    graph.addEdge(n, e2.next, e2.symbol,
                        e2.flags | e.symbol.specialFlags, e2.extra);
                }
            }
            else
            {
                node.edges ~= e;
            }
        }
    }
    graph = graph.makeDeterministic(graph.start);

    // Results on the start node mean the token can match the empty string;
    // warn about all of them except ignore tokens.
    foreach (res; graph.get(graph.start).results)
    {
        if (lexerGrammar.nonterminals[res.nonterminal].annotations.contains!"ignoreToken"())
            continue;
        stderr.writeln("Warning: Token ",
            lexerGrammar.getSymbolName(res.nonterminal), " could be empty");
    }

    // Make sure, no token can be empty.
    // Replace the start node: the new one copies only the outgoing edges of
    // the old start node, dropping any results attached to it, so an empty
    // match can never be accepted.
    {
        auto oldStart = graph.start;
        graph.start = graph.addNode("start");
        graph.get(graph.start).edges = graph.get(oldStart).edges.dup;
        graph = graph.makeDeterministic(graph.start);
    }

    graph = graph.minimizeDFA;

    // @minimalMatch tokens stop at the first accepting state; cutting their
    // outgoing edges enforces that, then minimize again.
    graph.removeMinimalMatchEdges(lexerGrammar);
    graph = graph.minimizeDFA;

    if (dfafilename.length)
        graph.toGraphViz2!(edgeText, resultsText)(dfafilename);

    SubAutomaton[] subAutomatons;
    // Collect all sub-automatons referenced by recursive-sequence edges in
    // `graph`, deduplicated by their recursive sequence.
    void addSubAutomatons(G graph)
    {
        foreach (n; graph.nodeIDs)
        {
            foreach (e; graph.getEdges(n))
            {
                if (e.symbol.recursiveSequence.length)
                {
                    size_t i;
                    for (i = 0; i < subAutomatons.length; i++)
                    {
                        if (subAutomatons[i].recursiveSequence == e.symbol.recursiveSequence)
                            break;
                    }
                    if (i >= subAutomatons.length)
                    {
                        SubAutomaton a = new SubAutomaton;
                        a.recursiveSequence = e.symbol.recursiveSequence;
                        subAutomatons ~= a;
                    }
                }
            }
        }
    }

    addSubAutomatons(graph);

    // Helper grammar that gets one extra nonterminal $m_<i> per sub-automaton.
    EBNFGrammar grammar2 = new EBNFGrammar(null);
    grammar2.nonterminals = lexerGrammar.nonterminals.dup;
    grammar2.tokens = lexerGrammar.tokens.dup;
    grammar2.productionsData = lexerGrammar.productionsData;

    // Note: building the DFA for one sub-automaton can discover further
    // sub-automatons, so this loop may grow subAutomatons while iterating.
    for (size_t i = 0; i < subAutomatons.length; i++)
    {
        NonterminalID n = grammar2.nonterminals.id(text("$m_", i));
        grammar2.addProduction(new Production(n, subAutomatons[i].recursiveSequence));

        auto graph2 = genGraphForNonterminal(grammar2, codepointSets, n, graphByNonterminal);

        subAutomatons[i].dfa = graph2;
        addSubAutomatons(subAutomatons[i].dfa);
    }

    // Emit the generated lexer module. Lines starting with $$ run at
    // generation time; everything else is template text written to `code`.
    mixin(genCode("code", q{
        // Generated with DParserGen.
        module $(modulename);
        import dparsergen.core.grammarinfo;
        import dparsergen.core.parseexception;
        import std.conv;
        import std.string;
        import std.typecons;

        enum SymbolID startTokenID = $(lexerGrammar.startTokenID);
        static assert(allNonterminalTokens.length < SymbolID.max - startTokenID);
        enum SymbolID endTokenID = startTokenID + allNonterminalTokens.length;

        SymbolID getTokenID(string tok)
        {
            foreach (i; 0 .. allNonterminalTokens.length)
                if (allNonterminalTokens[i].name == tok)
                    return cast(SymbolID)(startTokenID + i);
            return SymbolID.max;
        }

        struct Lexer(Location, bool includeIgnoredTokens = false)
        {
            alias LocationDiff = typeof(Location.init - Location.init);
            string input;
            this(string input, Location startLocation = Location.init)
            {
                this.input = input;
                this.front.currentLocation = startLocation;
                popFront;
            }

            enum tokenID(string tok) = getTokenID(tok);
            string tokenName(size_t id)
            {
                return allNonterminalTokens[id - startTokenID].name;
            }

            static struct Front
            {
                string content;
                SymbolID symbol;
                Location currentLocation;
                static if (includeIgnoredTokens)
                    bool isIgnoreToken;

                Location currentTokenEnd()
                {
                    return currentLocation + LocationDiff.fromStr(content);
                }
            }

            Front front;
            bool empty;

            $$foreach (t; lexerGrammar.nonterminals.allIDs) {
                $$if (lexerGrammar.nonterminals[t].annotations.contains!"inContextOnly"()) {
                    // InContextOnly: $(lexerGrammar.nonterminals[t].name)
                    bool allowTokenId$(t.id);
                    bool allowToken(string tok)() if (tok == $(lexerGrammar.nonterminals[t].name.tokenDCode))
                    {
                        return allowTokenId$(t.id);
                    }
                    void allowToken(string tok)(bool b) if (tok == $(lexerGrammar.nonterminals[t].name.tokenDCode))
                    {
                        allowTokenId$(t.id) = b;
                    }

                $$}
            $$}
            void popFront()
            {
                input = input[front.content.length .. $];
                front.currentLocation = front.currentLocation + LocationDiff.fromStr(front.content);

                popFrontImpl();
            }

            void popFrontImpl()
            {
                size_t foundLength;
                SymbolID foundSymbol = SymbolID.max;
                bool foundIsIgnore;

                string inputCopy = input;

                size_t storedStart = size_t.max;
                string storedString;

                goto start;

                $$generateStateMachine(code, lexerGrammar, graph, false, subAutomatons);
                lexerend:

                if (foundSymbol != SymbolID.max)
                {
                    if (foundLength == 0)
                    {
                        if (!inputCopy.empty)
                            throw lexerException("no token", null, inputCopy.ptr + 1 - input.ptr);
                        else
                        {
                            front.content = "";
                            front.symbol = SymbolID(0);
                            static if (includeIgnoredTokens)
                                front.isIgnoreToken = false;
                            empty = true;
                            return;
                        }
                    }
                    static if (!includeIgnoredTokens)
                    {
                        if (foundIsIgnore)
                        {
                            front.currentLocation = front.currentLocation + LocationDiff.fromStr(input[0 .. foundLength]);
                            input = input[foundLength .. $];
                            inputCopy = input;
                            foundLength = 0;
                            foundIsIgnore = false;
                            foundSymbol = SymbolID.max;
                            storedStart = size_t.max;
                            storedString = null;
                            goto start;
                        }
                    }
                    front.content = input[0 .. foundLength];
                    front.symbol = foundSymbol;
                    static if (includeIgnoredTokens)
                        front.isIgnoreToken = foundIsIgnore;
                    empty = false;
                    return;
                }
                else if (input.length == 0)
                {
                    front.content = "";
                    front.symbol = SymbolID(0);
                    static if (includeIgnoredTokens)
                        front.isIgnoreToken = false;
                    empty = true;
                    return;
                }
                else
                    throw lexerException("no token", null, 1);
            }

            $$foreach (i, a; subAutomatons) {
                // $(lexerGrammar.symbolInstanceToString(a.recursiveSequence))
                private bool lexPart$(i)(ref string inputCopy, bool hasPreviousSymbol)
                {
                    $$generateStateMachine(code, grammar2, subAutomatons[i].dfa, true, subAutomatons);
                    assert(false);
                }

            $$}
            SingleParseException!Location lexerException(string errorText, string expected, size_t len,
                    string file = __FILE__, size_t line = __LINE__)
            {
                string str = errorText;
                if (expected.length)
                    str ~= ", expected " ~ expected;
                return new SingleParseException!Location(str, front.currentLocation, front.currentLocation, file, line);
            }
        }

        immutable allNonterminalTokens = [
            $(allNonterminalsCode)];
    }));

    return code.data;
}