module grammardgen;
import dparsergen.core.dynamictree;
import dparsergen.core.location;
import dparsergen.core.nodetype;
import dparsergen.core.utils;
import std.algorithm;
import std.array;
import std.conv;
import std.process;
import std.stdio;
import std.string;

static import grammarddoc_lexer;

alias Tree = DynamicParseTree!(LocationAll, LocationRangeStartEnd);

/**
Appends the content of every token below `tree` to `output`, in tree order.

Params:
    tree = subtree to flatten; `null` is accepted and contributes nothing
    output = string the token contents are appended to
*/
void printTree(Tree tree, ref string output)
{
    if (tree is null)
        return;
    if (tree.isToken)
        output ~= tree.content;

    foreach (c; tree.childs)
        printTree(c, output);
}

/// One symbol on the right-hand side of a generated production.
struct Symbol
{
    /// Text of the symbol (token text or nonterminal name).
    string name;
    /// True if the symbol is emitted as a quoted token.
    bool isToken;
    /// True if the symbol is emitted with a trailing `?`.
    bool hasOpt;
}

/// Returns the literal token text a macro stands for, or `null` if the macro is not such a shortcut.
private string tokenTextForMacro(string macroName)
{
    switch (macroName)
    {
    case "LPAREN":
        return "(";
    case "RPAREN":
        return ")";
    case "CODE_LCURL":
        return "{";
    case "CODE_RCURL":
        return "}";
    case "CODE_PERCENT":
        return "%";
    case "BACKTICK":
        return "`";
    case "CODE_AMP", "AMP":
        return "&";
    default:
        return null;
    }
}

/// Returns the nonterminal name a macro is replaced with, or `null` if the macro has no fixed replacement.
private string nonterminalNameForMacro(string macroName)
{
    switch (macroName)
    {
    case "IDENTIFIER":
        return "Identifier";
    case "EXPRESSION":
        return "Expression";
    case "ASSIGNEXPRESSION":
        return "AssignExpression";
    case "PSCURLYSCOPE":
        return "NonEmptyOrScopeBlockStatement";
    case "PSSCOPE":
        return "ScopeStatement";
    case "PSSEMI_PSCURLYSCOPE_LIST":
        return "ScopeStatementList";
    case "PS0":
        return "NoScopeNonEmptyStatement";
    case "PSSEMI":
        return "NoScopeStatement";
    case "PSSEMI_PSCURLYSCOPE":
        return "Statement";
    default:
        return null;
    }
}

/// Returns the index just after the first "Comma" node in `content`, or 0 if there is none.
private size_t indexAfterFirstComma(Tree[] content)
{
    foreach (i, c; content)
        if (c.name == "Comma")
            return i + 1;
    return 0;
}

/// Returns the index just after the last "Comma" node in `content`, or 0 if there is none.
private size_t indexAfterLastComma(Tree[] content)
{
    size_t result = 0;
    foreach (i, c; content)
        if (c.name == "Comma")
            result = i + 1;
    return result;
}

/// Calls `printSymbol` for every tree in `trees`, in order.
private void printSymbols(Tree[] trees, ref Symbol[] output)
{
    foreach (c; trees)
        printSymbol(c, output);
}

/**
Translates one node of a grammar rule into symbols, extending the last
entry of `output` or appending new entries.

Three kinds of nodes are distinguished by `tree.name`:
"WS" starts a new symbol (unless the current one is still empty),
"Macro" is dispatched on the macro name in `tree.childs[1].content` with
its arguments in `tree.childs[2].childs`, and everything else is flattened
to text and split on whitespace.

Params:
    tree = node to translate; must not be `null`
    output = symbols of the current alternative; must contain at least one entry
*/
void printSymbol(Tree tree, ref Symbol[] output)
{
    if (tree.name == "WS")
    {
        // Whitespace separates symbols; the new symbol inherits the token flag.
        if (output[$ - 1].name.length)
        {
            output.length++;
            output[$ - 1].isToken = output[$ - 2].isToken;
        }
        return;
    }

    if (tree.name != "Macro")
    {
        string names;
        printTree(tree, names);
        foreach (i, name; names.split())
        {
            if (name.length >= 3 && name[0] == '`' && name[$ - 1] == '`')
            {
                // Text in backticks is a token; the backticks are dropped.
                if (output[$ - 1].name.length)
                {
                    output.length++;
                }
                output[$ - 1].isToken = true;
                name = name[1 .. $ - 1];
            }
            else if (i)
            {
                output.length++;
                output[$ - 1].isToken = output[$ - 2].isToken;
            }
            output[$ - 1].name ~= name;
        }
        return;
    }

    // Macro arguments with leading whitespace removed.
    Tree[] content = tree.childs[2].childs;
    while (content.length && content[0].name == "WS")
        content = content[1 .. $];
    string macroName = tree.childs[1].content;

    switch (macroName)
    {
    case "D", "B":
        output[$ - 1].isToken = true;
        printSymbols(content, output);
        break;
    case "D].":
        // Special case: produces the two tokens "]" and ".".
        output[$ - 1].isToken = true;
        output[$ - 1].name ~= "]";
        output.length++;
        output[$ - 1].isToken = true;
        output[$ - 1].name ~= ".";
        break;
    case "I":
        printSymbols(content, output);
        break;
    case "RELATIVE_LINK2", "GLINK2", "LINK2", "GLINK", "GSELF":
        // Only the part after the first comma (or everything, if there is no comma) is used.
        printSymbols(content[indexAfterFirstComma(content) .. $], output);
        break;
    case "DDSUBLINK":
        // Only the part after the last comma is used.
        printSymbols(content[indexAfterLastComma(content) .. $], output);
        break;
    case "GLINK_LEX":
        // Only the first argument node is used; an empty argument list yields nothing.
        printSymbols(content[0 .. content.length ? 1 : 0], output);
        break;
    default:
        string tokenText = tokenTextForMacro(macroName);
        string replacement = nonterminalNameForMacro(macroName);
        if (tokenText.length)
        {
            output[$ - 1].name ~= tokenText;
            output[$ - 1].isToken = true;
        }
        else if (replacement.length)
        {
            output[$ - 1].name ~= replacement;
        }
        else
        {
            // Unknown macro: keep it in the symbol name as "$(NAME...)".
            output[$ - 1].name ~= "$(" ~ macroName;
            printSymbols(content, output);
            output[$ - 1].name ~= ")";
        }
        break;
    }
}

/// State collected while analyzing grammar blocks.
class Context
{
    /// Generated production text by nonterminal name.
    string[string] nonterminals;
    /// Nonterminal names in the order they were first seen.
    string[] nonterminalsOrder;
    /// Names of nonterminals that were emitted with the `token` prefix.
    bool[string] tokens;
    /// True while the lexer grammar is processed.
    bool isLexer;
}

/// Tokens that are written as one word in the input but split into several tokens in the output.
private string[][string] tokensToSplit;

static this()
{
    tokensToSplit = [
        "ST(0)": ["ST", "(", "0", ")"],
        "ST(1)": ["ST", "(", "1", ")"],
        "ST(2)": ["ST", "(", "2", ")"],
        "ST(3)": ["ST", "(", "3", ")"],
        "ST(4)": ["ST", "(", "4", ")"],
        "ST(5)": ["ST", "(", "5", ")"],
        "ST(6)": ["ST", "(", "6", ")"],
        "ST(7)": ["ST", "(", "7", ")"],
        "!is": ["!", "is"],
        "!in": ["!", "in"],
        ");": [")", ";"],
        "scope(success)": ["scope", "(", "success", ")"],
        "scope(exit)": ["scope", "(", "exit", ")"],
        "scope(failure)": ["scope", "(", "failure", ")"],
        "C++": ["C", "++"],
        "C++,": ["C", "++", ","],
        "Objective - C": ["Objective", "-", "C"],
        "( )": ["(", ")"],
    ];
}

/// Collects the alternatives (one `Symbol[]` per line) of a rule body and trims empty trailing entries.
private Symbol[][] collectAlternatives(Tree[] trees)
{
    Symbol[][] symbols = [[Symbol()]];

    void findSymbols(Tree[] trees)
    {
        foreach (c; trees)
        {
            if (c.name == "NL")
            {
                // A newline starts a new alternative unless the current one is still empty.
                if (symbols[$ - 1].length > 1 || symbols[$ - 1][0].name.length)
                {
                    symbols ~= [Symbol()];
                }
            }
            else if (c.name == "WS")
            {
                if (symbols[$ - 1][$ - 1].name.length)
                    symbols[$ - 1].length++;
            }
            else if (c.name == "Macro" && c.childs[1].content == "OPT")
            {
                symbols[$ - 1][$ - 1].hasOpt = true;
            }
            else if (c.name == "Macro" && c.childs[1].content == "LEGACY_LNAME2")
            {
                // Ignored.
            }
            else if (c.name == "Macro" && c.childs[1].content == "MULTICOLS")
            {
                // Skip everything up to and including the first comma, then recurse.
                Tree[] trees2 = c.childs[2].childs;
                while (trees2.length)
                {
                    if (trees2[0].name == "Comma")
                    {
                        trees2 = trees2[1 .. $];
                        break;
                    }
                    trees2 = trees2[1 .. $];
                }
                findSymbols(trees2);
            }
            else
            {
                printSymbol(c, symbols[$ - 1]);
            }
        }
    }

    findSymbols(trees);

    foreach (ref output; symbols)
    {
        if (output[$ - 1].name.length == 0)
            output.length--;
        // A trailing symbol of the form "(...)" is dropped.
        if (output.length && output[$ - 1].name.startsWith("(") && output[$ - 1].name.endsWith(")"))
            output.length--;
    }
    if (symbols[$ - 1].length == 0)
        symbols.length--;
    return symbols;
}

/// Returns a copy of `alternative` in which every token listed in `tokensToSplit` is replaced by its parts.
private Symbol[] splitCompoundTokens(Symbol[] alternative)
{
    Symbol[] result;
    foreach (s; alternative)
    {
        if (s.isToken && s.name in tokensToSplit)
        {
            assert(!s.hasOpt);
            foreach (x; tokensToSplit[s.name])
            {
                result ~= Symbol(x, true);
            }
        }
        else
            result ~= s;
    }
    return result;
}

/// Applies the hard-coded per-rule rewrites that change a whole alternative of the rule `name`.
private void applyAlternativeOverrides(string name, ref Symbol[] output)
{
    if (name == "NestingBlockComment")
    {
        output.length++;
        output[$ - 1] = output[$ - 2];
        output[$ - 2].isToken = false;
        output[$ - 2].name = "/(+*)/";
    }
    if (name == "NestingBlockCommentCharacter" && output[0].name == "NestingBlockComment")
    {
        output.length++;
        output[1] = output[0];
        output[0].isToken = false;
        output[0].name = "/(\\/*)/";
    }
    if (name == "NestingBlockCommentCharacters" && output.length == 1)
    {
        output[0].isToken = false;
        output[0].name = "eps";
    }
    if ((name == "DoubleQuotedCharacters" || name == "WysiwygCharacters"
            || name == "HexStringChars") && output.length == 1)
    {
        output[0].isToken = false;
        output[0].name = "eps";
    }
}

/// Applies the hard-coded per-rule rewrites to one symbol of an alternative with `alternativeLength` symbols.
private void applySymbolOverrides(string name, ref Symbol s, size_t alternativeLength)
{
    if (s.name == ".." || s.name == "," || s.name == "=")
        s.isToken = true;
    if (name == "TraitsKeyword")
        s.isToken = true;

    if (name == "BlockComment" && s.name == "Characters")
    {
        s.isToken = false;
        s.name = "/(([^*]|[*][*]*[^\\/*])*[*]*)/";
    }
    if (name == "LineComment" && s.name == "Characters")
    {
        s.isToken = false;
        s.name = "/([^\\n\\r\\u000D\\u000A\\u2028\\u2029\\0\\x1a]*)/";
    }
    if (name == "NestingBlockCommentCharacter" && s.name == "Character")
    {
        s.isToken = false;
        s.name = "/[^+\\/]|++*[^+\\/]|\\/\\/*[^+\\/]/";
    }
    if (name == "DoubleQuotedCharacter" && s.name == "Character")
    {
        s.isToken = false;
        s.name = "/[^\\\"\\\\]/";
    }
    if (name == "SingleQuotedCharacter" && s.name == "Character")
    {
        s.isToken = false;
        s.name = "/[^\\\'\\\\]/";
    }
    if (name == "FloatLiteral" && s.name == "Integer")
    {
        s.isToken = false;
        s.name = "DecimalInteger"; // TODO: only this?
    }
    if (name == "DeclDefs" && alternativeLength == 2)
    {
        // Swaps the two symbols' names: "DeclDefs" becomes "DeclDef" and the other becomes "DeclDefs".
        if (s.name == "DeclDefs")
            s.name = "DeclDef";
        else
            s.name = "DeclDefs";
    }
}

/// Renders one symbol as it appears in the generated production, including the leading space.
private string renderSymbol(Symbol s, Context context, bool isOnlySymbol)
{
    string code;
    if (s.isToken)
    {
        if (!context.isLexer && s.name.length == 1 && s.name[0] >= '0' && s.name[0] <= '9')
        {
            // Single digits in the parser grammar are matched as integer literals with that text.
            code ~= " IntegerLiteral>>\"" ~ s.name ~ "\"";
        }
        else
        {
            string tname = s.name;
            // Six-character names starting with "\u" are emitted without escaping.
            if (!(tname.length == 6 && tname.startsWith("\\u")))
                tname = tname.escapeD;
            code ~= " \"" ~ tname ~ "\"";
        }
    }
    else
    {
        if (!context.isLexer && isOnlySymbol && s.name[0] != '/'
                && s.name != "eps" && s.name !in context.tokens)
            code ~= " <" ~ s.name;
        else
            code ~= " " ~ s.name;
    }
    if (s.hasOpt)
        code ~= "?";
    return code;
}

/**
Generates the production for one rule and stores it in `context`.

Params:
    trees = nodes of the rule; `trees[0]` must be the "GNAME" macro with the
            rule name, followed by the header up to the first newline and
            then the alternatives
    context = receives the production text (a later definition of the same
              name replaces the earlier text) and the token flag
    isToken = whether the rule should be emitted as a token; overridden for
              some rule names
*/
void analyzeNonterminal(Tree[] trees, Context context, bool isToken)
{
    string name;
    assert(trees[0].name == "Macro");
    assert(trees[0].childs[1].content == "GNAME");
    foreach (c; trees[0].childs[2].childs)
        if (c.name == "Text")
            name = c.childs[0].content;
    trees = trees[1 .. $];

    // Skip the rest of the header line (":" and LEGACY_LNAME2) up to and including the newline.
    while (true)
    {
        assert(trees.length);
        if (trees[0].name == "Text" && trees[0].childs[0].content == ":")
        {
            trees = trees[1 .. $];
        }
        else if (trees[0].name == "Macro" && trees[0].childs[1].content == "LEGACY_LNAME2")
        {
            trees = trees[1 .. $];
        }
        else if (trees[0].name == "NL")
        {
            trees = trees[1 .. $];
            break;
        }
        else
        {
            assert(false, text(trees[0]));
        }
    }

    Symbol[][] symbols = collectAlternatives(trees);

    if (name == "Register" || name == "Register64")
    {
        // Every symbol becomes its own alternative.
        Symbol[][] flattened;
        foreach (output; symbols)
        {
            foreach (s; output)
                flattened ~= [s];
        }
        symbols = flattened;
    }

    foreach (ref output; symbols)
        output = splitCompoundTokens(output);

    if (name.endsWith("String"))
        isToken = true;
    if (name == "Token" || name == "Keyword" || name == "StringLiteral" || name == "TokenString")
        isToken = false;

    string code;
    if (isToken)
        code ~= "token ";
    code ~= name;
    if (name.endsWith("Comment") || name == "SpecialTokenSequence"
            || name == "EndOfLine" || name == "WhiteSpace")
        code ~= " @IgnoreToken";
    if (name == "Module")
        code ~= " @Start";
    if (name == "Identifier")
        code ~= " @LowPrio";
    if (isToken)
        context.tokens[name] = true;
    code ~= "\n";

    foreach (i, ref output; symbols)
    {
        // Drop the alternative "ParameterAttributes = ParameterAttributes".
        if (name == "ParameterAttributes" && output.length == 1
                && output[0].name == "ParameterAttributes")
            continue;

        applyAlternativeOverrides(name, output);

        code ~= i ? " |" : " =";
        foreach (ref s; output)
        {
            applySymbolOverrides(name, s, output.length);
            code ~= renderSymbol(s, context, output.length == 1);
        }
        code ~= "\n";
    }
    code ~= " ;\n";

    // The first definition fixes the position in the output; the last definition fixes the text.
    if (name !in context.nonterminals)
        context.nonterminalsOrder ~= name;
    context.nonterminals[name] = code;
}

/**
Splits the content of a grammar macro at every "GNAME" macro and analyzes
each resulting rule.

Only the first rule of the block is passed `context.isLexer` as its token
flag; all following rules are passed `false`.

Params:
    tree = grammar macro node whose `childs[2]` holds the rules; `null` is ignored
    context = receives the generated productions
*/
void analyzeGrammar(Tree tree, Context context)
{
    if (tree is null)
        return;
    Tree[] content = tree.childs[2].childs;
    size_t start = size_t.max;
    bool isToken = context.isLexer;
    foreach (i, c; content)
    {
        if (c.name == "Macro" && c.childs[1].content == "GNAME")
        {
            if (start != size_t.max)
            {
                analyzeNonterminal(content[start .. i], context, isToken);
                isToken = false;
            }
            start = i;
        }
    }
    if (start != size_t.max)
        analyzeNonterminal(content[start .. $], context, isToken);
}

/**
Searches `tree` recursively for "GRAMMAR" and "GRAMMAR_LEX" macros and
analyzes each one found. The content of a grammar macro is not searched
for nested grammar macros.

Params:
    tree = tree to search; `null` is ignored
    context = receives the generated productions
*/
void findGrammar(Tree tree, Context context)
{
    if (tree is null)
        return;
    if (tree.nodeType == NodeType.nonterminal && tree.name == "Macro"
            && tree.childs[1].content.among("GRAMMAR", "GRAMMAR_LEX"))
    {
        analyzeGrammar(tree, context);
    }
    else
    {
        foreach (c; tree.childs)
            findGrammar(c, context);
    }
}

/**
Returns the header comment naming the HEAD commit of the repository at
`dlangRepo`, or `null` if the commit can not be determined (git fails,
or git can not be started at all).
*/
private string commitHeader(string dlangRepo)
{
    try
    {
        auto git = execute(["git", "-C", dlangRepo, "rev-parse", "HEAD"]);
        if (git.status == 0)
            return text("// Based on grammar from dlang.org commit ", git.output.strip(), "\n");
    }
    catch (ProcessException)
    {
        // git is not available; the header is optional.
    }
    return null;
}

/**
Writes a grammar file: the optional header, the `prologue` lines, all
productions of `context` in order and finally the `epilogue` lines.
*/
private void writeGrammarFile(string filename, string header, Context context,
        string[] prologue, string[] epilogue)
{
    File of = File(filename, "w");
    if (header.length)
        of.writeln(header);
    foreach (line; prologue)
        of.writeln(line);
    foreach (name; context.nonterminalsOrder)
        of.write(context.nonterminals[name]);
    foreach (line; epilogue)
        of.writeln(line);
}

/**
Entry point. Expects three arguments: the path of the dlang.org
repository, the output file for the parser grammar and the output file for
the lexer grammar.

Returns: 0 on success, 1 if the number of arguments is wrong.
*/
int main(string[] args)
{
    import P = grammarddoc;
    import std.file;
    import std.path;
    import std.stdio;

    alias L = grammarddoc_lexer.Lexer!LocationAll;
    alias Creator = DynamicParseTreeCreator!(P, LocationAll, LocationRangeStartEnd);
    Creator creator = new Creator;

    if (args.length != 4)
    {
        stderr.writeln("Usage: grammardgen dlang.org grammard.ebnf grammardlex.ebnf");
        return 1;
    }
    string dlangRepo = args[1];

    string header = commitHeader(dlangRepo);

    // Parses spec/<specName>.dd of the repository and returns the tree.
    auto parseSpec(string specName)
    {
        string filename = dlangRepo ~ "/spec/" ~ specName ~ ".dd";

        string inText = readText(filename);

        auto tree = P.parse!(Creator, L)(inText, creator);
        assert(tree.inputLength.bytePos <= inText.length);
        return tree;
    }

    Context contextLex = new Context();
    contextLex.isLexer = true;
    findGrammar(parseSpec("lex"), contextLex);
    writeGrammarFile(args[3], header, contextLex, [],
            ["Letter = [a-zA-Z];", "Tokens @Array = eps | Tokens Token;"]);

    Context context = new Context();
    // The parser grammar needs to know which names are tokens of the lexer grammar.
    context.tokens = contextLex.tokens;

    foreach (f; [
            "module", "expression", "declaration", "iasm", "attribute",
            "statement", "template", "class", "traits", "function", "struct",
            "unittest", "version", "template-mixin", "enum", "pragma", "interface",
            "type"
        ])
    {
        findGrammar(parseSpec(f), context);
    }

    // Placeholders for nonterminals that no processed spec file defined.
    foreach (name; [
            "ParameterMemberAttributes", "FunctionAttributes", "TypeVector",
            "Opcode"
        ])
    {
        if (name in context.nonterminals)
            continue;
        context.nonterminals[name] = name ~ " = \"TODO\";\n";
        context.nonterminalsOrder ~= name;
    }

    writeGrammarFile(args[2], header, context, ["import \"grammardlex.ebnf\";"], []);
    return 0;
}