/**
Generator that extracts grammar rules from the `.dd` specification sources of
a dlang.org checkout and writes them out as two EBNF files: one for the
lexer rules and one for the parser rules.

Usage: grammardgen dlang.org grammard.ebnf grammardlex.ebnf
*/
module grammardgen;
import dparsergen.core.dynamictree;
import dparsergen.core.location;
import dparsergen.core.nodetype;
import dparsergen.core.utils;
import std.algorithm;
import std.array;
import std.conv;
import std.process;
import std.stdio;
import std.string;

static import grammarddoc_lexer;

/// Parse tree type produced by the `grammarddoc` parser for the `.dd` files.
alias Tree = DynamicParseTree!(LocationAll, LocationRangeStartEnd);

/**
Appends the content of every token below `tree` to `output`, visiting the
children in order.

Params:
    tree = subtree to flatten; `null` is accepted and contributes nothing
    output = string the token contents are appended to
*/
void printTree(Tree tree, ref string output)
{
    if (tree is null)
        return;
    if (tree.isToken)
        output ~= tree.content;

    foreach (child; tree.childs)
        printTree(child, output);
}

/// One symbol on the right-hand side of a generated production.
struct Symbol
{
    /// Text of the symbol: a nonterminal name or the literal text of a token.
    string name;
    /// True if the symbol is emitted as a quoted token literal.
    bool isToken;
    /// True if the symbol was followed by an `OPT` macro (emitted as `?`).
    bool hasOpt;
}

/// Calls `printSymbol` for every tree in `trees`, in order.
private void printSymbols(Tree[] trees, ref Symbol[] output)
{
    foreach (t; trees)
        printSymbol(t, output);
}

/// Returns the part of `content` after the first `Comma` node, or all of
/// `content` if it contains no `Comma`.
private Tree[] afterFirstComma(Tree[] content)
{
    foreach (i, c; content)
        if (c.name == "Comma")
            return content[i + 1 .. $];
    return content;
}

/// Returns the part of `content` after the last `Comma` node, or all of
/// `content` if it contains no `Comma`.
private Tree[] afterLastComma(Tree[] content)
{
    size_t paramStart = 0;
    foreach (i, c; content)
        if (c.name == "Comma")
            paramStart = i + 1;
    return content[paramStart .. $];
}

/// Returns the literal token text a macro stands for, or `null` if the macro
/// is not one of the known single-token macros.
private string tokenTextForMacro(string macroName)
{
    switch (macroName)
    {
    case "LPAREN":
        return "(";
    case "RPAREN":
        return ")";
    case "CODE_LCURL":
        return "{";
    case "CODE_RCURL":
        return "}";
    case "CODE_PERCENT":
        return "%";
    case "BACKTICK":
        return "`";
    case "CODE_AMP", "AMP":
        return "&";
    default:
        return null;
    }
}

/// Returns the nonterminal name a macro stands for, or `null` if the macro
/// is not one of the known nonterminal shorthand macros.
private string nonterminalForMacro(string macroName)
{
    switch (macroName)
    {
    case "IDENTIFIER":
        return "Identifier";
    case "EXPRESSION":
        return "Expression";
    case "ASSIGNEXPRESSION":
        return "AssignExpression";
    case "PSCURLYSCOPE":
        return "NonEmptyOrScopeBlockStatement";
    case "PSSCOPE":
        return "ScopeStatement";
    case "PSSEMI_PSCURLYSCOPE_LIST":
        return "ScopeStatementList";
    case "PS0":
        return "NoScopeNonEmptyStatement";
    case "PSSEMI":
        return "NoScopeStatement";
    case "PSSEMI_PSCURLYSCOPE":
        return "Statement";
    default:
        return null;
    }
}

/**
Translates one node of a production's right-hand side into symbols.

The last element of `output` is the symbol currently being built: text is
appended to its name, and a new element is added whenever a symbol boundary
(whitespace, a backquoted token, ...) is found.

Params:
    tree = node to translate: a `WS` node, a `Macro` node or any other node,
           which is flattened to text with `printTree`
    output = symbols of the current alternative; must not be empty
*/
void printSymbol(Tree tree, ref Symbol[] output)
{
    if (tree.name == "WS")
    {
        // Whitespace ends the current symbol, unless nothing was collected
        // for it yet. The new symbol inherits the token flag.
        if (output[$ - 1].name.length)
        {
            output.length++;
            output[$ - 1].isToken = output[$ - 2].isToken;
        }
    }
    else if (tree.name == "Macro")
    {
        // childs[1] holds the macro name, childs[2] its arguments.
        string macroName = tree.childs[1].content;
        Tree[] content = tree.childs[2].childs;
        while (content.length && content[0].name == "WS")
            content = content[1 .. $];

        switch (macroName)
        {
        case "D", "B":
            // The argument is literal token text.
            output[$ - 1].isToken = true;
            printSymbols(content, output);
            break;
        case "D].":
            // Handled as the two separate tokens "]" and ".".
            output[$ - 1].isToken = true;
            output[$ - 1].name ~= "]";
            output.length++;
            output[$ - 1].isToken = true;
            output[$ - 1].name ~= ".";
            break;
        case "I":
            printSymbols(content, output);
            break;
        case "RELATIVE_LINK2", "GLINK2", "LINK2", "GLINK", "GSELF":
            // Only what follows the first comma is translated.
            printSymbols(afterFirstComma(content), output);
            break;
        case "DDSUBLINK":
            // Only what follows the last comma is translated.
            printSymbols(afterLastComma(content), output);
            break;
        case "GLINK_LEX":
            // Only the first argument node is translated; an empty argument
            // list contributes nothing instead of raising a range error.
            printSymbols(content[0 .. content.length ? 1 : 0], output);
            break;
        default:
            string tokenText = tokenTextForMacro(macroName);
            string nonterminal = nonterminalForMacro(macroName);
            if (tokenText.length)
            {
                output[$ - 1].name ~= tokenText;
                output[$ - 1].isToken = true;
            }
            else if (nonterminal.length)
            {
                output[$ - 1].name ~= nonterminal;
            }
            else
            {
                // Unknown macro: keep it visible in the generated name.
                output[$ - 1].name ~= "$(" ~ macroName;
                printSymbols(content, output);
                output[$ - 1].name ~= ")";
            }
            break;
        }
    }
    else
    {
        string names;
        printTree(tree, names);
        foreach (i, name; names.split())
        {
            if (name.length >= 3 && name[0] == '`' && name[$ - 1] == '`')
            {
                // A backquoted word is a token of its own.
                if (output[$ - 1].name.length)
                {
                    output.length++;
                }
                output[$ - 1].isToken = true;
                name = name[1 .. $ - 1];
            }
            else if (i)
            {
                // Every further whitespace-separated word starts a new
                // symbol, which inherits the token flag.
                output.length++;
                output[$ - 1].isToken = output[$ - 2].isToken;
            }
            output[$ - 1].name ~= name;
        }
    }
}

/// State collected while extracting the rules of one output grammar.
class Context
{
    /// Generated rule text by nonterminal name.
    string[string] nonterminals;
    /// Nonterminal names in the order they were first seen.
    string[] nonterminalsOrder;
    /// Names of rules which were emitted with the `token` prefix.
    bool[string] tokens;
    /// True while the lexer grammar is processed.
    bool isLexer;
}

/**
Tells whether the rule for `name` gets the array annotations.

Returns: `true` if `name` ends with "List", "Array", "Attributes" or "Empty"
or is one of the explicitly listed names.
*/
bool isArrayNonterminal(string name)
{
    if (name.endsWith("List", "Array", "Attributes", "Empty"))
        return true;
    return name.among("AliasAssignments", "AnonymousEnumMembers",
            "ArrayMemberInitializations", "AttributesNoPragma",
            "AutoAssignments", "Declarators", "DeclDefs", "EnumMembers",
            "FunctionContracts",
            "FunctionContractsEndingInOutContractExpression",
            "FunctionContractsEndingInOutStatement",
            "InOutContractExpressions", "Interfaces", "KeyValuePairs",
            "Slice", "Slice2", "StatementListNoCaseNoDefault",
            "StorageClasses", "StorageClassesAttributesNoPragma",
            "StructMemberInitializers", "TraitsArguments", "TypeCtors") != 0;
}

/**
Translates the definition of one nonterminal into EBNF text and stores it in
`context`.

Params:
    trees = nodes of the definition; the first one has to be the `GNAME`
            macro containing the name of the nonterminal
    context = receives the generated rule (`nonterminals`,
              `nonterminalsOrder`) and, for tokens, the name in `tokens`
    isLexer = true if the rule belongs to the lexer grammar; rules which are
              neither tokens nor forced nonterminals then become fragments
    isToken = true if the rule should be emitted as a token; may be
              overridden depending on the name
*/
void analyzeNonterminal(Tree[] trees, Context context, bool isLexer, bool isToken)
{
    string name;
    assert(trees[0].name == "Macro");
    assert(trees[0].childs[1].content == "GNAME");
    foreach (c; trees[0].childs[2].childs)
        if (c.name == "Text")
            name = c.childs[0].content;
    trees = trees[1 .. $];

    // Skip the rest of the header line (":" and LEGACY_LNAME2 macros) up to
    // and including the first newline.
    while (true)
    {
        assert(trees.length);
        if (trees[0].name == "Text" && trees[0].childs[0].content == ":")
        {
            trees = trees[1 .. $];
        }
        else if (trees[0].name == "Macro" && trees[0].childs[1].content == "LEGACY_LNAME2")
        {
            trees = trees[1 .. $];
        }
        else if (trees[0].name == "NL")
        {
            trees = trees[1 .. $];
            break;
        }
        else
        {
            assert(false, text(trees[0]));
        }
    }

    // One entry per alternative; every alternative always contains at least
    // the symbol which is currently being built.
    Symbol[][] symbols = [[Symbol()]];

    void findSymbols(Tree[] trees)
    {
        foreach (i, c; trees)
        {
            if (c.name == "NL")
            {
                // A newline starts the next alternative, unless the current
                // one is still empty.
                if (symbols[$ - 1].length > 1 || symbols[$ - 1][0].name.length)
                {
                    symbols ~= [Symbol()];
                }
            }
            else if (c.name == "WS")
            {
                if (symbols[$ - 1][$ - 1].name.length)
                    symbols[$ - 1].length++;
            }
            else if (c.name == "Macro" && c.childs[1].content == "OPT")
            {
                symbols[$ - 1][$ - 1].hasOpt = true;
            }
            else if (c.name == "Macro" && c.childs[1].content == "LEGACY_LNAME2")
            {
                // Contributes nothing to the grammar.
            }
            else if (c.name == "Macro" && c.childs[1].content == "MULTICOLS")
            {
                // Drop everything up to and including the first comma and
                // treat the remaining arguments like normal content.
                Tree[] trees2 = c.childs[2].childs;
                while (trees2.length)
                {
                    if (trees2[0].name == "Comma")
                    {
                        trees2 = trees2[1 .. $];
                        break;
                    }
                    trees2 = trees2[1 .. $];
                }
                findSymbols(trees2);
            }
            else
            {
                printSymbol(c, symbols[$ - 1]);
            }
        }
    }

    findSymbols(trees);

    foreach (ref output; symbols)
    {
        // Remove the unfinished empty symbol at the end of an alternative.
        if (output[$ - 1].name.length == 0)
            output.length--;
        // Remove a trailing symbol of the form "(...)".
        if (output.length && output[$ - 1].name.startsWith("(") && output[$ - 1].name.endsWith(")"))
            output.length--;
    }
    if (symbols[$ - 1].length == 0)
        symbols.length--;

    if (name == "Register" || name == "Register64")
    {
        // For these rules every symbol is an alternative of its own.
        Symbol[][] symbolsBak = symbols;
        symbols = [];
        foreach (output; symbolsBak)
        {
            foreach (s; output)
                symbols ~= [s];
        }
    }

    // Token texts which are replaced by a sequence of separate tokens.
    string[][string] tokensToSplit = [
        "ST(0)": ["ST", "(", "0", ")"],
        "ST(1)": ["ST", "(", "1", ")"],
        "ST(2)": ["ST", "(", "2", ")"],
        "ST(3)": ["ST", "(", "3", ")"],
        "ST(4)": ["ST", "(", "4", ")"],
        "ST(5)": ["ST", "(", "5", ")"],
        "ST(6)": ["ST", "(", "6", ")"],
        "ST(7)": ["ST", "(", "7", ")"],
        "!is": ["!", "is"],
        "!in": ["!", "in"],
        ");": [")", ";"],
        "scope(success)": ["scope", "(", "success", ")"],
        "scope(exit)": ["scope", "(", "exit", ")"],
        "scope(failure)": ["scope", "(", "failure", ")"],
        "C++": ["C", "++"],
        "C++,": ["C", "++", ","],
        "Objective - C": ["Objective", "-", "C"],
        "( )": ["(", ")"],
    ];
    foreach (i, ref output; symbols)
    {
        Symbol[] output2;
        foreach (s; output)
        {
            if (s.isToken && s.name in tokensToSplit)
            {
                assert(!s.hasOpt);
                foreach (x; tokensToSplit[s.name])
                {
                    output2 ~= Symbol(x, true);
                }
            }
            else
                output2 ~= s;
        }
        output = output2;
    }

    // Decide on the kind of the rule based on its name.
    bool isNonterminal;
    if (name.endsWith("String"))
        isToken = true;
    if (name.among("Token", "Keyword", "StringLiteral", "TokenString", "SourceFile"))
    {
        isToken = false;
        isNonterminal = true;
    }
    bool isArray = isArrayNonterminal(name);

    // Rule header: optional kind prefix, name and annotations.
    string code;
    if (isToken)
        code ~= "token ";
    else if (isLexer && !isNonterminal)
        code ~= "fragment ";
    code ~= name;
    if (name.endsWith("Comment") || name == "SpecialTokenSequence"
            || name == "EndOfLine" || name == "WhiteSpace")
        code ~= " @ignoreToken";
    if (name == "SourceFile")
        code ~= " @start";
    if (name == "Identifier")
        code ~= " @lowPrio";
    if (isArray)
        code ~= " @array @regArray";
    if (isToken)
        context.tokens[name] = true;
    code ~= "\n";

    // Rule body: one line per alternative.
    foreach (i, ref output; symbols)
    {
        // Skip the alternative of ParameterAttributes which only refers to
        // ParameterAttributes itself.
        if (name == "ParameterAttributes" && output.length == 1
                && output[0].name == "ParameterAttributes")
            continue;

        if (i)
            code ~= " |";
        else
            code ~= " =";
        foreach (ref s; output)
        {
            if (s.name == ".." || s.name == "," || s.name == "=")
                s.isToken = true;
            if (name == "TraitsKeyword")
                s.isToken = true;

            if (s.isToken)
            {
                if (!context.isLexer && s.name.length == 1 && s.name[0] >= '0' && s.name[0] <= '9')
                {
                    // Single digits in the parser grammar are written as
                    // IntegerLiteral with the given text.
                    code ~= " IntegerLiteral>>\"" ~ s.name ~ "\"";
                }
                else
                {
                    string tname = s.name;
                    if (tname.length == 6 && tname.startsWith("\\u"))
                    {
                        // Six character texts starting with \u are kept
                        // without escaping.
                    }
                    else
                        tname = tname.escapeD;
                    code ~= " \"" ~ tname ~ "\"";
                }
            }
            else
            {
                // An alternative consisting of a single plain nonterminal is
                // written with the prefix "<" in the parser grammar.
                if (!context.isLexer && output.length == 1 && s.name[0] != '/'
                        && s.name != "@empty" && s.name !in context.tokens
                        && !isArray && !isArrayNonterminal(s.name))
                    code ~= " <" ~ s.name;
                else
                    code ~= " " ~ s.name;
            }
            if (s.hasOpt)
                code ~= "?";
        }
        code ~= "\n";
    }
    code ~= " ;\n";

    // A repeated definition replaces the stored text, but keeps the position
    // of the first definition in the output order.
    if (name !in context.nonterminals)
        context.nonterminalsOrder ~= name;
    context.nonterminals[name] = code;
}

/**
Splits the arguments of a grammar macro at every `GNAME` macro and passes
each part to `analyzeNonterminal`.

In a lexer context only the first definition is passed with `isToken` set;
all further definitions of the same macro are passed with `isToken == false`.

Params:
    tree = the grammar macro node; `null` is ignored
    context = context which receives the rules
*/
void analyzeGrammar(Tree tree, Context context)
{
    if (tree is null)
        return;
    Tree[] parts = tree.childs[2].childs;
    // Index of the GNAME macro starting the current definition.
    size_t start = size_t.max;
    bool isToken = context.isLexer;
    foreach (i, c; parts)
    {
        if (c.name == "Macro" && c.childs[1].content == "GNAME")
        {
            if (start != size_t.max)
            {
                analyzeNonterminal(parts[start .. i], context, context.isLexer, isToken);
                isToken = false;
            }
            start = i;
        }
    }
    if (start != size_t.max)
        analyzeNonterminal(parts[start .. $], context, context.isLexer, isToken);
}

/**
Searches `tree` recursively for `GRAMMAR` and `GRAMMAR_LEX` macros and
analyzes each of them. The content of such a macro is not searched again.

Params:
    tree = tree to search; `null` is ignored
    context = context which receives the rules
*/
void findGrammar(Tree tree, Context context)
{
    if (tree is null)
        return;
    if (tree.nodeType == NodeType.nonterminal && tree.name == "Macro"
            && tree.childs[1].content.among("GRAMMAR", "GRAMMAR_LEX"))
    {
        analyzeGrammar(tree, context);
    }
    else
    {
        foreach (c; tree.childs)
            findGrammar(c, context);
    }
}

/**
Entry point.

Params:
    args = program name, path of the dlang.org checkout, output file for the
           parser grammar and output file for the lexer grammar

Returns: 0 on success, 1 if the number of arguments is wrong.
*/
int main(string[] args)
{
    import P = grammarddoc;
    import std.file;
    import std.path;
    import std.stdio;

    alias L = grammarddoc_lexer.Lexer!LocationAll;
    alias Creator = DynamicParseTreeCreator!(P, LocationAll, LocationRangeStartEnd);
    Creator creator = new Creator;

    if (args.length != 4)
    {
        stderr.writeln("Usage: grammardgen dlang.org grammard.ebnf grammardlex.ebnf");
        return 1;
    }
    string dlangRepo = args[1];

    // The commit of the checkout is only used for a comment at the top of
    // the generated files, so a missing git executable is not an error.
    bool hasCommit = false;
    string commitId;
    try
    {
        auto git = execute(["git", "-C", dlangRepo, "rev-parse", "HEAD"]);
        if (git.status == 0)
        {
            hasCommit = true;
            commitId = git.output.strip();
        }
    }
    catch (ProcessException)
    {
        // git could not be started: generate the files without the comment.
    }

    // Parses spec/<specName>.dd and collects all grammar rules in ctx.
    void collectGrammar(string specName, Context ctx)
    {
        string filename = dlangRepo ~ "/spec/" ~ specName ~ ".dd";

        string inText = readText(filename);

        auto tree = P.parse!(Creator, L)(inText, creator);
        assert(tree.inputLength.bytePos <= inText.length);

        findGrammar(tree, ctx);
    }

    void writeHeader(File f)
    {
        if (hasCommit)
            f.writeln("// Based on grammar from dlang.org commit ", commitId, "\n");
    }

    Context contextLex = new Context();
    contextLex.isLexer = true;
    foreach (f; ["lex"])
        collectGrammar(f, contextLex);

    // The parser grammar needs the token names found in the lexer grammar.
    Context context = new Context();
    context.tokens = contextLex.tokens;

    foreach (f; [
            "module", "expression", "declaration", "iasm", "attribute",
            "statement", "template", "class", "traits", "function", "struct",
            "unittest", "version", "template-mixin", "enum", "pragma", "interface",
            "type"
        ])
        collectGrammar(f, context);

    // Nonterminals without a definition in the specification get a
    // placeholder rule.
    foreach (name; [
            "ParameterMemberAttributes", "FunctionAttributes", "TypeVector",
            "Opcode"
        ])
    {
        if (name in context.nonterminals)
            continue;
        context.nonterminals[name] = name ~ " = \"TODO\";\n";
        context.nonterminalsOrder ~= name;
    }

    // Lexer grammar. SourceFile is written to the parser grammar instead.
    File lexOut = File(args[3], "w");
    writeHeader(lexOut);
    foreach (name; contextLex.nonterminalsOrder)
    {
        if (name == "SourceFile")
            continue;
        lexOut.write(contextLex.nonterminals[name]);
    }
    lexOut.writeln("Letter = [a-zA-Z];");
    lexOut.writeln("Tokens @array = @empty | Tokens Token;");
    lexOut.close();

    // Parser grammar.
    File grammarOut = File(args[2], "w");
    writeHeader(grammarOut);
    grammarOut.writeln("import \"grammardlex.ebnf\";");
    grammarOut.write(contextLex.nonterminals["SourceFile"]);
    foreach (name; context.nonterminalsOrder)
    {
        grammarOut.write(context.nonterminals[name]);
    }

    return 0;
}