1 2 // Copyright Tim Schendekehl 2023. 3 // Distributed under the Boost Software License, Version 1.0. 4 // (See accompanying file LICENSE_1_0.txt or copy at 5 // https://www.boost.org/LICENSE_1_0.txt) 6 7 module dparsergen.core.utils; 8 import std.algorithm; 9 import std.conv; 10 import std.range; 11 import std.string; 12 import std.typetuple; 13 import std.uni; 14 import std.utf; 15 16 /** 17 Adds elements to dst, which are not already in dst. 18 */ 19 bool addOnce(T)(ref T[] dst, T[] src) 20 { 21 bool r = false; 22 foreach (s; src) 23 { 24 if (!canFind(dst, s)) 25 { 26 dst ~= s; 27 r = true; 28 } 29 } 30 return r; 31 } 32 33 /// ditto 34 bool addOnce(T)(ref T[] dst, T s) 35 { 36 bool r = false; 37 if (!canFind(dst, s)) 38 { 39 dst ~= s; 40 r = true; 41 } 42 return r; 43 } 44 45 /** 46 Creates an alias sequence with all members of array a. 47 */ 48 template arrayToAliasSeq(alias a) 49 { 50 static if (a.length == 0) 51 alias arrayToAliasSeq = AliasSeq!(); 52 else static if (a.length == 1) 53 alias arrayToAliasSeq = AliasSeq!(a[0]); 54 else 55 alias arrayToAliasSeq = AliasSeq!(arrayToAliasSeq!(a[0 .. $ / 2]), 56 arrayToAliasSeq!(a[$ / 2 .. $])); 57 } 58 59 /** 60 Creates an alias sequence containing T if b is true. An empty alias sequence 61 is created otherwise. 62 */ 63 template AliasSeqIf(bool b, T...) 64 { 65 static if (b) 66 alias AliasSeqIf = T; 67 else 68 alias AliasSeqIf = AliasSeq!(); 69 } 70 71 private immutable char[2][] escapeSequences = [ 72 ['\\', '\\'], 73 ['\"', '\"'], 74 ['\'', '\''], 75 ['\n', 'n'], 76 ['\t', 't'], 77 ['\r', 'r'], 78 ['\f', 'f'], 79 ['\0', '0'], 80 ['\a', 'a'], 81 ['\b', 'b'], 82 ['\v', 'v'], 83 ]; 84 85 /** 86 Escapes character c for use in a D string literal. 87 88 Params: 89 c = The character to be escaped. 90 alwaysEscape = Selects if every character should be escaped. 91 */ 92 string escapeChar(dchar c, bool alwaysEscape) 93 { 94 foreach (s; escapeSequences) 95 { 96 if (c == s[0]) 97 return text("\\", s[1]); 98 } 99 if (c < 0x20 || c == 0x7f) 100 return format("\\x%02X", c); 101 if (c > 0xff && c <= 0xffff) 102 return format("\\u%04X", c); 103 if (c > 0xffff) 104 return format("\\U%08X", c); 105 if (alwaysEscape) 106 return text("\\", c); 107 else 108 return text(c); 109 } 110 111 /** 112 Escapes every character in s for use in a D string literal. 113 */ 114 string escapeD(const(char)[] s) 115 { 116 string r; 117 foreach (dchar c; s) 118 { 119 r ~= escapeChar(c, false); 120 } 121 return r; 122 } 123 124 /** 125 Generates a D string literal for s. 126 */ 127 string toDStringLiteral(const(char)[] s) 128 { 129 return text("\"", s.escapeD, "\""); 130 } 131 132 /** 133 Escape string for HTML. 134 */ 135 string escapeHTML(const(char)[] s) 136 { 137 Appender!string app; 138 foreach (char c; s) 139 { 140 if (c == '&') 141 app.put("&"); 142 else if (c == '\"') 143 app.put("""); 144 else if (c == '$') 145 app.put("$"); 146 else if (c == '<') 147 app.put("<"); 148 else if (c == '>') 149 app.put(">"); 150 else 151 app.put(c); 152 } 153 return app.data; 154 } 155 156 /** 157 Unescaped character. 158 */ 159 struct UnescapedChar 160 { 161 /** 162 The unescaped character. 163 */ 164 dchar c; 165 166 /** 167 Was the character escaped? 168 */ 169 bool isEscaped; 170 } 171 172 /** 173 Iterate string by unescaped character. 174 */ 175 auto byUnescapedDchar(R)(R range) if (isInputRange!R && is(ElementType!R == dchar)) 176 { 177 static struct X 178 { 179 R range; 180 181 UnescapedChar front; 182 183 bool empty; 184 185 void popFront() 186 { 187 if (range.empty) 188 { 189 empty = true; 190 return; 191 } 192 dchar next = range.front; 193 range.popFront; 194 bool isEscaped = false; 195 if (next == '\\') 196 { 197 isEscaped = true; 198 if (range.empty) 199 throw new Exception(text("missing escape sequence ")); 200 next = range.front; 201 range.popFront; 202 203 if (next == 'x' || next == 'u' || next == 'U') 204 { 205 ubyte len = 0; 206 if (next == 'x') 207 len = 2; 208 else if (next == 'u') 209 len = 4; 210 else if (next == 'U') 211 len = 8; 212 dchar[8] unicodeHex; 213 foreach (i; 0 .. len) 214 { 215 if (range.empty) 216 throw new Exception(text("missing hex value for \"\\", next, "\"")); 217 unicodeHex[i] = range.front; 218 range.popFront; 219 } 220 dchar[] unicodeHexRef = unicodeHex[0 .. len]; 221 next = cast(dchar) parse!uint(unicodeHexRef, 16); 222 if (len == 2 && !isValidCodepoint(cast(char) next)) 223 { 224 throw new Exception(text("Invalid UTF character \\x", unicodeHex[0 .. len])); 225 } 226 if (len == 4 && !isValidCodepoint(cast(wchar) next)) 227 { 228 throw new Exception(text("Invalid UTF character \\u", unicodeHex[0 .. len])); 229 } 230 if (len == 8 && !isValidCodepoint(cast(dchar) next)) 231 { 232 throw new Exception(text("Invalid UTF character \\U", unicodeHex[0 .. len])); 233 } 234 if (unicodeHexRef.length) 235 throw new Exception(text("escape sequence ", next, " ", 236 unicodeHex[0 .. len], "not completely parsed")); 237 } 238 else 239 { 240 foreach (s; escapeSequences) 241 { 242 if (next == s[1]) 243 next = s[0]; 244 } 245 } 246 /*else 247 throw new Exception(text("unknown escape sequence \"\\", next, "\""));*/ 248 } 249 front = UnescapedChar(next, isEscaped); 250 } 251 } 252 253 X x = X(range); 254 x.popFront; 255 return x; 256 } 257 258 /** 259 Generates sorted array of all keys in associative array. 260 */ 261 K[] sortedKeys(alias less = "a < b", K, V)(V[K] aa) 262 { 263 K[] r; 264 foreach (k, _; aa) 265 r ~= k; 266 sort!less(r); 267 return r; 268 } 269 270 /// ditto 271 const(K)[] sortedKeys(alias less = "a < b", K, V)(const V[K] aa) 272 { 273 K[] r; 274 foreach (k, _; aa) 275 r ~= k; 276 sort!less(r); 277 return r; 278 } 279 280 /** 281 Helper for list of todo items, which can be added while iterating. 282 */ 283 class TodoList(K) 284 { 285 private Appender!(K[]) app; 286 private size_t[K] byKey; 287 288 /** 289 Add item `key` if not already in list. 290 */ 291 void put(K key) 292 { 293 if (key in byKey) 294 return; 295 size_t i = app.data.length; 296 app.put(key); 297 byKey[key] = i; 298 } 299 300 /** 301 Add items `keys` if not already in list. 302 */ 303 void put(K[] keys) 304 { 305 app.reserve(app.data.length + keys.length); 306 foreach (key; keys) 307 put(key); 308 } 309 310 static struct Range 311 { 312 TodoList!K todoList; 313 size_t i; 314 bool empty() const 315 { 316 return i >= todoList.app.data.length; 317 } 318 319 K front() 320 { 321 return todoList.app.data[i]; 322 } 323 324 void popFront() 325 { 326 i++; 327 } 328 } 329 330 /** 331 Iterate over all items including items added while iterating. 332 */ 333 Range keys() 334 { 335 return Range(this, 0); 336 } 337 338 /** 339 Returns the current list of items. 340 */ 341 K[] data() 342 { 343 return app.data; 344 } 345 } 346 347 /** 348 Escape codepoint. 349 350 Params: 351 codepoint = The codepoint to be escaped. 352 isSet = Additionally escape the characters "[]^-", so it can be used 353 in a codepoint set. 354 */ 355 string escapeCodePoint(uint codepoint, bool inSet) 356 { 357 bool alwaysEscape = false; 358 359 if (inSet && (codepoint == '[' || codepoint == ']' || codepoint == '^' || codepoint == '-')) 360 alwaysEscape = true; 361 return escapeChar(cast(dchar) codepoint, alwaysEscape); 362 } 363 364 /** 365 Parse representation of codepoint set. 366 367 Params: 368 spec = Specification of the codepoint set. Can contain a list of 369 single codepoints and ranges seperated by '-'. The set can be 370 inverted by starting the specification with '^'. Characters can 371 also be escaped using '\'. 372 */ 373 CodepointSet codepointSetFromStr(string spec) 374 { 375 CodepointSet set; 376 auto chars = spec.byDchar.byUnescapedDchar; 377 378 bool invert; 379 if (!chars.empty && !chars.front.isEscaped && chars.front.c == '^') 380 { 381 invert = true; 382 chars.popFront; 383 } 384 385 while (!chars.empty) 386 { 387 dchar current = chars.front.c; 388 if (current == '-' && !chars.front.isEscaped) 389 throw new Exception("'-' at wrong place"); 390 if (current == '−') 391 throw new Exception("Unexpected '−'. Did you mean '-'?"); 392 try 393 { 394 chars.popFront; 395 } 396 catch (Exception e) 397 { 398 throw new Exception("Error parsing codepoint set \"" ~ spec ~ "\": " ~ e.msg); 399 } 400 401 if (chars.front.c == '−') 402 throw new Exception("Unexpected '−'. Did you mean '-'?"); 403 404 if (!chars.empty && !chars.front.isEscaped && chars.front.c == '-') 405 { 406 chars.popFront; 407 if (chars.empty) 408 throw new Exception("'-' at end " ~ spec); 409 if (chars.front.c == '−') 410 throw new Exception("Unexpected '−'. Did you mean '-'?"); 411 412 dchar end = chars.front.c; 413 if (end == '-' && !chars.front.isEscaped) 414 throw new Exception("'-' after another '-'" ~ " " ~ spec); 415 chars.popFront; 416 417 if (end < current) 418 throw new Exception(text("illegal interval [", spec, "] ", 419 current.escapeCodePoint(true), " - ", end.escapeCodePoint(true))); 420 421 set |= CodepointSet(current, end + 1); 422 } 423 else 424 { 425 set |= CodepointSet(current, current + 1); 426 } 427 } 428 if (invert) 429 return set.inverted; 430 else 431 return set; 432 } 433 434 /** 435 Check if character `c` belongs to the set specified by `spec`. See 436 [codepointSetFromStr] for the syntax of spec. 437 */ 438 bool inCharSet(const(char)[] spec)(dchar c) 439 { 440 mixin(() { 441 auto set = codepointSetFromStr(spec); 442 string code; 443 444 foreach (interval; set.byInterval) 445 { 446 code ~= text("if (c >= '", interval[0].escapeCodePoint(false), "' && c <= '", 447 (interval[1] - 1).escapeCodePoint(false), "') return true;\n"); 448 } 449 450 code ~= "return false;"; 451 return code; 452 }()); 453 } 454 455 /// 456 unittest 457 { 458 assert('a'.inCharSet!"a-z0-9"); 459 assert('x'.inCharSet!"a-z0-9"); 460 assert('z'.inCharSet!"a-z0-9"); 461 assert('0'.inCharSet!"a-z0-9"); 462 assert('5'.inCharSet!"a-z0-9"); 463 assert('0'.inCharSet!"a-z0-9"); 464 465 assert(!'\0'.inCharSet!"a-z0-9"); 466 assert(!'A'.inCharSet!"a-z0-9"); 467 assert(!'-'.inCharSet!"a-z0-9"); 468 469 assert(!'a'.inCharSet!"^a-z0-9"); 470 assert('#'.inCharSet!"^a-z0-9"); 471 } 472 473 string repeatChar(char c)(size_t num) 474 { 475 static Appender!(immutable(char)[]) app; 476 while (app.data.length < num) 477 app.put(c); 478 return app.data[0 .. num]; 479 } 480 481 /** 482 Allocator for array, which is used internally for temporary data. 483 */ 484 struct SimpleArrayAllocator(T, size_t blockSize = 4 * 1024 - 32) 485 { 486 enum classSize = T.sizeof; 487 enum classesPerBlock = blockSize / classSize; 488 489 private T[] data; 490 private T[][] usedData; 491 private size_t usedBlocks; 492 493 private void nextBlock() 494 { 495 if (usedBlocks < usedData.length) 496 { 497 data = usedData[usedBlocks]; 498 usedBlocks++; 499 return; 500 } 501 assert(usedBlocks == usedData.length); 502 data = new T[classesPerBlock]; 503 usedData ~= data; 504 usedBlocks++; 505 } 506 507 /** 508 Allocate array. 509 510 Params: 511 value = Initial value used for every element. 512 num = Length of array. 513 */ 514 T[] allocate(U)(U value, size_t num) 515 { 516 if (data.length < num) 517 { 518 if ((data.length < 100 || data.length < classesPerBlock / 2) && num <= classesPerBlock) 519 { 520 nextBlock(); 521 } 522 else 523 { 524 T[] r; 525 r.length = num; 526 r[] = value; 527 return r; 528 } 529 } 530 auto r = data[0 .. num]; 531 data = data[num .. $]; 532 r[] = value; 533 return r; 534 } 535 536 /** 537 Clear all referenced data, so it can be reused. 538 */ 539 void clearAll() 540 { 541 foreach (i; 0 .. usedBlocks) 542 { 543 usedData[i][] = T.init; 544 } 545 usedBlocks = 0; 546 data = []; 547 } 548 } 549 550 /** 551 Convert enum to string by interpreting the members as flags. 552 */ 553 string flagsToString(E)(E e) if (is(E == enum)) 554 { 555 string r; 556 E unknown = e; 557 string noneEntry = "0"; 558 foreach (name; __traits(allMembers, E)) 559 { 560 static if (__traits(getMember, E, name) == 0) 561 noneEntry = name; 562 else if ((e & __traits(getMember, E, name)) == __traits(getMember, E, name)) 563 { 564 if (r.length) 565 r ~= "|"; 566 r ~= name; 567 unknown &= ~__traits(getMember, E, name); 568 } 569 } 570 if (unknown) 571 { 572 if (r.length) 573 r ~= "|"; 574 r ~= text(cast(uint) unknown); 575 } 576 if (r.length == 0) 577 r = noneEntry; 578 return r; 579 } 580 581 /// 582 unittest 583 { 584 enum EnumTest 585 { 586 a = 1, 587 b = 2, 588 c = 6, 589 d = 24 590 } 591 592 assert(flagsToString(EnumTest.a) == "a"); 593 assert(flagsToString(EnumTest.b) == "b"); 594 assert(flagsToString(EnumTest.c) == "b|c"); 595 assert(flagsToString(EnumTest.d) == "d"); 596 assert(flagsToString(EnumTest.a | EnumTest.b) == "a|b"); 597 assert(flagsToString(EnumTest.a | EnumTest.c) == "a|b|c"); 598 assert(flagsToString(EnumTest.a | EnumTest.d) == "a|d"); 599 assert(flagsToString(EnumTest.b | EnumTest.d) == "b|d"); 600 assert(flagsToString(EnumTest.c | EnumTest.d) == "b|c|d"); 601 assert(flagsToString(EnumTest.a | EnumTest.c | EnumTest.d) == "a|b|c|d"); 602 assert(flagsToString(EnumTest.c & ~EnumTest.b) == "4"); 603 assert(flagsToString(cast(EnumTest) 8) == "8"); 604 assert(flagsToString(cast(EnumTest) 9) == "a|8"); 605 } 606 607 /** 608 Returns member of object, which is not null, and the default value for 609 null objects. 610 */ 611 auto memberOrDefault(string name, T)(T obj) if (is(T == class)) 612 { 613 if (obj is null) 614 return typeof({ return __traits(getMember, obj, name); }()).init; 615 else 616 return __traits(getMember, obj, name); 617 }