1 
2 //          Copyright Tim Schendekehl 2023.
3 // Distributed under the Boost Software License, Version 1.0.
4 //    (See accompanying file LICENSE_1_0.txt or copy at
5 //          https://www.boost.org/LICENSE_1_0.txt)
6 
7 module dparsergen.core.utils;
8 import std.algorithm;
9 import std.conv;
10 import std.range;
11 import std.string;
12 import std.typetuple;
13 import std.uni;
14 import std.utf;
15 
16 /**
17 Adds elements to dst, which are not already in dst.
18 */
19 bool addOnce(T)(ref T[] dst, T[] src)
20 {
21     bool r = false;
22     foreach (s; src)
23     {
24         if (!canFind(dst, s))
25         {
26             dst ~= s;
27             r = true;
28         }
29     }
30     return r;
31 }
32 
33 /// ditto
34 bool addOnce(T)(ref T[] dst, T s)
35 {
36     bool r = false;
37     if (!canFind(dst, s))
38     {
39         dst ~= s;
40         r = true;
41     }
42     return r;
43 }
44 
45 /**
46 Creates an alias sequence with all members of array a.
47 */
48 template arrayToAliasSeq(alias a)
49 {
50     static if (a.length == 0)
51         alias arrayToAliasSeq = AliasSeq!();
52     else static if (a.length == 1)
53         alias arrayToAliasSeq = AliasSeq!(a[0]);
54     else
55         alias arrayToAliasSeq = AliasSeq!(arrayToAliasSeq!(a[0 .. $ / 2]),
56                 arrayToAliasSeq!(a[$ / 2 .. $]));
57 }
58 
59 /**
60 Creates an alias sequence containing T if b is true. An empty alias sequence
61 is created otherwise.
62 */
63 template AliasSeqIf(bool b, T...)
64 {
65     static if (b)
66         alias AliasSeqIf = T;
67     else
68         alias AliasSeqIf = AliasSeq!();
69 }
70 
71 private immutable char[2][] escapeSequences = [
72     ['\\', '\\'],
73     ['\"', '\"'],
74     ['\'', '\''],
75     ['\n', 'n'],
76     ['\t', 't'],
77     ['\r', 'r'],
78     ['\f', 'f'],
79     ['\0', '0'],
80     ['\a', 'a'],
81     ['\b', 'b'],
82     ['\v', 'v'],
83 ];
84 
85 /**
86 Escapes character c for use in a D string literal.
87 
88 Params:
89     c = The character to be escaped.
90     alwaysEscape = Selects if every character should be escaped.
91 */
92 string escapeChar(dchar c, bool alwaysEscape)
93 {
94     foreach (s; escapeSequences)
95     {
96         if (c == s[0])
97             return text("\\", s[1]);
98     }
99     if (c < 0x20 || c == 0x7f)
100         return format("\\x%02X", c);
101     if (c > 0xff && c <= 0xffff)
102         return format("\\u%04X", c);
103     if (c > 0xffff)
104         return format("\\U%08X", c);
105     if (alwaysEscape)
106         return text("\\", c);
107     else
108         return text(c);
109 }
110 
111 /**
112 Escapes every character in s for use in a D string literal.
113 */
114 string escapeD(const(char)[] s)
115 {
116     string r;
117     foreach (dchar c; s)
118     {
119         r ~= escapeChar(c, false);
120     }
121     return r;
122 }
123 
124 /**
125 Generates a D string literal for s.
126 */
127 string toDStringLiteral(const(char)[] s)
128 {
129     return text("\"", s.escapeD, "\"");
130 }
131 
132 /**
133 Escape string for HTML.
134 */
135 string escapeHTML(const(char)[] s)
136 {
137     Appender!string app;
138     foreach (char c; s)
139     {
140         if (c == '&')
141             app.put("&amp;");
142         else if (c == '\"')
143             app.put("&quot;");
144         else if (c == '$')
145             app.put("&#36;");
146         else if (c == '<')
147             app.put("&lt;");
148         else if (c == '>')
149             app.put("&gt;");
150         else
151             app.put(c);
152     }
153     return app.data;
154 }
155 
156 /**
157 Unescaped character.
158 */
159 struct UnescapedChar
160 {
161     /**
162     The unescaped character.
163     */
164     dchar c;
165 
166     /**
167     Was the character escaped?
168     */
169     bool isEscaped;
170 }
171 
172 /**
173 Iterate string by unescaped character.
174 */
175 auto byUnescapedDchar(R)(R range) if (isInputRange!R && is(ElementType!R == dchar))
176 {
177     static struct X
178     {
179         R range;
180 
181         UnescapedChar front;
182 
183         bool empty;
184 
185         void popFront()
186         {
187             if (range.empty)
188             {
189                 empty = true;
190                 return;
191             }
192             dchar next = range.front;
193             range.popFront;
194             bool isEscaped = false;
195             if (next == '\\')
196             {
197                 isEscaped = true;
198                 if (range.empty)
199                     throw new Exception(text("missing escape sequence "));
200                 next = range.front;
201                 range.popFront;
202 
203                 if (next == 'x' || next == 'u' || next == 'U')
204                 {
205                     ubyte len = 0;
206                     if (next == 'x')
207                         len = 2;
208                     else if (next == 'u')
209                         len = 4;
210                     else if (next == 'U')
211                         len = 8;
212                     dchar[8] unicodeHex;
213                     foreach (i; 0 .. len)
214                     {
215                         if (range.empty)
216                             throw new Exception(text("missing hex value for \"\\", next, "\""));
217                         unicodeHex[i] = range.front;
218                         range.popFront;
219                     }
220                     dchar[] unicodeHexRef = unicodeHex[0 .. len];
221                     next = cast(dchar) parse!uint(unicodeHexRef, 16);
222                     if (len == 2 && !isValidCodepoint(cast(char) next))
223                     {
224                         throw new Exception(text("Invalid UTF character \\x", unicodeHex[0 .. len]));
225                     }
226                     if (len == 4 && !isValidCodepoint(cast(wchar) next))
227                     {
228                         throw new Exception(text("Invalid UTF character \\u", unicodeHex[0 .. len]));
229                     }
230                     if (len == 8 && !isValidCodepoint(cast(dchar) next))
231                     {
232                         throw new Exception(text("Invalid UTF character \\U", unicodeHex[0 .. len]));
233                     }
234                     if (unicodeHexRef.length)
235                         throw new Exception(text("escape sequence ", next, " ",
236                                 unicodeHex[0 .. len], "not completely parsed"));
237                 }
238                 else
239                 {
240                     foreach (s; escapeSequences)
241                     {
242                         if (next == s[1])
243                             next = s[0];
244                     }
245                 }
246                 /*else
247                     throw new Exception(text("unknown escape sequence \"\\", next, "\""));*/
248             }
249             front = UnescapedChar(next, isEscaped);
250         }
251     }
252 
253     X x = X(range);
254     x.popFront;
255     return x;
256 }
257 
258 /**
259 Generates sorted array of all keys in associative array.
260 */
261 K[] sortedKeys(alias less = "a < b", K, V)(V[K] aa)
262 {
263     K[] r;
264     foreach (k, _; aa)
265         r ~= k;
266     sort!less(r);
267     return r;
268 }
269 
270 /// ditto
271 const(K)[] sortedKeys(alias less = "a < b", K, V)(const V[K] aa)
272 {
273     K[] r;
274     foreach (k, _; aa)
275         r ~= k;
276     sort!less(r);
277     return r;
278 }
279 
280 /**
281 Helper for list of todo items, which can be added while iterating.
282 */
283 class TodoList(K)
284 {
285     private Appender!(K[]) app;
286     private size_t[K] byKey;
287 
288     /**
289     Add item `key` if not already in list.
290     */
291     void put(K key)
292     {
293         if (key in byKey)
294             return;
295         size_t i = app.data.length;
296         app.put(key);
297         byKey[key] = i;
298     }
299 
300     /**
301     Add items `keys` if not already in list.
302     */
303     void put(K[] keys)
304     {
305         app.reserve(app.data.length + keys.length);
306         foreach (key; keys)
307             put(key);
308     }
309 
310     static struct Range
311     {
312         TodoList!K todoList;
313         size_t i;
314         bool empty() const
315         {
316             return i >= todoList.app.data.length;
317         }
318 
319         K front()
320         {
321             return todoList.app.data[i];
322         }
323 
324         void popFront()
325         {
326             i++;
327         }
328     }
329 
330     /**
331     Iterate over all items including items added while iterating.
332     */
333     Range keys()
334     {
335         return Range(this, 0);
336     }
337 
338     /**
339     Returns the current list of items.
340     */
341     K[] data()
342     {
343         return app.data;
344     }
345 }
346 
347 /**
348 Escape codepoint.
349 
350 Params:
351     codepoint = The codepoint to be escaped.
352     isSet = Additionally escape the characters "[]^-", so it can be used
353         in a codepoint set.
354 */
355 string escapeCodePoint(uint codepoint, bool inSet)
356 {
357     bool alwaysEscape = false;
358 
359     if (inSet && (codepoint == '[' || codepoint == ']' || codepoint == '^' || codepoint == '-'))
360         alwaysEscape = true;
361     return escapeChar(cast(dchar) codepoint, alwaysEscape);
362 }
363 
364 /**
365 Parse representation of codepoint set.
366 
367 Params:
368     spec = Specification of the codepoint set. Can contain a list of
369         single codepoints and ranges seperated by '-'. The set can be
370         inverted by starting the specification with '^'. Characters can
371         also be escaped using '\'.
372 */
373 CodepointSet codepointSetFromStr(string spec)
374 {
375     CodepointSet set;
376     auto chars = spec.byDchar.byUnescapedDchar;
377 
378     bool invert;
379     if (!chars.empty && !chars.front.isEscaped && chars.front.c == '^')
380     {
381         invert = true;
382         chars.popFront;
383     }
384 
385     while (!chars.empty)
386     {
387         dchar current = chars.front.c;
388         if (current == '-' && !chars.front.isEscaped)
389             throw new Exception("'-' at wrong place");
390         if (current == '−')
391             throw new Exception("Unexpected '−'. Did you mean '-'?");
392         try
393         {
394             chars.popFront;
395         }
396         catch (Exception e)
397         {
398             throw new Exception("Error parsing codepoint set \"" ~ spec ~ "\": " ~ e.msg);
399         }
400 
401         if (chars.front.c == '−')
402             throw new Exception("Unexpected '−'. Did you mean '-'?");
403 
404         if (!chars.empty && !chars.front.isEscaped && chars.front.c == '-')
405         {
406             chars.popFront;
407             if (chars.empty)
408                 throw new Exception("'-' at end " ~ spec);
409             if (chars.front.c == '−')
410                 throw new Exception("Unexpected '−'. Did you mean '-'?");
411 
412             dchar end = chars.front.c;
413             if (end == '-' && !chars.front.isEscaped)
414                 throw new Exception("'-' after another '-'" ~ "  " ~ spec);
415             chars.popFront;
416 
417             if (end < current)
418                 throw new Exception(text("illegal interval [", spec, "] ",
419                         current.escapeCodePoint(true), " - ", end.escapeCodePoint(true)));
420 
421             set |= CodepointSet(current, end + 1);
422         }
423         else
424         {
425             set |= CodepointSet(current, current + 1);
426         }
427     }
428     if (invert)
429         return set.inverted;
430     else
431         return set;
432 }
433 
434 /**
435 Check if character `c` belongs to the set specified by `spec`. See
436 [codepointSetFromStr] for the syntax of spec.
437 */
438 bool inCharSet(const(char)[] spec)(dchar c)
439 {
440     mixin(() {
441         auto set = codepointSetFromStr(spec);
442         string code;
443 
444         foreach (interval; set.byInterval)
445         {
446             code ~= text("if (c >= '", interval[0].escapeCodePoint(false), "' && c <= '",
447                 (interval[1] - 1).escapeCodePoint(false), "') return true;\n");
448         }
449 
450         code ~= "return false;";
451         return code;
452     }());
453 }
454 
455 ///
456 unittest
457 {
458     assert('a'.inCharSet!"a-z0-9");
459     assert('x'.inCharSet!"a-z0-9");
460     assert('z'.inCharSet!"a-z0-9");
461     assert('0'.inCharSet!"a-z0-9");
462     assert('5'.inCharSet!"a-z0-9");
463     assert('0'.inCharSet!"a-z0-9");
464 
465     assert(!'\0'.inCharSet!"a-z0-9");
466     assert(!'A'.inCharSet!"a-z0-9");
467     assert(!'-'.inCharSet!"a-z0-9");
468 
469     assert(!'a'.inCharSet!"^a-z0-9");
470     assert('#'.inCharSet!"^a-z0-9");
471 }
472 
473 string repeatChar(char c)(size_t num)
474 {
475     static Appender!(immutable(char)[]) app;
476     while (app.data.length < num)
477         app.put(c);
478     return app.data[0 .. num];
479 }
480 
481 /**
482 Allocator for array, which is used internally for temporary data.
483 */
484 struct SimpleArrayAllocator(T, size_t blockSize = 4 * 1024 - 32)
485 {
486     enum classSize = T.sizeof;
487     enum classesPerBlock = blockSize / classSize;
488 
489     private T[] data;
490     private T[][] usedData;
491     private size_t usedBlocks;
492 
493     private void nextBlock()
494     {
495         if (usedBlocks < usedData.length)
496         {
497             data = usedData[usedBlocks];
498             usedBlocks++;
499             return;
500         }
501         assert(usedBlocks == usedData.length);
502         data = new T[classesPerBlock];
503         usedData ~= data;
504         usedBlocks++;
505     }
506 
507     /**
508     Allocate array.
509 
510     Params:
511         value = Initial value used for every element.
512         num = Length of array.
513     */
514     T[] allocate(U)(U value, size_t num)
515     {
516         if (data.length < num)
517         {
518             if ((data.length < 100 || data.length < classesPerBlock / 2) && num <= classesPerBlock)
519             {
520                 nextBlock();
521             }
522             else
523             {
524                 T[] r;
525                 r.length = num;
526                 r[] = value;
527                 return r;
528             }
529         }
530         auto r = data[0 .. num];
531         data = data[num .. $];
532         r[] = value;
533         return r;
534     }
535 
536     /**
537     Clear all referenced data, so it can be reused.
538     */
539     void clearAll()
540     {
541         foreach (i; 0 .. usedBlocks)
542         {
543             usedData[i][] = T.init;
544         }
545         usedBlocks = 0;
546         data = [];
547     }
548 }
549 
550 /**
551 Convert enum to string by interpreting the members as flags.
552 */
553 string flagsToString(E)(E e) if (is(E == enum))
554 {
555     string r;
556     E unknown = e;
557     string noneEntry = "0";
558     foreach (name; __traits(allMembers, E))
559     {
560         static if (__traits(getMember, E, name) == 0)
561             noneEntry = name;
562         else if ((e & __traits(getMember, E, name)) == __traits(getMember, E, name))
563         {
564             if (r.length)
565                 r ~= "|";
566             r ~= name;
567             unknown &= ~__traits(getMember, E, name);
568         }
569     }
570     if (unknown)
571     {
572         if (r.length)
573             r ~= "|";
574         r ~= text(cast(uint) unknown);
575     }
576     if (r.length == 0)
577         r = noneEntry;
578     return r;
579 }
580 
581 ///
582 unittest
583 {
584     enum EnumTest
585     {
586         a = 1,
587         b = 2,
588         c = 6,
589         d = 24
590     }
591 
592     assert(flagsToString(EnumTest.a) == "a");
593     assert(flagsToString(EnumTest.b) == "b");
594     assert(flagsToString(EnumTest.c) == "b|c");
595     assert(flagsToString(EnumTest.d) == "d");
596     assert(flagsToString(EnumTest.a | EnumTest.b) == "a|b");
597     assert(flagsToString(EnumTest.a | EnumTest.c) == "a|b|c");
598     assert(flagsToString(EnumTest.a | EnumTest.d) == "a|d");
599     assert(flagsToString(EnumTest.b | EnumTest.d) == "b|d");
600     assert(flagsToString(EnumTest.c | EnumTest.d) == "b|c|d");
601     assert(flagsToString(EnumTest.a | EnumTest.c | EnumTest.d) == "a|b|c|d");
602     assert(flagsToString(EnumTest.c & ~EnumTest.b) == "4");
603     assert(flagsToString(cast(EnumTest) 8) == "8");
604     assert(flagsToString(cast(EnumTest) 9) == "a|8");
605 }
606 
607 /**
608 Returns member of object, which is not null, and the default value for
609 null objects.
610 */
611 auto memberOrDefault(string name, T)(T obj) if (is(T == class))
612 {
613     if (obj is null)
614         return typeof({ return __traits(getMember, obj, name); }()).init;
615     else
616         return __traits(getMember, obj, name);
617 }