1 /*
2     Implementation of std.regex IR, an intermediate representation
3     of a regular expression pattern.
4 
5     This is a common ground between frontend regex component (parser)
6     and backend components - generators, matchers and other "filters".
7 */
8 module std.regex.internal.ir;
9 
10 package(std.regex):
11 
12 import std.exception, std.meta, std.range.primitives, std.traits, std.uni;
13 
14 debug(std_regex_parser) import std.stdio;
15 // just a common trait, may be moved elsewhere
16 alias BasicElementOf(Range) = Unqual!(ElementEncodingType!Range);
17 
18 enum privateUseStart = '\U000F0000', privateUseEnd ='\U000FFFFD';
19 
20 // heuristic value determines maximum CodepointSet length suitable for linear search
21 enum maxCharsetUsed = 6;
22 
23 // another variable to tweak behavior of caching generated Tries for character classes
24 enum maxCachedMatchers = 8;
25 
26 alias Trie = CodepointSetTrie!(13, 8);
27 alias makeTrie = codepointSetTrie!(13, 8);
28 
29 CharMatcher[CodepointSet] matcherCache;
30 
31 //accessor with caching
32 @trusted CharMatcher getMatcher(CodepointSet set)
33 {
34     // almost all properties of AA are not @safe
35     // https://issues.dlang.org/show_bug.cgi?id=6357
36     if (__ctfe || maxCachedMatchers == 0)
37         return CharMatcher(set);
38     else
39     {
40         auto p = set in matcherCache;
41         if (p)
42             return *p;
43         if (matcherCache.length == maxCachedMatchers)
44         {
45             // flush enmatchers in trieCache
46             matcherCache = null;
47         }
48         return (matcherCache[set] = CharMatcher(set));
49     }
50 }
51 
52 @property ref wordMatcher()()
53 {
54     static CharMatcher matcher = CharMatcher(wordCharacter);
55     return matcher;
56 }
57 
58 // some special Unicode white space characters
59 private enum NEL = '\u0085', LS = '\u2028', PS = '\u2029';
60 
61 //Regular expression engine/parser options:
62 // global - search  all nonoverlapping matches in input
63 // casefold - case insensitive matching, do casefolding on match in unicode mode
64 // freeform - ignore whitespace in pattern, to match space use [ ] or \s
65 // multiline - switch  ^, $ detect start and end of linesinstead of just start and end of input
66 enum RegexOption: uint {
67     global = 0x1,
68     casefold = 0x2,
69     freeform = 0x4,
70     nonunicode = 0x8,
71     multiline = 0x10,
72     singleline = 0x20
73 }
74 //do not reorder this list
75 alias RegexOptionNames = AliasSeq!('g', 'i', 'x', 'U', 'm', 's');
76 static assert( RegexOption.max < 0x80);
77 
78 package(std) string regexOptionsToString()(uint flags) nothrow pure @safe
79 {
80     flags &= (RegexOption.max << 1) - 1;
81     if (!flags)
82         return "";
83     char[RegexOptionNames.length] buffer = void;
84     size_t pos = 0;
85     foreach (i, flag; __traits(allMembers, RegexOption))
86         if (flags & __traits(getMember, RegexOption, flag))
87             buffer[pos++] = RegexOptionNames[i];
88     return buffer[0 .. pos].idup;
89 }
90 
91 // flags that allow guide execution of engine
92 enum RegexInfo : uint { oneShot = 0x80 }
93 
94 // IR bit pattern: 0b1_xxxxx_yy
95 // where yy indicates class of instruction, xxxxx for actual operation code
96 //     00: atom, a normal instruction
97 //     01: open, opening of a group, has length of contained IR in the low bits
98 //     10: close, closing of a group, has length of contained IR in the low bits
99 //     11 unused
100 //
101 // Loops with Q (non-greedy, with ? mark) must have the same size / other properties as non Q version
102 // Possible changes:
103 //* merge group, option, infinite/repeat start (to never copy during parsing of (a|b){1,2})
104 //* reorganize groups to make n args easier to find, or simplify the check for groups of similar ops
105 //  (like lookaround), or make it easier to identify hotspots.
106 
107 enum IR:uint {
108     Char              = 0b1_00000_00, //a character
109     Any               = 0b1_00001_00, //any character
110     CodepointSet      = 0b1_00010_00, //a most generic CodepointSet [...]
111     Trie              = 0b1_00011_00, //CodepointSet implemented as Trie
112     //match with any of a consecutive OrChar's in this sequence
113     //(used for case insensitive match)
114     //OrChar holds in upper two bits of data total number of OrChars in this _sequence_
115     //the drawback of this representation is that it is difficult
116     // to detect a jump in the middle of it
117     OrChar             = 0b1_00100_00,
118     Nop                = 0b1_00101_00, //no operation (padding)
119     End                = 0b1_00110_00, //end of program
120     Bol                = 0b1_00111_00, //beginning of a line ^
121     Eol                = 0b1_01000_00, //end of a line $
122     Wordboundary       = 0b1_01001_00, //boundary of a word
123     Notwordboundary    = 0b1_01010_00, //not a word boundary
124     Backref            = 0b1_01011_00, //backreference to a group (that has to be pinned, i.e. locally unique) (group index)
125     GroupStart         = 0b1_01100_00, //start of a group (x) (groupIndex+groupPinning(1bit))
126     GroupEnd           = 0b1_01101_00, //end of a group (x) (groupIndex+groupPinning(1bit))
127     Option             = 0b1_01110_00, //start of an option within an alternation x | y (length)
128     GotoEndOr          = 0b1_01111_00, //end of an option (length of the rest)
129     Bof                = 0b1_10000_00, //begining of "file" (string) ^
130     Eof                = 0b1_10001_00, //end of "file" (string) $
131     //... any additional atoms here
132 
133     OrStart            = 0b1_00000_01, //start of alternation group  (length)
134     OrEnd              = 0b1_00000_10, //end of the or group (length,mergeIndex)
135     //with this instruction order
136     //bit mask 0b1_00001_00 could be used to test/set greediness
137     InfiniteStart      = 0b1_00001_01, //start of an infinite repetition x* (length)
138     InfiniteEnd        = 0b1_00001_10, //end of infinite repetition x* (length,mergeIndex)
139     InfiniteQStart     = 0b1_00010_01, //start of a non eager infinite repetition x*? (length)
140     InfiniteQEnd       = 0b1_00010_10, //end of non eager infinite repetition x*? (length,mergeIndex)
141     InfiniteBloomStart = 0b1_00011_01, //start of an filtered infinite repetition x* (length)
142     InfiniteBloomEnd   = 0b1_00011_10, //end of filtered infinite repetition x* (length,mergeIndex)
143     RepeatStart        = 0b1_00100_01, //start of a {n,m} repetition (length)
144     RepeatEnd          = 0b1_00100_10, //end of x{n,m} repetition (length,step,minRep,maxRep)
145     RepeatQStart       = 0b1_00101_01, //start of a non eager x{n,m}? repetition (length)
146     RepeatQEnd         = 0b1_00101_10, //end of non eager x{n,m}? repetition (length,step,minRep,maxRep)
147 
148     //
149     LookaheadStart     = 0b1_00110_01, //begin of the lookahead group (length)
150     LookaheadEnd       = 0b1_00110_10, //end of a lookahead group (length)
151     NeglookaheadStart  = 0b1_00111_01, //start of a negative lookahead (length)
152     NeglookaheadEnd    = 0b1_00111_10, //end of a negative lookahead (length)
153     LookbehindStart    = 0b1_01000_01, //start of a lookbehind (length)
154     LookbehindEnd      = 0b1_01000_10, //end of a lookbehind (length)
155     NeglookbehindStart = 0b1_01001_01, //start of a negative lookbehind (length)
156     NeglookbehindEnd   = 0b1_01001_10, //end of negative lookbehind (length)
157 }
158 
159 //a shorthand for IR length - full length of specific opcode evaluated at compile time
160 template IRL(IR code)
161 {
162     enum uint IRL =  lengthOfIR(code);
163 }
164 static assert(IRL!(IR.LookaheadStart) == 3);
165 
166 //how many parameters follow the IR, should be optimized fixing some IR bits
167 int immediateParamsIR(IR i){
168     switch (i)
169     {
170     case IR.OrEnd,IR.InfiniteEnd,IR.InfiniteQEnd:
171         return 1;  // merge table index
172     case IR.InfiniteBloomEnd:
173         return 2;  // bloom filter index + merge table index
174     case IR.RepeatEnd, IR.RepeatQEnd:
175         return 4;
176     case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
177         return 2;  // start-end of captures used
178     default:
179         return 0;
180     }
181 }
182 
183 //full length of IR instruction inlcuding all parameters that might follow it
184 int lengthOfIR(IR i)
185 {
186     return 1 + immediateParamsIR(i);
187 }
188 
189 //full length of the paired IR instruction inlcuding all parameters that might follow it
190 int lengthOfPairedIR(IR i)
191 {
192     return 1 + immediateParamsIR(pairedIR(i));
193 }
194 
195 //if the operation has a merge point (this relies on the order of the ops)
196 bool hasMerge(IR i)
197 {
198     return (i&0b11)==0b10 && i <= IR.RepeatQEnd;
199 }
200 
201 //is an IR that opens a "group"
202 bool isStartIR(IR i)
203 {
204     return (i&0b11)==0b01;
205 }
206 
207 //is an IR that ends a "group"
208 bool isEndIR(IR i)
209 {
210     return (i&0b11)==0b10;
211 }
212 
213 //is a standalone IR
214 bool isAtomIR(IR i)
215 {
216     return (i&0b11)==0b00;
217 }
218 
219 //makes respective pair out of IR i, swapping start/end bits of instruction
220 IR pairedIR(IR i)
221 {
222     assert(isStartIR(i) || isEndIR(i));
223     return cast(IR) (i ^ 0b11);
224 }
225 
226 //encoded IR instruction
227 struct Bytecode
228 {
229     uint raw;
230     //natural constraints
231     enum maxSequence = 2+4;
232     enum maxData = 1 << 22;
233     enum maxRaw = 1 << 31;
234 
235     this(IR code, uint data)
236     {
237         assert(data < (1 << 22) && code < 256);
238         raw = code << 24 | data;
239     }
240 
241     this(IR code, uint data, uint seq)
242     {
243         assert(data < (1 << 22) && code < 256 );
244         assert(seq >= 2 && seq < maxSequence);
245         raw = code << 24 | (seq - 2)<<22 | data;
246     }
247 
248     //store raw data
249     static Bytecode fromRaw(uint data)
250     {
251         Bytecode t;
252         t.raw = data;
253         return t;
254     }
255 
256     // bit twiddling helpers
257     // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
258     @property uint data()() const { return raw & 0x003f_ffff; }
259 
260     @property void data()(uint val)
261     {
262         raw = (raw & ~0x003f_ffff) | (val & 0x003f_ffff);
263     }
264 
265     // ditto
266     // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
267     @property uint sequence()() const { return 2 + (raw >> 22 & 0x3); }
268 
269     // ditto
270     // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
271     @property IR code()() const { return cast(IR)(raw >> 24); }
272 
273     //ditto
274     @property bool hotspot() const { return hasMerge(code); }
275 
276     //test the class of this instruction
277     @property bool isAtom() const { return isAtomIR(code); }
278 
279     //ditto
280     @property bool isStart() const { return isStartIR(code); }
281 
282     //ditto
283     @property bool isEnd() const { return isEndIR(code); }
284 
285     //number of arguments for this instruction
286     @property int args() const { return immediateParamsIR(code); }
287 
288     //mark this GroupStart or GroupEnd as referenced in backreference
289     void setBackrefence()
290     {
291         assert(code == IR.GroupStart || code == IR.GroupEnd);
292         raw = raw | 1 << 23;
293     }
294 
295     //is referenced
296     @property bool backreference() const
297     {
298         assert(code == IR.GroupStart || code == IR.GroupEnd);
299         return cast(bool)(raw & 1 << 23);
300     }
301 
302     //mark as local reference (for backrefs in lookarounds)
303     void setLocalRef()
304     {
305         assert(code == IR.Backref);
306         raw = raw | 1 << 23;
307     }
308 
309     //is a local ref
310     @property bool localRef() const
311     {
312         assert(code == IR.Backref);
313         return cast(bool)(raw & 1 << 23);
314     }
315 
316     //human readable name of instruction
317     @trusted @property string mnemonic()() const
318     {//@@@BUG@@@ to is @system
319         import std.conv : to;
320         return to!string(code);
321     }
322 
323     //full length of instruction
324     @property uint length() const
325     {
326         return lengthOfIR(code);
327     }
328 
329     //full length of respective start/end of this instruction
330     @property uint pairedLength() const
331     {
332         return lengthOfPairedIR(code);
333     }
334 
335     //returns bytecode of paired instruction (assuming this one is start or end)
336     @property Bytecode paired() const
337     {//depends on bit and struct layout order
338         assert(isStart || isEnd);
339         return Bytecode.fromRaw(raw ^ 0b11 << 24);
340     }
341 
342     //gets an index into IR block of the respective pair
343     uint indexOfPair(uint pc) const
344     {
345         assert(isStart || isEnd);
346         return isStart ? pc + data + length  : pc - data - lengthOfPairedIR(code);
347     }
348 }
349 
350 static assert(Bytecode.sizeof == 4);
351 
352 
353 //index entry structure for name --> number of submatch
354 struct NamedGroup
355 {
356     string name;
357     uint group;
358 }
359 
360 //holds pair of start-end markers for a submatch
361 struct Group(DataIndex)
362 {
363     DataIndex begin = DataIndex.max;
364     DataIndex end   = DataIndex.min;
365 
366     bool opCast(T : bool)() const
367     {
368         return begin <= end;
369     }
370 
371     @trusted string toString()() const
372     {
373         if (begin < end)
374             return "(unmatched)";
375         import std.array : appender;
376         import std.format : formattedWrite;
377         auto a = appender!string();
378         formattedWrite(a, "%s..%s", begin, end);
379         return a.data;
380     }
381 }
382 
383 //debugging tool, prints out instruction along with opcodes
384 @trusted string disassemble(in Bytecode[] irb, uint pc, in NamedGroup[] dict=[])
385 {
386     import std.array : appender;
387     import std.format : formattedWrite;
388     auto output = appender!string();
389     formattedWrite(output,"%s", irb[pc].mnemonic);
390     switch (irb[pc].code)
391     {
392     case IR.Char:
393         formattedWrite(output, " %s (0x%x)",cast(dchar) irb[pc].data, irb[pc].data);
394         break;
395     case IR.OrChar:
396         formattedWrite(output, " %s (0x%x) seq=%d", cast(dchar) irb[pc].data, irb[pc].data, irb[pc].sequence);
397         break;
398     case IR.RepeatStart, IR.InfiniteStart, IR.InfiniteBloomStart,
399     IR.Option, IR.GotoEndOr, IR.OrStart:
400         //forward-jump instructions
401         uint len = irb[pc].data;
402         formattedWrite(output, " pc=>%u", pc+len+IRL!(IR.RepeatStart));
403         break;
404     case IR.RepeatEnd, IR.RepeatQEnd: //backward-jump instructions
405         uint len = irb[pc].data;
406         formattedWrite(output, " pc=>%u min=%u max=%u step=%u",
407             pc - len, irb[pc + 3].raw, irb[pc + 4].raw, irb[pc + 2].raw);
408         break;
409     case IR.InfiniteEnd, IR.InfiniteQEnd, IR.InfiniteBloomEnd, IR.OrEnd: //ditto
410         uint len = irb[pc].data;
411         formattedWrite(output, " pc=>%u", pc-len);
412         break;
413     case  IR.LookaheadEnd, IR.NeglookaheadEnd: //ditto
414         uint len = irb[pc].data;
415         formattedWrite(output, " pc=>%u", pc-len);
416         break;
417     case IR.GroupStart, IR.GroupEnd:
418         uint n = irb[pc].data;
419         string name;
420         foreach (v;dict)
421             if (v.group == n)
422             {
423                 name = "'"~v.name~"'";
424                 break;
425             }
426         formattedWrite(output, " %s #%u " ~ (irb[pc].backreference ? "referenced" : ""),
427                 name, n);
428         break;
429     case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
430         uint len = irb[pc].data;
431         uint start = irb[pc+1].raw, end = irb[pc+2].raw;
432         formattedWrite(output, " pc=>%u [%u..%u]", pc + len + IRL!(IR.LookaheadStart), start, end);
433         break;
434     case IR.Backref: case IR.CodepointSet: case IR.Trie:
435         uint n = irb[pc].data;
436         formattedWrite(output, " %u",  n);
437         if (irb[pc].code == IR.Backref)
438             formattedWrite(output, " %s", irb[pc].localRef ? "local" : "global");
439         break;
440     default://all data-free instructions
441     }
442     if (irb[pc].hotspot)
443         formattedWrite(output, " Hotspot %u", irb[pc+1].raw);
444     return output.data;
445 }
446 
447 //disassemble the whole chunk
448 @trusted void printBytecode()(in Bytecode[] slice, in NamedGroup[] dict=[])
449 {
450     import std.stdio : writeln;
451     for (uint pc=0; pc<slice.length; pc += slice[pc].length)
452         writeln("\t", disassemble(slice, pc, dict));
453 }
454 
455 // Encapsulates memory management, explicit ref counting
456 // and the exact type of engine created
457 // there is a single instance per engine combination type x Char
458 // In future may also maintain a (TLS?) cache of memory
459 interface MatcherFactory(Char)
460 {
461 @safe:
462     Matcher!Char create(const ref Regex!Char, in Char[] input) const;
463     Matcher!Char dup(Matcher!Char m, in Char[] input) const;
464     size_t incRef(Matcher!Char m) const;
465     size_t decRef(Matcher!Char m) const;
466 }
467 
468 // Only memory management, no compile-time vs run-time specialities
469 abstract class GenericFactory(alias EngineType, Char) : MatcherFactory!Char
470 {
471     import core.stdc.stdlib : malloc, free;
472     import core.memory : GC;
473     // round up to next multiple of size_t for alignment purposes
474     enum classSize = (__traits(classInstanceSize, EngineType!Char) + size_t.sizeof - 1) & ~(size_t.sizeof - 1);
475 
476     EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const;
477 
478     override EngineType!Char create(const ref Regex!Char re, in Char[] input) const @trusted
479     {
480         immutable size = EngineType!Char.initialMemory(re) + classSize;
481         auto memory = enforce(malloc(size), "malloc failed")[0 .. size];
482         scope(failure) free(memory.ptr);
483         GC.addRange(memory.ptr, classSize);
484         auto engine = construct(re, input, memory);
485         assert(engine.refCount == 1);
486         assert(cast(void*) engine == memory.ptr);
487         return engine;
488     }
489 
490     override EngineType!Char dup(Matcher!Char engine, in Char[] input) const @trusted
491     {
492         immutable size = EngineType!Char.initialMemory(engine.pattern) + classSize;
493         auto memory = enforce(malloc(size), "malloc failed")[0 .. size];
494         scope(failure) free(memory.ptr);
495         auto copy = construct(engine.pattern, input, memory);
496         GC.addRange(memory.ptr, classSize);
497         engine.dupTo(copy, memory[classSize .. size]);
498         assert(copy.refCount == 1);
499         return copy;
500     }
501 
502     override size_t incRef(Matcher!Char m) const
503     {
504         return ++m.refCount;
505     }
506 
507     override size_t decRef(Matcher!Char m) const  @trusted
508     {
509         assert(m.refCount != 0);
510         auto cnt = --m.refCount;
511         if (cnt == 0)
512         {
513             void* ptr = cast(void*) m;
514             GC.removeRange(ptr);
515             free(ptr);
516         }
517         return cnt;
518     }
519 }
520 
521 // A factory for run-time engines
522 class RuntimeFactory(alias EngineType, Char) : GenericFactory!(EngineType, Char)
523 {
524     override EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const
525     {
526         import std.conv : emplace;
527         return emplace!(EngineType!Char)(memory[0 .. classSize],
528             re, Input!Char(input), memory[classSize .. $]);
529     }
530 }
531 
532 // A factory for compile-time engine
533 class CtfeFactory(alias EngineType, Char, alias func) : GenericFactory!(EngineType, Char)
534 {
535     override EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const
536     {
537         import std.conv : emplace;
538         return emplace!(EngineType!Char)(memory[0 .. classSize],
539             re, &func, Input!Char(input), memory[classSize .. $]);
540     }
541 }
542 
543 // A workaround for R-T enum re = regex(...)
544 template defaultFactory(Char)
545 {
546     @property MatcherFactory!Char defaultFactory(const ref Regex!Char re) @safe
547     {
548         import std.regex.internal.backtracking : BacktrackingMatcher;
549         import std.regex.internal.thompson : ThompsonMatcher;
550         import std.algorithm.searching : canFind;
551         static MatcherFactory!Char backtrackingFactory;
552         static MatcherFactory!Char thompsonFactory;
553         if (re.backrefed.canFind!"a != 0")
554         {
555             if (backtrackingFactory is null)
556                 backtrackingFactory = new RuntimeFactory!(BacktrackingMatcher, Char);
557             return backtrackingFactory;
558         }
559         else
560         {
561             if (thompsonFactory is null)
562                 thompsonFactory = new RuntimeFactory!(ThompsonMatcher, Char);
563             return thompsonFactory;
564         }
565     }
566 }
567 
568 // Defining it as an interface has the undesired side-effect:
569 // casting any class to an interface silently adjusts pointer to point to a nested vtbl
570 abstract class Matcher(Char)
571 {
572 abstract:
573     // Get a (next) match
574     int match(Group!size_t[] matches);
575     // This only maintains internal ref-count,
576     // deallocation happens inside MatcherFactory
577     @property ref size_t refCount() @safe;
578     // Copy internal state to another engine, using memory arena 'memory'
579     void dupTo(Matcher!Char m, void[] memory);
580     // The pattern loaded
581     @property ref const(Regex!Char) pattern() @safe;
582     // Re-arm the engine with new Input
583     Matcher rearm(in Char[] stream);
584 }
585 
586 /++
587     `Regex` object holds regular expression pattern in compiled form.
588     Instances of this object are constructed via calls to `regex`.
589     This is an intended form for caching and storage of frequently
590     used regular expressions.
591 +/
592 struct Regex(Char)
593 {
594     //temporary workaround for identifier lookup
595     CodepointSet[] charsets; //
596     Bytecode[] ir;      //compiled bytecode of pattern
597 
598 
599     @safe @property bool empty() const nothrow {  return ir is null; }
600 
601     @safe @property auto namedCaptures()
602     {
603         static struct NamedGroupRange
604         {
605         private:
606             const(NamedGroup)[] groups;
607             size_t start;
608             size_t end;
609         public:
610             this(const(NamedGroup)[] g, size_t s, size_t e)
611             {
612                 assert(s <= e);
613                 assert(e <= g.length);
614                 groups = g;
615                 start = s;
616                 end = e;
617             }
618 
619             @property string front() { return groups[start].name; }
620             @property string back() { return groups[end-1].name; }
621             @property bool empty() { return start >= end; }
622             @property size_t length() { return end - start; }
623             alias opDollar = length;
624             @property NamedGroupRange save()
625             {
626                 return NamedGroupRange(groups, start, end);
627             }
628             void popFront() { assert(!empty); start++; }
629             void popBack() { assert(!empty); end--; }
630             string opIndex()(size_t i)
631             {
632                 assert(start + i < end,
633                        "Requested named group is out of range.");
634                 return groups[start+i].name;
635             }
636             NamedGroupRange opSlice(size_t low, size_t high) {
637                 assert(low <= high);
638                 assert(start + high <= end);
639                 return NamedGroupRange(groups, start + low, start + high);
640             }
641             NamedGroupRange opSlice() { return this.save; }
642         }
643         return NamedGroupRange(dict, 0, dict.length);
644     }
645 
646 package(std.regex):
647     import std.regex.internal.kickstart : Kickstart; //TODO: get rid of this dependency
648     const(NamedGroup)[] dict;              // maps name -> user group number
649     uint ngroup;                           // number of internal groups
650     uint maxCounterDepth;                  // max depth of nested {n,m} repetitions
651     uint hotspotTableSize;                 // number of entries in merge table
652     uint threadCount;                      // upper bound on number of Thompson VM threads
653     uint flags;                            // global regex flags
654     public const(CharMatcher)[]  matchers; // tables that represent character sets
655     public const(BitTable)[] filters;      // bloom filters for conditional loops
656     uint[] backrefed;                      // bit array of backreferenced submatches
657     Kickstart!Char kickstart;
658     MatcherFactory!Char factory;           // produces optimal matcher for this pattern
659     immutable(Char)[] pattern;             // copy of pattern to serve as cache key
660 
661     const(Regex) withFactory(MatcherFactory!Char factory) pure const @trusted
662     {
663         auto r = cast() this;
664         r.factory = factory;
665         return r;
666     }
667 
668     const(Regex) withFlags(uint newFlags) pure const @trusted
669     {
670         auto r = cast() this;
671         r.flags = newFlags;
672         return r;
673     }
674 
675     const(Regex) withCode(const(Bytecode)[] code) pure const @trusted
676     {
677         auto r = cast() this;
678         r.ir = code.dup; // TODO: sidestep const instead?
679         return r;
680     }
681 
682     const(Regex) withNGroup(uint nGroup) pure const @trusted
683     {
684         auto r = cast() this;
685         r.ngroup = nGroup;
686         return r;
687     }
688 
689     //bit access helper
690     uint isBackref(uint n)
691     {
692         if (n/32 >= backrefed.length)
693             return 0;
694         return backrefed[n / 32] & (1 << (n & 31));
695     }
696 
697     //check if searching is not needed
698     void checkIfOneShot()
699     {
700     L_CheckLoop:
701         for (uint i = 0; i < ir.length; i += ir[i].length)
702         {
703             switch (ir[i].code)
704             {
705                 case IR.Bof:
706                     flags |= RegexInfo.oneShot;
707                     break L_CheckLoop;
708                 case IR.GroupStart, IR.GroupEnd, IR.Bol, IR.Eol, IR.Eof,
709                 IR.Wordboundary, IR.Notwordboundary:
710                     break;
711                 default:
712                     break L_CheckLoop;
713             }
714         }
715     }
716 
717     //print out disassembly a program's IR
718     @trusted debug(std_regex_parser) void print() const
719     {//@@@BUG@@@ write is system
720         for (uint i = 0; i < ir.length; i += ir[i].length)
721         {
722             writefln("%d\t%s ", i, disassemble(ir, i, dict));
723         }
724         writeln("Total merge table size: ", hotspotTableSize);
725         writeln("Max counter nesting depth: ", maxCounterDepth);
726     }
727 
728     public string toString()() const
729     {
730         import std.format : format;
731         static if (is(typeof(pattern) : string))
732             alias patternString = pattern;
733         else
734         {
735             import std.conv : to;
736             auto patternString = conv.to!string(pattern);
737         }
738         auto quotedEscapedPattern = format("%(%s %)", [patternString]);
739         auto flagString = regexOptionsToString(flags);
740         return "Regex!" ~ Char.stringof ~ "(" ~ quotedEscapedPattern ~ ", \"" ~ flagString ~ "\")";
741     }
742 }
743 
744 // The stuff below this point is temporarrily part of IR module
745 // but may need better place in the future (all internals)
746 package(std.regex):
747 
748 //Simple UTF-string abstraction compatible with stream interface
749 struct Input(Char)
750 if (is(Char :dchar))
751 {
752     import std.utf : decode;
753     alias DataIndex = size_t;
754     enum bool isLoopback = false;
755     alias String = const(Char)[];
756     String _origin;
757     size_t _index;
758 
759     //constructs Input object out of plain string
760     this(String input, size_t idx = 0)
761     {
762         _origin = input;
763         _index = idx;
764     }
765 
766     //codepoint at current stream position
767     pragma(inline, true) bool nextChar(ref dchar res, ref size_t pos)
768     {
769         pos = _index;
770         // DMD's inliner hates multiple return functions
771         // but can live with single statement if/else bodies
772         bool n = !(_index == _origin.length);
773         if (n)
774             res = decode(_origin, _index);
775         return n;
776     }
777     @property bool atEnd(){
778         return _index == _origin.length;
779     }
780     bool search(Kickstart)(ref const Kickstart kick, ref dchar res, ref size_t pos)
781     {
782         size_t idx = kick.search(_origin, _index);
783         _index = idx;
784         return nextChar(res, pos);
785     }
786 
787     //index of at End position
788     @property size_t lastIndex(){   return _origin.length; }
789 
790     //support for backtracker engine, might not be present
791     void reset(size_t index){   _index = index;  }
792 
793     String opSlice(size_t start, size_t end){   return _origin[start .. end]; }
794 
795     auto loopBack(size_t index){   return BackLooper!Input(this, index); }
796 }
797 
798 struct BackLooperImpl(Input)
799 {
800     import std.utf : strideBack;
801     alias DataIndex = size_t;
802     alias String = Input.String;
803     enum bool isLoopback = true;
804     String _origin;
805     size_t _index;
806     this(Input input, size_t index)
807     {
808         _origin = input._origin;
809         _index = index;
810     }
811     this(String input)
812     {
813         _origin = input;
814         _index = input.length;
815     }
816     @trusted bool nextChar(ref dchar res,ref size_t pos)
817     {
818         pos = _index;
819         if (_index == 0)
820             return false;
821 
822         res = _origin[0.._index].back;
823         _index -= strideBack(_origin, _index);
824 
825         return true;
826     }
827     @property atEnd(){ return _index == 0 || _index == strideBack(_origin, _index); }
828     auto loopBack(size_t index){   return Input(_origin, index); }
829 
830     //support for backtracker engine, might not be present
831     //void reset(size_t index){   _index = index ? index-std.utf.strideBack(_origin, index) : 0;  }
832     void reset(size_t index){   _index = index;  }
833 
834     String opSlice(size_t start, size_t end){   return _origin[end .. start]; }
835     //index of at End position
836     @property size_t lastIndex(){   return 0; }
837 }
838 
839 template BackLooper(E)
840 {
841     static if (is(E : BackLooperImpl!U, U))
842     {
843         alias BackLooper = U;
844     }
845     else
846     {
847         alias BackLooper = BackLooperImpl!E;
848     }
849 }
850 
851 //both helpers below are internal, on its own are quite "explosive"
852 //unsafe, no initialization of elements
853 @system T[] mallocArray(T)(size_t len)
854 {
855     import core.stdc.stdlib : malloc;
856     return (cast(T*) malloc(len * T.sizeof))[0 .. len];
857 }
858 
859 //very unsafe, no initialization
860 @system T[] arrayInChunk(T)(size_t len, ref void[] chunk)
861 {
862     auto ret = (cast(T*) chunk.ptr)[0 .. len];
863     chunk = chunk[len * T.sizeof .. $];
864     return ret;
865 }
866 
867 //
868 @trusted uint lookupNamedGroup(String)(const(NamedGroup)[] dict, String name)
869 {//equal is @system?
870     import std.algorithm.comparison : equal;
871     import std.algorithm.iteration : map;
872     import std.conv : text;
873     import std.range : assumeSorted;
874 
875     auto fnd = assumeSorted!"cmp(a,b) < 0"(map!"a.name"(dict)).lowerBound(name).length;
876     enforce(fnd < dict.length && equal(dict[fnd].name, name),
877         text("no submatch named ", name));
878     return dict[fnd].group;
879 }
880 
881 // whether ch is one of unicode newline sequences
882 // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
883 bool endOfLine()(dchar front, bool seenCr)
884 {
885     return ((front == '\n') ^ seenCr) || front == '\r'
886     || front == NEL || front == LS || front == PS;
887 }
888 
889 // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
890 bool startOfLine()(dchar back, bool seenNl)
891 {
892     return ((back == '\r') ^ seenNl) || back == '\n'
893     || back == NEL || back == LS || back == PS;
894 }
895 
896 ///Exception object thrown in case of errors during regex compilation.
897 public class RegexException : Exception
898 {
899     mixin basicExceptionCtors;
900 }
901 
902 // simple 128-entry bit-table used with a hash function
903 struct BitTable {
904     uint[4] filter;
905 
906     this(CodepointSet set){
907         foreach (iv; set.byInterval)
908         {
909             foreach (v; iv.a .. iv.b)
910                 add(v);
911         }
912     }
913 
914     void add()(dchar ch){
915         immutable i = index(ch);
916         filter[i >> 5]  |=  1<<(i & 31);
917     }
918     // non-zero -> might be present, 0 -> absent
919     bool opIndex()(dchar ch) const{
920         immutable i = index(ch);
921         return (filter[i >> 5]>>(i & 31)) & 1;
922     }
923 
924     static uint index()(dchar ch){
925         return ((ch >> 7) ^ ch) & 0x7F;
926     }
927 }
928 
929 struct CharMatcher {
930     BitTable ascii; // fast path for ASCII
931     Trie trie;      // slow path for Unicode
932 
933     this(CodepointSet set)
934     {
935         auto asciiSet = set & unicode.ASCII;
936         ascii = BitTable(asciiSet);
937         trie = makeTrie(set);
938     }
939 
940     bool opIndex()(dchar ch) const
941     {
942         if (ch < 0x80)
943             return ascii[ch];
944         else
945             return trie[ch];
946     }
947 }
948 
949 // Internal non-resizeble array, switches between inline storage and CoW
950 // POD-only
951 struct SmallFixedArray(T, uint SMALL=3)
952 if (!hasElaborateDestructor!T)
953 {
954     import core.stdc.stdlib : malloc, free;
955     static struct Payload
956     {
957         size_t refcount;
958         T[0] placeholder;
959         inout(T)* ptr() inout { return placeholder.ptr; }
960     }
961     static assert(Payload.sizeof == size_t.sizeof);
962     union
963     {
964         Payload* big;
965         T[SMALL] small;
966     }
967     size_t _sizeMask;
968     enum BIG_MASK = size_t(1)<<(8*size_t.sizeof-1);
969     enum SIZE_MASK = ~BIG_MASK;
970 
971     @property bool isBig() const { return (_sizeMask & BIG_MASK) != 0; }
972     @property size_t length() const { return _sizeMask & SIZE_MASK; }
973 
974     this(size_t size)
975     {
976         if (size <= SMALL)
977         {
978             small[] = T.init;
979             _sizeMask = size;
980         }
981         else
982         {
983             big = cast(Payload*) enforce(malloc(Payload.sizeof + T.sizeof*size), "Failed to malloc storage");
984             big.refcount = 1;
985             _sizeMask = size | BIG_MASK;
986         }
987     }
988 
989     private @trusted @property inout(T)[] internalSlice() inout
990     {
991         return isBig ? big.ptr[0 .. length] : small[0 .. length];
992     }
993 
994     this(this)
995     {
996         if (isBig)
997         {
998             big.refcount++;
999         }
1000     }
1001 
1002     bool opEquals(SmallFixedArray a)
1003     {
1004         return internalSlice[] == a.internalSlice[];
1005     }
1006 
1007     size_t toHash() const
1008     {
1009         return hashOf(internalSlice[]);
1010     }
1011 
1012     ref inout(T) opIndex(size_t idx) inout
1013     {
1014         return internalSlice[idx];
1015     }
1016 
1017     // accesses big to test self-referencing so not @safe
1018     @trusted ref opAssign(SmallFixedArray arr)
1019     {
1020         if (isBig)
1021         {
1022             if (arr.isBig)
1023             {
1024                 if (big is arr.big) return this; // self-assign
1025                 else
1026                 {
1027                     abandonRef();
1028                     _sizeMask = arr._sizeMask;
1029                     big = arr.big;
1030                     big.refcount++;
1031                 }
1032             }
1033             else
1034             {
1035                 abandonRef();
1036                 _sizeMask = arr._sizeMask;
1037                 small = arr.small;
1038             }
1039         }
1040         else
1041         {
1042             if (arr.isBig)
1043             {
1044                 _sizeMask = arr._sizeMask;
1045                 big = arr.big;
1046                 big.refcount++;
1047             }
1048             else
1049             {
1050                 _sizeMask = arr._sizeMask;
1051                 small = arr.small;
1052             }
1053         }
1054         return this;
1055     }
1056 
1057     void mutate(scope void delegate(T[]) filler)
1058     {
1059         if (isBig && big.refcount != 1) // copy on write
1060         {
1061             auto oldSizeMask = _sizeMask;
1062             auto newbig = cast(Payload*) enforce(malloc(Payload.sizeof + T.sizeof*length), "Failed to malloc storage");
1063             newbig.refcount = 1;
1064             abandonRef();
1065             big = newbig;
1066             _sizeMask = oldSizeMask;
1067         }
1068         filler(internalSlice);
1069     }
1070 
1071     ~this()
1072     {
1073         if (isBig)
1074         {
1075             abandonRef();
1076         }
1077     }
1078 
1079     @trusted private void abandonRef()
1080     {
1081         assert(isBig);
1082         if (--big.refcount == 0)
1083         {
1084             free(big);
1085             _sizeMask = 0;
1086             assert(!isBig);
1087         }
1088     }
1089 }
1090 
1091 @system unittest
1092 {
1093     alias SA = SmallFixedArray!(int, 2);
1094     SA create(int[] data)
1095     {
1096         SA a = SA(data.length);
1097         a.mutate((slice) { slice[] = data[]; });
1098         assert(a.internalSlice == data);
1099         return a;
1100     }
1101 
1102     {
1103         SA a;
1104         a = SA(1);
1105         assert(a.length == 1);
1106         a = SA.init;
1107         assert(a.length == 0);
1108     }
1109 
1110     {
1111         SA a, b, c, d;
1112         assert(a.length == 0);
1113         assert(a.internalSlice == b.internalSlice);
1114         a = create([1]);
1115         assert(a.internalSlice == [1]);
1116         b = create([2, 3]);
1117         assert(b.internalSlice == [2, 3]);
1118         c = create([3, 4, 5]);
1119         d = create([5, 6, 7, 8]);
1120         assert(c.isBig);
1121         a = c;
1122         assert(a.isBig);
1123         assert(a.big is c.big);
1124         assert(a.big.refcount == 2);
1125         assert(a.internalSlice == [3, 4, 5]);
1126         assert(c.internalSlice == [3, 4, 5]);
1127         a = b;
1128         assert(!a.isBig);
1129         assert(a.internalSlice == [2, 3]);
1130         assert(c.big.refcount == 1);
1131         a = c;
1132         assert(c.big.refcount == 2);
1133 
1134         // mutate copies on write if ref-count is not 1
1135         a.mutate((slice){ slice[] = 1; });
1136         assert(a.internalSlice == [1, 1, 1]);
1137         assert(c.internalSlice == [3, 4, 5]);
1138         assert(a.isBig && c.isBig);
1139         assert(a.big.refcount == 1);
1140         assert(c.big.refcount == 1);
1141 
1142         auto e = d;
1143         assert(e.big.refcount == 2);
1144         auto f = d;
1145         f = a;
1146         assert(f.isBig);
1147         assert(f.internalSlice == [1, 1, 1]);
1148         assert(f.big.refcount == 2); // a & f
1149         assert(e.big.refcount == 2); // d & e
1150         a = c;
1151         assert(f.big.refcount == 1); // f
1152         assert(e.big.refcount == 2); // d & e
1153         a = a;
1154         a = a;
1155         a = a;
1156         assert(a.big.refcount == 2); // a & c
1157     }
1158 }
Suggestion Box / Bug Report