1 /* 2 Implementation of std.regex IR, an intermediate representation 3 of a regular expression pattern. 4 5 This is a common ground between frontend regex component (parser) 6 and backend components - generators, matchers and other "filters". 7 */ 8 module std.regex.internal.ir; 9 10 package(std.regex): 11 12 import std.exception, std.meta, std.range.primitives, std.traits, std.uni; 13 14 debug(std_regex_parser) import std.stdio; 15 // just a common trait, may be moved elsewhere 16 alias BasicElementOf(Range) = Unqual!(ElementEncodingType!Range); 17 18 enum privateUseStart = '\U000F0000', privateUseEnd ='\U000FFFFD'; 19 20 // heuristic value determines maximum CodepointSet length suitable for linear search 21 enum maxCharsetUsed = 6; 22 23 // another variable to tweak behavior of caching generated Tries for character classes 24 enum maxCachedMatchers = 8; 25 26 alias Trie = CodepointSetTrie!(13, 8); 27 alias makeTrie = codepointSetTrie!(13, 8); 28 29 CharMatcher[CodepointSet] matcherCache; 30 31 //accessor with caching 32 @trusted CharMatcher getMatcher(CodepointSet set) 33 { 34 // almost all properties of AA are not @safe 35 // https://issues.dlang.org/show_bug.cgi?id=6357 36 if (__ctfe || maxCachedMatchers == 0) 37 return CharMatcher(set); 38 else 39 { 40 auto p = set in matcherCache; 41 if (p) 42 return *p; 43 if (matcherCache.length == maxCachedMatchers) 44 { 45 // flush enmatchers in trieCache 46 matcherCache = null; 47 } 48 return (matcherCache[set] = CharMatcher(set)); 49 } 50 } 51 52 @property ref wordMatcher()() 53 { 54 static CharMatcher matcher = CharMatcher(wordCharacter); 55 return matcher; 56 } 57 58 // some special Unicode white space characters 59 private enum NEL = '\u0085', LS = '\u2028', PS = '\u2029'; 60 61 //Regular expression engine/parser options: 62 // global - search all nonoverlapping matches in input 63 // casefold - case insensitive matching, do casefolding on match in unicode mode 64 // freeform - ignore whitespace in pattern, to match space use [ ] or \s 65 // multiline - switch ^, $ detect start and end of linesinstead of just start and end of input 66 enum RegexOption: uint { 67 global = 0x1, 68 casefold = 0x2, 69 freeform = 0x4, 70 nonunicode = 0x8, 71 multiline = 0x10, 72 singleline = 0x20 73 } 74 //do not reorder this list 75 alias RegexOptionNames = AliasSeq!('g', 'i', 'x', 'U', 'm', 's'); 76 static assert( RegexOption.max < 0x80); 77 78 package(std) string regexOptionsToString()(uint flags) nothrow pure @safe 79 { 80 flags &= (RegexOption.max << 1) - 1; 81 if (!flags) 82 return ""; 83 char[RegexOptionNames.length] buffer = void; 84 size_t pos = 0; 85 foreach (i, flag; __traits(allMembers, RegexOption)) 86 if (flags & __traits(getMember, RegexOption, flag)) 87 buffer[pos++] = RegexOptionNames[i]; 88 return buffer[0 .. pos].idup; 89 } 90 91 // flags that allow guide execution of engine 92 enum RegexInfo : uint { oneShot = 0x80 } 93 94 // IR bit pattern: 0b1_xxxxx_yy 95 // where yy indicates class of instruction, xxxxx for actual operation code 96 // 00: atom, a normal instruction 97 // 01: open, opening of a group, has length of contained IR in the low bits 98 // 10: close, closing of a group, has length of contained IR in the low bits 99 // 11 unused 100 // 101 // Loops with Q (non-greedy, with ? mark) must have the same size / other properties as non Q version 102 // Possible changes: 103 //* merge group, option, infinite/repeat start (to never copy during parsing of (a|b){1,2}) 104 //* reorganize groups to make n args easier to find, or simplify the check for groups of similar ops 105 // (like lookaround), or make it easier to identify hotspots. 106 107 enum IR:uint { 108 Char = 0b1_00000_00, //a character 109 Any = 0b1_00001_00, //any character 110 CodepointSet = 0b1_00010_00, //a most generic CodepointSet [...] 111 Trie = 0b1_00011_00, //CodepointSet implemented as Trie 112 //match with any of a consecutive OrChar's in this sequence 113 //(used for case insensitive match) 114 //OrChar holds in upper two bits of data total number of OrChars in this _sequence_ 115 //the drawback of this representation is that it is difficult 116 // to detect a jump in the middle of it 117 OrChar = 0b1_00100_00, 118 Nop = 0b1_00101_00, //no operation (padding) 119 End = 0b1_00110_00, //end of program 120 Bol = 0b1_00111_00, //beginning of a line ^ 121 Eol = 0b1_01000_00, //end of a line $ 122 Wordboundary = 0b1_01001_00, //boundary of a word 123 Notwordboundary = 0b1_01010_00, //not a word boundary 124 Backref = 0b1_01011_00, //backreference to a group (that has to be pinned, i.e. locally unique) (group index) 125 GroupStart = 0b1_01100_00, //start of a group (x) (groupIndex+groupPinning(1bit)) 126 GroupEnd = 0b1_01101_00, //end of a group (x) (groupIndex+groupPinning(1bit)) 127 Option = 0b1_01110_00, //start of an option within an alternation x | y (length) 128 GotoEndOr = 0b1_01111_00, //end of an option (length of the rest) 129 Bof = 0b1_10000_00, //begining of "file" (string) ^ 130 Eof = 0b1_10001_00, //end of "file" (string) $ 131 //... any additional atoms here 132 133 OrStart = 0b1_00000_01, //start of alternation group (length) 134 OrEnd = 0b1_00000_10, //end of the or group (length,mergeIndex) 135 //with this instruction order 136 //bit mask 0b1_00001_00 could be used to test/set greediness 137 InfiniteStart = 0b1_00001_01, //start of an infinite repetition x* (length) 138 InfiniteEnd = 0b1_00001_10, //end of infinite repetition x* (length,mergeIndex) 139 InfiniteQStart = 0b1_00010_01, //start of a non eager infinite repetition x*? (length) 140 InfiniteQEnd = 0b1_00010_10, //end of non eager infinite repetition x*? (length,mergeIndex) 141 InfiniteBloomStart = 0b1_00011_01, //start of an filtered infinite repetition x* (length) 142 InfiniteBloomEnd = 0b1_00011_10, //end of filtered infinite repetition x* (length,mergeIndex) 143 RepeatStart = 0b1_00100_01, //start of a {n,m} repetition (length) 144 RepeatEnd = 0b1_00100_10, //end of x{n,m} repetition (length,step,minRep,maxRep) 145 RepeatQStart = 0b1_00101_01, //start of a non eager x{n,m}? repetition (length) 146 RepeatQEnd = 0b1_00101_10, //end of non eager x{n,m}? repetition (length,step,minRep,maxRep) 147 148 // 149 LookaheadStart = 0b1_00110_01, //begin of the lookahead group (length) 150 LookaheadEnd = 0b1_00110_10, //end of a lookahead group (length) 151 NeglookaheadStart = 0b1_00111_01, //start of a negative lookahead (length) 152 NeglookaheadEnd = 0b1_00111_10, //end of a negative lookahead (length) 153 LookbehindStart = 0b1_01000_01, //start of a lookbehind (length) 154 LookbehindEnd = 0b1_01000_10, //end of a lookbehind (length) 155 NeglookbehindStart = 0b1_01001_01, //start of a negative lookbehind (length) 156 NeglookbehindEnd = 0b1_01001_10, //end of negative lookbehind (length) 157 } 158 159 //a shorthand for IR length - full length of specific opcode evaluated at compile time 160 template IRL(IR code) 161 { 162 enum uint IRL = lengthOfIR(code); 163 } 164 static assert(IRL!(IR.LookaheadStart) == 3); 165 166 //how many parameters follow the IR, should be optimized fixing some IR bits 167 int immediateParamsIR(IR i){ 168 switch (i) 169 { 170 case IR.OrEnd,IR.InfiniteEnd,IR.InfiniteQEnd: 171 return 1; // merge table index 172 case IR.InfiniteBloomEnd: 173 return 2; // bloom filter index + merge table index 174 case IR.RepeatEnd, IR.RepeatQEnd: 175 return 4; 176 case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart: 177 return 2; // start-end of captures used 178 default: 179 return 0; 180 } 181 } 182 183 //full length of IR instruction inlcuding all parameters that might follow it 184 int lengthOfIR(IR i) 185 { 186 return 1 + immediateParamsIR(i); 187 } 188 189 //full length of the paired IR instruction inlcuding all parameters that might follow it 190 int lengthOfPairedIR(IR i) 191 { 192 return 1 + immediateParamsIR(pairedIR(i)); 193 } 194 195 //if the operation has a merge point (this relies on the order of the ops) 196 bool hasMerge(IR i) 197 { 198 return (i&0b11)==0b10 && i <= IR.RepeatQEnd; 199 } 200 201 //is an IR that opens a "group" 202 bool isStartIR(IR i) 203 { 204 return (i&0b11)==0b01; 205 } 206 207 //is an IR that ends a "group" 208 bool isEndIR(IR i) 209 { 210 return (i&0b11)==0b10; 211 } 212 213 //is a standalone IR 214 bool isAtomIR(IR i) 215 { 216 return (i&0b11)==0b00; 217 } 218 219 //makes respective pair out of IR i, swapping start/end bits of instruction 220 IR pairedIR(IR i) 221 { 222 assert(isStartIR(i) || isEndIR(i)); 223 return cast(IR) (i ^ 0b11); 224 } 225 226 //encoded IR instruction 227 struct Bytecode 228 { 229 uint raw; 230 //natural constraints 231 enum maxSequence = 2+4; 232 enum maxData = 1 << 22; 233 enum maxRaw = 1 << 31; 234 235 this(IR code, uint data) 236 { 237 assert(data < (1 << 22) && code < 256); 238 raw = code << 24 | data; 239 } 240 241 this(IR code, uint data, uint seq) 242 { 243 assert(data < (1 << 22) && code < 256 ); 244 assert(seq >= 2 && seq < maxSequence); 245 raw = code << 24 | (seq - 2)<<22 | data; 246 } 247 248 //store raw data 249 static Bytecode fromRaw(uint data) 250 { 251 Bytecode t; 252 t.raw = data; 253 return t; 254 } 255 256 // bit twiddling helpers 257 // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985 258 @property uint data()() const { return raw & 0x003f_ffff; } 259 260 @property void data()(uint val) 261 { 262 raw = (raw & ~0x003f_ffff) | (val & 0x003f_ffff); 263 } 264 265 // ditto 266 // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985 267 @property uint sequence()() const { return 2 + (raw >> 22 & 0x3); } 268 269 // ditto 270 // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985 271 @property IR code()() const { return cast(IR)(raw >> 24); } 272 273 //ditto 274 @property bool hotspot() const { return hasMerge(code); } 275 276 //test the class of this instruction 277 @property bool isAtom() const { return isAtomIR(code); } 278 279 //ditto 280 @property bool isStart() const { return isStartIR(code); } 281 282 //ditto 283 @property bool isEnd() const { return isEndIR(code); } 284 285 //number of arguments for this instruction 286 @property int args() const { return immediateParamsIR(code); } 287 288 //mark this GroupStart or GroupEnd as referenced in backreference 289 void setBackrefence() 290 { 291 assert(code == IR.GroupStart || code == IR.GroupEnd); 292 raw = raw | 1 << 23; 293 } 294 295 //is referenced 296 @property bool backreference() const 297 { 298 assert(code == IR.GroupStart || code == IR.GroupEnd); 299 return cast(bool)(raw & 1 << 23); 300 } 301 302 //mark as local reference (for backrefs in lookarounds) 303 void setLocalRef() 304 { 305 assert(code == IR.Backref); 306 raw = raw | 1 << 23; 307 } 308 309 //is a local ref 310 @property bool localRef() const 311 { 312 assert(code == IR.Backref); 313 return cast(bool)(raw & 1 << 23); 314 } 315 316 //human readable name of instruction 317 @trusted @property string mnemonic()() const 318 {//@@@BUG@@@ to is @system 319 import std.conv : to; 320 return to!string(code); 321 } 322 323 //full length of instruction 324 @property uint length() const 325 { 326 return lengthOfIR(code); 327 } 328 329 //full length of respective start/end of this instruction 330 @property uint pairedLength() const 331 { 332 return lengthOfPairedIR(code); 333 } 334 335 //returns bytecode of paired instruction (assuming this one is start or end) 336 @property Bytecode paired() const 337 {//depends on bit and struct layout order 338 assert(isStart || isEnd); 339 return Bytecode.fromRaw(raw ^ 0b11 << 24); 340 } 341 342 //gets an index into IR block of the respective pair 343 uint indexOfPair(uint pc) const 344 { 345 assert(isStart || isEnd); 346 return isStart ? pc + data + length : pc - data - lengthOfPairedIR(code); 347 } 348 } 349 350 static assert(Bytecode.sizeof == 4); 351 352 353 //index entry structure for name --> number of submatch 354 struct NamedGroup 355 { 356 string name; 357 uint group; 358 } 359 360 //holds pair of start-end markers for a submatch 361 struct Group(DataIndex) 362 { 363 DataIndex begin = DataIndex.max; 364 DataIndex end = DataIndex.min; 365 366 bool opCast(T : bool)() const 367 { 368 return begin <= end; 369 } 370 371 @trusted string toString()() const 372 { 373 if (begin < end) 374 return "(unmatched)"; 375 import std.array : appender; 376 import std.format : formattedWrite; 377 auto a = appender!string(); 378 formattedWrite(a, "%s..%s", begin, end); 379 return a.data; 380 } 381 } 382 383 //debugging tool, prints out instruction along with opcodes 384 @trusted string disassemble(in Bytecode[] irb, uint pc, in NamedGroup[] dict=[]) 385 { 386 import std.array : appender; 387 import std.format : formattedWrite; 388 auto output = appender!string(); 389 formattedWrite(output,"%s", irb[pc].mnemonic); 390 switch (irb[pc].code) 391 { 392 case IR.Char: 393 formattedWrite(output, " %s (0x%x)",cast(dchar) irb[pc].data, irb[pc].data); 394 break; 395 case IR.OrChar: 396 formattedWrite(output, " %s (0x%x) seq=%d", cast(dchar) irb[pc].data, irb[pc].data, irb[pc].sequence); 397 break; 398 case IR.RepeatStart, IR.InfiniteStart, IR.InfiniteBloomStart, 399 IR.Option, IR.GotoEndOr, IR.OrStart: 400 //forward-jump instructions 401 uint len = irb[pc].data; 402 formattedWrite(output, " pc=>%u", pc+len+IRL!(IR.RepeatStart)); 403 break; 404 case IR.RepeatEnd, IR.RepeatQEnd: //backward-jump instructions 405 uint len = irb[pc].data; 406 formattedWrite(output, " pc=>%u min=%u max=%u step=%u", 407 pc - len, irb[pc + 3].raw, irb[pc + 4].raw, irb[pc + 2].raw); 408 break; 409 case IR.InfiniteEnd, IR.InfiniteQEnd, IR.InfiniteBloomEnd, IR.OrEnd: //ditto 410 uint len = irb[pc].data; 411 formattedWrite(output, " pc=>%u", pc-len); 412 break; 413 case IR.LookaheadEnd, IR.NeglookaheadEnd: //ditto 414 uint len = irb[pc].data; 415 formattedWrite(output, " pc=>%u", pc-len); 416 break; 417 case IR.GroupStart, IR.GroupEnd: 418 uint n = irb[pc].data; 419 string name; 420 foreach (v;dict) 421 if (v.group == n) 422 { 423 name = "'"~v.name~"'"; 424 break; 425 } 426 formattedWrite(output, " %s #%u " ~ (irb[pc].backreference ? "referenced" : ""), 427 name, n); 428 break; 429 case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart: 430 uint len = irb[pc].data; 431 uint start = irb[pc+1].raw, end = irb[pc+2].raw; 432 formattedWrite(output, " pc=>%u [%u..%u]", pc + len + IRL!(IR.LookaheadStart), start, end); 433 break; 434 case IR.Backref: case IR.CodepointSet: case IR.Trie: 435 uint n = irb[pc].data; 436 formattedWrite(output, " %u", n); 437 if (irb[pc].code == IR.Backref) 438 formattedWrite(output, " %s", irb[pc].localRef ? "local" : "global"); 439 break; 440 default://all data-free instructions 441 } 442 if (irb[pc].hotspot) 443 formattedWrite(output, " Hotspot %u", irb[pc+1].raw); 444 return output.data; 445 } 446 447 //disassemble the whole chunk 448 @trusted void printBytecode()(in Bytecode[] slice, in NamedGroup[] dict=[]) 449 { 450 import std.stdio : writeln; 451 for (uint pc=0; pc<slice.length; pc += slice[pc].length) 452 writeln("\t", disassemble(slice, pc, dict)); 453 } 454 455 // Encapsulates memory management, explicit ref counting 456 // and the exact type of engine created 457 // there is a single instance per engine combination type x Char 458 // In future may also maintain a (TLS?) cache of memory 459 interface MatcherFactory(Char) 460 { 461 @safe: 462 Matcher!Char create(const ref Regex!Char, in Char[] input) const; 463 Matcher!Char dup(Matcher!Char m, in Char[] input) const; 464 size_t incRef(Matcher!Char m) const; 465 size_t decRef(Matcher!Char m) const; 466 } 467 468 // Only memory management, no compile-time vs run-time specialities 469 abstract class GenericFactory(alias EngineType, Char) : MatcherFactory!Char 470 { 471 import core.stdc.stdlib : malloc, free; 472 import core.memory : GC; 473 // round up to next multiple of size_t for alignment purposes 474 enum classSize = (__traits(classInstanceSize, EngineType!Char) + size_t.sizeof - 1) & ~(size_t.sizeof - 1); 475 476 EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const; 477 478 override EngineType!Char create(const ref Regex!Char re, in Char[] input) const @trusted 479 { 480 immutable size = EngineType!Char.initialMemory(re) + classSize; 481 auto memory = enforce(malloc(size), "malloc failed")[0 .. size]; 482 scope(failure) free(memory.ptr); 483 GC.addRange(memory.ptr, classSize); 484 auto engine = construct(re, input, memory); 485 assert(engine.refCount == 1); 486 assert(cast(void*) engine == memory.ptr); 487 return engine; 488 } 489 490 override EngineType!Char dup(Matcher!Char engine, in Char[] input) const @trusted 491 { 492 immutable size = EngineType!Char.initialMemory(engine.pattern) + classSize; 493 auto memory = enforce(malloc(size), "malloc failed")[0 .. size]; 494 scope(failure) free(memory.ptr); 495 auto copy = construct(engine.pattern, input, memory); 496 GC.addRange(memory.ptr, classSize); 497 engine.dupTo(copy, memory[classSize .. size]); 498 assert(copy.refCount == 1); 499 return copy; 500 } 501 502 override size_t incRef(Matcher!Char m) const 503 { 504 return ++m.refCount; 505 } 506 507 override size_t decRef(Matcher!Char m) const @trusted 508 { 509 assert(m.refCount != 0); 510 auto cnt = --m.refCount; 511 if (cnt == 0) 512 { 513 void* ptr = cast(void*) m; 514 GC.removeRange(ptr); 515 free(ptr); 516 } 517 return cnt; 518 } 519 } 520 521 // A factory for run-time engines 522 class RuntimeFactory(alias EngineType, Char) : GenericFactory!(EngineType, Char) 523 { 524 override EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const 525 { 526 import std.conv : emplace; 527 return emplace!(EngineType!Char)(memory[0 .. classSize], 528 re, Input!Char(input), memory[classSize .. $]); 529 } 530 } 531 532 // A factory for compile-time engine 533 class CtfeFactory(alias EngineType, Char, alias func) : GenericFactory!(EngineType, Char) 534 { 535 override EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const 536 { 537 import std.conv : emplace; 538 return emplace!(EngineType!Char)(memory[0 .. classSize], 539 re, &func, Input!Char(input), memory[classSize .. $]); 540 } 541 } 542 543 // A workaround for R-T enum re = regex(...) 544 template defaultFactory(Char) 545 { 546 @property MatcherFactory!Char defaultFactory(const ref Regex!Char re) @safe 547 { 548 import std.regex.internal.backtracking : BacktrackingMatcher; 549 import std.regex.internal.thompson : ThompsonMatcher; 550 import std.algorithm.searching : canFind; 551 static MatcherFactory!Char backtrackingFactory; 552 static MatcherFactory!Char thompsonFactory; 553 if (re.backrefed.canFind!"a != 0") 554 { 555 if (backtrackingFactory is null) 556 backtrackingFactory = new RuntimeFactory!(BacktrackingMatcher, Char); 557 return backtrackingFactory; 558 } 559 else 560 { 561 if (thompsonFactory is null) 562 thompsonFactory = new RuntimeFactory!(ThompsonMatcher, Char); 563 return thompsonFactory; 564 } 565 } 566 } 567 568 // Defining it as an interface has the undesired side-effect: 569 // casting any class to an interface silently adjusts pointer to point to a nested vtbl 570 abstract class Matcher(Char) 571 { 572 abstract: 573 // Get a (next) match 574 int match(Group!size_t[] matches); 575 // This only maintains internal ref-count, 576 // deallocation happens inside MatcherFactory 577 @property ref size_t refCount() @safe; 578 // Copy internal state to another engine, using memory arena 'memory' 579 void dupTo(Matcher!Char m, void[] memory); 580 // The pattern loaded 581 @property ref const(Regex!Char) pattern() @safe; 582 // Re-arm the engine with new Input 583 Matcher rearm(in Char[] stream); 584 } 585 586 /++ 587 `Regex` object holds regular expression pattern in compiled form. 588 Instances of this object are constructed via calls to `regex`. 589 This is an intended form for caching and storage of frequently 590 used regular expressions. 591 +/ 592 struct Regex(Char) 593 { 594 //temporary workaround for identifier lookup 595 CodepointSet[] charsets; // 596 Bytecode[] ir; //compiled bytecode of pattern 597 598 599 @safe @property bool empty() const nothrow { return ir is null; } 600 601 @safe @property auto namedCaptures() 602 { 603 static struct NamedGroupRange 604 { 605 private: 606 const(NamedGroup)[] groups; 607 size_t start; 608 size_t end; 609 public: 610 this(const(NamedGroup)[] g, size_t s, size_t e) 611 { 612 assert(s <= e); 613 assert(e <= g.length); 614 groups = g; 615 start = s; 616 end = e; 617 } 618 619 @property string front() { return groups[start].name; } 620 @property string back() { return groups[end-1].name; } 621 @property bool empty() { return start >= end; } 622 @property size_t length() { return end - start; } 623 alias opDollar = length; 624 @property NamedGroupRange save() 625 { 626 return NamedGroupRange(groups, start, end); 627 } 628 void popFront() { assert(!empty); start++; } 629 void popBack() { assert(!empty); end--; } 630 string opIndex()(size_t i) 631 { 632 assert(start + i < end, 633 "Requested named group is out of range."); 634 return groups[start+i].name; 635 } 636 NamedGroupRange opSlice(size_t low, size_t high) { 637 assert(low <= high); 638 assert(start + high <= end); 639 return NamedGroupRange(groups, start + low, start + high); 640 } 641 NamedGroupRange opSlice() { return this.save; } 642 } 643 return NamedGroupRange(dict, 0, dict.length); 644 } 645 646 package(std.regex): 647 import std.regex.internal.kickstart : Kickstart; //TODO: get rid of this dependency 648 const(NamedGroup)[] dict; // maps name -> user group number 649 uint ngroup; // number of internal groups 650 uint maxCounterDepth; // max depth of nested {n,m} repetitions 651 uint hotspotTableSize; // number of entries in merge table 652 uint threadCount; // upper bound on number of Thompson VM threads 653 uint flags; // global regex flags 654 public const(CharMatcher)[] matchers; // tables that represent character sets 655 public const(BitTable)[] filters; // bloom filters for conditional loops 656 uint[] backrefed; // bit array of backreferenced submatches 657 Kickstart!Char kickstart; 658 MatcherFactory!Char factory; // produces optimal matcher for this pattern 659 immutable(Char)[] pattern; // copy of pattern to serve as cache key 660 661 const(Regex) withFactory(MatcherFactory!Char factory) pure const @trusted 662 { 663 auto r = cast() this; 664 r.factory = factory; 665 return r; 666 } 667 668 const(Regex) withFlags(uint newFlags) pure const @trusted 669 { 670 auto r = cast() this; 671 r.flags = newFlags; 672 return r; 673 } 674 675 const(Regex) withCode(const(Bytecode)[] code) pure const @trusted 676 { 677 auto r = cast() this; 678 r.ir = code.dup; // TODO: sidestep const instead? 679 return r; 680 } 681 682 const(Regex) withNGroup(uint nGroup) pure const @trusted 683 { 684 auto r = cast() this; 685 r.ngroup = nGroup; 686 return r; 687 } 688 689 //bit access helper 690 uint isBackref(uint n) 691 { 692 if (n/32 >= backrefed.length) 693 return 0; 694 return backrefed[n / 32] & (1 << (n & 31)); 695 } 696 697 //check if searching is not needed 698 void checkIfOneShot() 699 { 700 L_CheckLoop: 701 for (uint i = 0; i < ir.length; i += ir[i].length) 702 { 703 switch (ir[i].code) 704 { 705 case IR.Bof: 706 flags |= RegexInfo.oneShot; 707 break L_CheckLoop; 708 case IR.GroupStart, IR.GroupEnd, IR.Bol, IR.Eol, IR.Eof, 709 IR.Wordboundary, IR.Notwordboundary: 710 break; 711 default: 712 break L_CheckLoop; 713 } 714 } 715 } 716 717 //print out disassembly a program's IR 718 @trusted debug(std_regex_parser) void print() const 719 {//@@@BUG@@@ write is system 720 for (uint i = 0; i < ir.length; i += ir[i].length) 721 { 722 writefln("%d\t%s ", i, disassemble(ir, i, dict)); 723 } 724 writeln("Total merge table size: ", hotspotTableSize); 725 writeln("Max counter nesting depth: ", maxCounterDepth); 726 } 727 728 public string toString()() const 729 { 730 import std.format : format; 731 static if (is(typeof(pattern) : string)) 732 alias patternString = pattern; 733 else 734 { 735 import std.conv : to; 736 auto patternString = conv.to!string(pattern); 737 } 738 auto quotedEscapedPattern = format("%(%s %)", [patternString]); 739 auto flagString = regexOptionsToString(flags); 740 return "Regex!" ~ Char.stringof ~ "(" ~ quotedEscapedPattern ~ ", \"" ~ flagString ~ "\")"; 741 } 742 } 743 744 // The stuff below this point is temporarrily part of IR module 745 // but may need better place in the future (all internals) 746 package(std.regex): 747 748 //Simple UTF-string abstraction compatible with stream interface 749 struct Input(Char) 750 if (is(Char :dchar)) 751 { 752 import std.utf : decode; 753 alias DataIndex = size_t; 754 enum bool isLoopback = false; 755 alias String = const(Char)[]; 756 String _origin; 757 size_t _index; 758 759 //constructs Input object out of plain string 760 this(String input, size_t idx = 0) 761 { 762 _origin = input; 763 _index = idx; 764 } 765 766 //codepoint at current stream position 767 pragma(inline, true) bool nextChar(ref dchar res, ref size_t pos) 768 { 769 pos = _index; 770 // DMD's inliner hates multiple return functions 771 // but can live with single statement if/else bodies 772 bool n = !(_index == _origin.length); 773 if (n) 774 res = decode(_origin, _index); 775 return n; 776 } 777 @property bool atEnd(){ 778 return _index == _origin.length; 779 } 780 bool search(Kickstart)(ref const Kickstart kick, ref dchar res, ref size_t pos) 781 { 782 size_t idx = kick.search(_origin, _index); 783 _index = idx; 784 return nextChar(res, pos); 785 } 786 787 //index of at End position 788 @property size_t lastIndex(){ return _origin.length; } 789 790 //support for backtracker engine, might not be present 791 void reset(size_t index){ _index = index; } 792 793 String opSlice(size_t start, size_t end){ return _origin[start .. end]; } 794 795 auto loopBack(size_t index){ return BackLooper!Input(this, index); } 796 } 797 798 struct BackLooperImpl(Input) 799 { 800 import std.utf : strideBack; 801 alias DataIndex = size_t; 802 alias String = Input.String; 803 enum bool isLoopback = true; 804 String _origin; 805 size_t _index; 806 this(Input input, size_t index) 807 { 808 _origin = input._origin; 809 _index = index; 810 } 811 this(String input) 812 { 813 _origin = input; 814 _index = input.length; 815 } 816 @trusted bool nextChar(ref dchar res,ref size_t pos) 817 { 818 pos = _index; 819 if (_index == 0) 820 return false; 821 822 res = _origin[0.._index].back; 823 _index -= strideBack(_origin, _index); 824 825 return true; 826 } 827 @property atEnd(){ return _index == 0 || _index == strideBack(_origin, _index); } 828 auto loopBack(size_t index){ return Input(_origin, index); } 829 830 //support for backtracker engine, might not be present 831 //void reset(size_t index){ _index = index ? index-std.utf.strideBack(_origin, index) : 0; } 832 void reset(size_t index){ _index = index; } 833 834 String opSlice(size_t start, size_t end){ return _origin[end .. start]; } 835 //index of at End position 836 @property size_t lastIndex(){ return 0; } 837 } 838 839 template BackLooper(E) 840 { 841 static if (is(E : BackLooperImpl!U, U)) 842 { 843 alias BackLooper = U; 844 } 845 else 846 { 847 alias BackLooper = BackLooperImpl!E; 848 } 849 } 850 851 //both helpers below are internal, on its own are quite "explosive" 852 //unsafe, no initialization of elements 853 @system T[] mallocArray(T)(size_t len) 854 { 855 import core.stdc.stdlib : malloc; 856 return (cast(T*) malloc(len * T.sizeof))[0 .. len]; 857 } 858 859 //very unsafe, no initialization 860 @system T[] arrayInChunk(T)(size_t len, ref void[] chunk) 861 { 862 auto ret = (cast(T*) chunk.ptr)[0 .. len]; 863 chunk = chunk[len * T.sizeof .. $]; 864 return ret; 865 } 866 867 // 868 @trusted uint lookupNamedGroup(String)(const(NamedGroup)[] dict, String name) 869 {//equal is @system? 870 import std.algorithm.comparison : equal; 871 import std.algorithm.iteration : map; 872 import std.conv : text; 873 import std.range : assumeSorted; 874 875 auto fnd = assumeSorted!"cmp(a,b) < 0"(map!"a.name"(dict)).lowerBound(name).length; 876 enforce(fnd < dict.length && equal(dict[fnd].name, name), 877 text("no submatch named ", name)); 878 return dict[fnd].group; 879 } 880 881 // whether ch is one of unicode newline sequences 882 // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985 883 bool endOfLine()(dchar front, bool seenCr) 884 { 885 return ((front == '\n') ^ seenCr) || front == '\r' 886 || front == NEL || front == LS || front == PS; 887 } 888 889 // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985 890 bool startOfLine()(dchar back, bool seenNl) 891 { 892 return ((back == '\r') ^ seenNl) || back == '\n' 893 || back == NEL || back == LS || back == PS; 894 } 895 896 ///Exception object thrown in case of errors during regex compilation. 897 public class RegexException : Exception 898 { 899 mixin basicExceptionCtors; 900 } 901 902 // simple 128-entry bit-table used with a hash function 903 struct BitTable { 904 uint[4] filter; 905 906 this(CodepointSet set){ 907 foreach (iv; set.byInterval) 908 { 909 foreach (v; iv.a .. iv.b) 910 add(v); 911 } 912 } 913 914 void add()(dchar ch){ 915 immutable i = index(ch); 916 filter[i >> 5] |= 1<<(i & 31); 917 } 918 // non-zero -> might be present, 0 -> absent 919 bool opIndex()(dchar ch) const{ 920 immutable i = index(ch); 921 return (filter[i >> 5]>>(i & 31)) & 1; 922 } 923 924 static uint index()(dchar ch){ 925 return ((ch >> 7) ^ ch) & 0x7F; 926 } 927 } 928 929 struct CharMatcher { 930 BitTable ascii; // fast path for ASCII 931 Trie trie; // slow path for Unicode 932 933 this(CodepointSet set) 934 { 935 auto asciiSet = set & unicode.ASCII; 936 ascii = BitTable(asciiSet); 937 trie = makeTrie(set); 938 } 939 940 bool opIndex()(dchar ch) const 941 { 942 if (ch < 0x80) 943 return ascii[ch]; 944 else 945 return trie[ch]; 946 } 947 } 948 949 // Internal non-resizeble array, switches between inline storage and CoW 950 // POD-only 951 struct SmallFixedArray(T, uint SMALL=3) 952 if (!hasElaborateDestructor!T) 953 { 954 import core.stdc.stdlib : malloc, free; 955 static struct Payload 956 { 957 size_t refcount; 958 T[0] placeholder; 959 inout(T)* ptr() inout { return placeholder.ptr; } 960 } 961 static assert(Payload.sizeof == size_t.sizeof); 962 union 963 { 964 Payload* big; 965 T[SMALL] small; 966 } 967 size_t _sizeMask; 968 enum BIG_MASK = size_t(1)<<(8*size_t.sizeof-1); 969 enum SIZE_MASK = ~BIG_MASK; 970 971 @property bool isBig() const { return (_sizeMask & BIG_MASK) != 0; } 972 @property size_t length() const { return _sizeMask & SIZE_MASK; } 973 974 this(size_t size) 975 { 976 if (size <= SMALL) 977 { 978 small[] = T.init; 979 _sizeMask = size; 980 } 981 else 982 { 983 big = cast(Payload*) enforce(malloc(Payload.sizeof + T.sizeof*size), "Failed to malloc storage"); 984 big.refcount = 1; 985 _sizeMask = size | BIG_MASK; 986 } 987 } 988 989 private @trusted @property inout(T)[] internalSlice() inout 990 { 991 return isBig ? big.ptr[0 .. length] : small[0 .. length]; 992 } 993 994 this(this) 995 { 996 if (isBig) 997 { 998 big.refcount++; 999 } 1000 } 1001 1002 bool opEquals(SmallFixedArray a) 1003 { 1004 return internalSlice[] == a.internalSlice[]; 1005 } 1006 1007 size_t toHash() const 1008 { 1009 return hashOf(internalSlice[]); 1010 } 1011 1012 ref inout(T) opIndex(size_t idx) inout 1013 { 1014 return internalSlice[idx]; 1015 } 1016 1017 // accesses big to test self-referencing so not @safe 1018 @trusted ref opAssign(SmallFixedArray arr) 1019 { 1020 if (isBig) 1021 { 1022 if (arr.isBig) 1023 { 1024 if (big is arr.big) return this; // self-assign 1025 else 1026 { 1027 abandonRef(); 1028 _sizeMask = arr._sizeMask; 1029 big = arr.big; 1030 big.refcount++; 1031 } 1032 } 1033 else 1034 { 1035 abandonRef(); 1036 _sizeMask = arr._sizeMask; 1037 small = arr.small; 1038 } 1039 } 1040 else 1041 { 1042 if (arr.isBig) 1043 { 1044 _sizeMask = arr._sizeMask; 1045 big = arr.big; 1046 big.refcount++; 1047 } 1048 else 1049 { 1050 _sizeMask = arr._sizeMask; 1051 small = arr.small; 1052 } 1053 } 1054 return this; 1055 } 1056 1057 void mutate(scope void delegate(T[]) filler) 1058 { 1059 if (isBig && big.refcount != 1) // copy on write 1060 { 1061 auto oldSizeMask = _sizeMask; 1062 auto newbig = cast(Payload*) enforce(malloc(Payload.sizeof + T.sizeof*length), "Failed to malloc storage"); 1063 newbig.refcount = 1; 1064 abandonRef(); 1065 big = newbig; 1066 _sizeMask = oldSizeMask; 1067 } 1068 filler(internalSlice); 1069 } 1070 1071 ~this() 1072 { 1073 if (isBig) 1074 { 1075 abandonRef(); 1076 } 1077 } 1078 1079 @trusted private void abandonRef() 1080 { 1081 assert(isBig); 1082 if (--big.refcount == 0) 1083 { 1084 free(big); 1085 _sizeMask = 0; 1086 assert(!isBig); 1087 } 1088 } 1089 } 1090 1091 @system unittest 1092 { 1093 alias SA = SmallFixedArray!(int, 2); 1094 SA create(int[] data) 1095 { 1096 SA a = SA(data.length); 1097 a.mutate((slice) { slice[] = data[]; }); 1098 assert(a.internalSlice == data); 1099 return a; 1100 } 1101 1102 { 1103 SA a; 1104 a = SA(1); 1105 assert(a.length == 1); 1106 a = SA.init; 1107 assert(a.length == 0); 1108 } 1109 1110 { 1111 SA a, b, c, d; 1112 assert(a.length == 0); 1113 assert(a.internalSlice == b.internalSlice); 1114 a = create([1]); 1115 assert(a.internalSlice == [1]); 1116 b = create([2, 3]); 1117 assert(b.internalSlice == [2, 3]); 1118 c = create([3, 4, 5]); 1119 d = create([5, 6, 7, 8]); 1120 assert(c.isBig); 1121 a = c; 1122 assert(a.isBig); 1123 assert(a.big is c.big); 1124 assert(a.big.refcount == 2); 1125 assert(a.internalSlice == [3, 4, 5]); 1126 assert(c.internalSlice == [3, 4, 5]); 1127 a = b; 1128 assert(!a.isBig); 1129 assert(a.internalSlice == [2, 3]); 1130 assert(c.big.refcount == 1); 1131 a = c; 1132 assert(c.big.refcount == 2); 1133 1134 // mutate copies on write if ref-count is not 1 1135 a.mutate((slice){ slice[] = 1; }); 1136 assert(a.internalSlice == [1, 1, 1]); 1137 assert(c.internalSlice == [3, 4, 5]); 1138 assert(a.isBig && c.isBig); 1139 assert(a.big.refcount == 1); 1140 assert(c.big.refcount == 1); 1141 1142 auto e = d; 1143 assert(e.big.refcount == 2); 1144 auto f = d; 1145 f = a; 1146 assert(f.isBig); 1147 assert(f.internalSlice == [1, 1, 1]); 1148 assert(f.big.refcount == 2); // a & f 1149 assert(e.big.refcount == 2); // d & e 1150 a = c; 1151 assert(f.big.refcount == 1); // f 1152 assert(e.big.refcount == 2); // d & e 1153 a = a; 1154 a = a; 1155 a = a; 1156 assert(a.big.refcount == 2); // a & c 1157 } 1158 }