1 /++ 2 $(LINK2 https://en.wikipedia.org/wiki/Regular_expression, Regular expressions) 3 are a commonly used method of pattern matching 4 on strings, with $(I regex) being a catchy word for a pattern in this domain 5 specific language. Typical problems usually solved by regular expressions 6 include validation of user input and the ubiquitous find $(AMP) replace 7 in text processing utilities. 8 9 $(SCRIPT inhibitQuickIndex = 1;) 10 $(DIVC quickindex, 11 $(BOOKTABLE, 12 $(TR $(TH Category) $(TH Functions)) 13 $(TR $(TD Matching) $(TD 14 $(LREF bmatch) 15 $(LREF match) 16 $(LREF matchAll) 17 $(LREF matchFirst) 18 )) 19 $(TR $(TD Building) $(TD 20 $(LREF ctRegex) 21 $(LREF escaper) 22 $(LREF regex) 23 )) 24 $(TR $(TD Replace) $(TD 25 $(LREF replace) 26 $(LREF replaceAll) 27 $(LREF replaceAllInto) 28 $(LREF replaceFirst) 29 $(LREF replaceFirstInto) 30 )) 31 $(TR $(TD Split) $(TD 32 $(LREF split) 33 $(LREF splitter) 34 )) 35 $(TR $(TD Objects) $(TD 36 $(LREF Captures) 37 $(LREF Regex) 38 $(LREF RegexException) 39 $(LREF RegexMatch) 40 $(LREF Splitter) 41 $(LREF StaticRegex) 42 )) 43 )) 44 45 $(SECTION Synopsis) 46 --- 47 import std.regex; 48 import std.stdio; 49 void main() 50 { 51 // Print out all possible dd/mm/yy(yy) dates found in user input. 52 auto r = regex(r"\b[0-9][0-9]?/[0-9][0-9]?/[0-9][0-9](?:[0-9][0-9])?\b"); 53 foreach (line; stdin.byLine) 54 { 55 // matchAll() returns a range that can be iterated 56 // to get all subsequent matches. 57 foreach (c; matchAll(line, r)) 58 writeln(c.hit); 59 } 60 } 61 ... 62 63 // Create a static regex at compile-time, which contains fast native code. 64 auto ctr = ctRegex!(`^.*/([^/]+)/?$`); 65 66 // It works just like a normal regex: 67 auto c2 = matchFirst("foo/bar", ctr); // First match found here, if any 68 assert(!c2.empty); // Be sure to check if there is a match before examining contents! 69 assert(c2[1] == "bar"); // Captures is a range of submatches: 0 = full match. 70 71 ... 
72 // multi-pattern regex 73 auto multi = regex([`(\d+),\d+`,`([a-z]+):(\d+)`]); 74 auto m = "abc:43 12,34".matchAll(multi); 75 assert(m.front.whichPattern == 2); 76 assert(m.front[1] == "abc"); 77 assert(m.front[2] == "43"); 78 m.popFront(); 79 assert(m.front.whichPattern == 1); 80 assert(m.front[1] == "12"); 81 ... 82 83 // The result of the `matchAll/matchFirst` is directly testable with if/assert/while. 84 // e.g. test if a string consists of letters: 85 assert(matchFirst("Letter", `^\p{L}+$`)); 86 --- 87 88 $(SECTION Syntax and general information) 89 The general usage guideline is to keep regex complexity on the side of simplicity, 90 as its capabilities reside in purely character-level manipulation. 91 As such it's ill-suited for tasks involving higher level invariants 92 like matching an integer number $(U bounded) in an [a,b] interval. 93 Checks of this sort are better addressed by additional post-processing. 94 95 The basic syntax shouldn't surprise experienced users of regular expressions. 96 For an introduction to `std.regex` see a 97 $(HTTP dlang.org/regular-expression.html, short tour) of the module API 98 and its abilities. 99 100 There are other web resources on regular expressions to help newcomers, 101 and a good $(HTTP www.regular-expressions.info, reference with tutorial) 102 can easily be found. 103 104 This library uses a remarkably common ECMAScript syntax flavor 105 with the following extensions: 106 $(UL 107 $(LI Named subexpressions, with Python syntax. ) 108 $(LI Unicode properties such as Scripts, Blocks and common binary properties e.g. Alphabetic, White_Space, Hex_Digit etc.) 109 $(LI Arbitrary length and complexity lookbehind, including lookahead in lookbehind and vice versa.) 110 ) 111 112 $(REG_START Pattern syntax ) 113 $(I std.regex operates on codepoint level, 114 'character' in this table denotes a single Unicode codepoint.) 
115 $(REG_TABLE 116 $(REG_TITLE Pattern element, Semantics ) 117 $(REG_TITLE Atoms, Match single characters ) 118 $(REG_ROW any character except [{|*+?()^$, Matches the character itself. ) 119 $(REG_ROW ., In single line mode matches any character. 120 Otherwise it matches any character except '\n' and '\r'. ) 121 $(REG_ROW [class], Matches a single character 122 that belongs to this character class. ) 123 $(REG_ROW [^class], Matches a single character that 124 does $(U not) belong to this character class.) 125 $(REG_ROW \cC, Matches the control character corresponding to letter C) 126 $(REG_ROW \xXX, Matches a character with hexadecimal value of XX. ) 127 $(REG_ROW \uXXXX, Matches a character with hexadecimal value of XXXX. ) 128 $(REG_ROW \U00YYYYYY, Matches a character with hexadecimal value of YYYYYY. ) 129 $(REG_ROW \f, Matches a formfeed character. ) 130 $(REG_ROW \n, Matches a linefeed character. ) 131 $(REG_ROW \r, Matches a carriage return character. ) 132 $(REG_ROW \t, Matches a tab character. ) 133 $(REG_ROW \v, Matches a vertical tab character. ) 134 $(REG_ROW \d, Matches any Unicode digit. ) 135 $(REG_ROW \D, Matches any character except Unicode digits. ) 136 $(REG_ROW \w, Matches any word character (note: this includes numbers).) 137 $(REG_ROW \W, Matches any non-word character.) 138 $(REG_ROW \s, Matches whitespace, same as \p{White_Space}.) 139 $(REG_ROW \S, Matches any character except those recognized as $(I \s ). ) 140 $(REG_ROW \\\\, Matches \ character. ) 141 $(REG_ROW \c where c is one of [|*+?(), Matches the character c itself. ) 142 $(REG_ROW \p{PropertyName}, Matches a character that belongs 143 to the Unicode PropertyName set. 144 Single letter abbreviations can be used without surrounding {,}. ) 145 $(REG_ROW \P{PropertyName}, Matches a character that does not belong 146 to the Unicode PropertyName set. 147 Single letter abbreviations can be used without surrounding {,}. 
) 148 $(REG_ROW \p{InBasicLatin}, Matches any character that is part of 149 the BasicLatin Unicode $(U block).) 150 $(REG_ROW \P{InBasicLatin}, Matches any character except ones in 151 the BasicLatin Unicode $(U block).) 152 $(REG_ROW \p{Cyrillic}, Matches any character that is part of 153 Cyrillic $(U script).) 154 $(REG_ROW \P{Cyrillic}, Matches any character except ones in 155 Cyrillic $(U script).) 156 $(REG_TITLE Quantifiers, Specify repetition of other elements) 157 $(REG_ROW *, Matches previous character/subexpression 0 or more times. 158 Greedy version - tries as many times as possible.) 159 $(REG_ROW *?, Matches previous character/subexpression 0 or more times. 160 Lazy version - stops as early as possible.) 161 $(REG_ROW +, Matches previous character/subexpression 1 or more times. 162 Greedy version - tries as many times as possible.) 163 $(REG_ROW +?, Matches previous character/subexpression 1 or more times. 164 Lazy version - stops as early as possible.) 165 $(REG_ROW {n}, Matches previous character/subexpression exactly n times. ) 166 $(REG_ROW {n$(COMMA)}, Matches previous character/subexpression n times or more. 167 Greedy version - tries as many times as possible. ) 168 $(REG_ROW {n$(COMMA)}?, Matches previous character/subexpression n times or more. 169 Lazy version - stops as early as possible.) 170 $(REG_ROW {n$(COMMA)m}, Matches previous character/subexpression n to m times. 171 Greedy version - tries as many times as possible, but no more than m times. ) 172 $(REG_ROW {n$(COMMA)m}?, Matches previous character/subexpression n to m times. 173 Lazy version - stops as early as possible, but no less than n times.) 174 $(REG_TITLE Other, Subexpressions $(AMP) alternations ) 175 $(REG_ROW (regex), Matches subexpression regex, 176 saving matched portion of text for later retrieval. ) 177 $(REG_ROW (?#comment), An inline comment that is ignored while matching.) 
178 $(REG_ROW (?:regex), Matches subexpression regex, 179 $(U not) saving matched portion of text. Useful to speed up matching. ) 180 $(REG_ROW A|B, Matches subexpression A, or failing that, matches B. ) 181 $(REG_ROW (?P$(LT)name$(GT)regex), Matches named subexpression 182 regex labeling it with name 'name'. 183 When referring to a matched portion of text, 184 names work like aliases in addition to direct numbers. 185 ) 186 $(REG_TITLE Assertions, Match position rather than character ) 187 $(REG_ROW ^, Matches at the beginning of input or line (in multiline mode).) 188 $(REG_ROW $, Matches at the end of input or line (in multiline mode). ) 189 $(REG_ROW \b, Matches at word boundary. ) 190 $(REG_ROW \B, Matches when $(U not) at word boundary. ) 191 $(REG_ROW (?=regex), Zero-width lookahead assertion. 192 Matches at a point where the subexpression 193 regex could be matched starting from the current position. 194 ) 195 $(REG_ROW (?!regex), Zero-width negative lookahead assertion. 196 Matches at a point where the subexpression 197 regex could $(U not) be matched starting from the current position. 198 ) 199 $(REG_ROW (?<=regex), Zero-width lookbehind assertion. Matches at a point 200 where the subexpression regex could be matched ending 201 at the current position (matching goes backwards). 202 ) 203 $(REG_ROW (?<!regex), Zero-width negative lookbehind assertion. 204 Matches at a point where the subexpression regex could $(U not) 205 be matched ending at the current position (matching goes backwards). 206 ) 207 ) 208 209 $(REG_START Character classes ) 210 $(REG_TABLE 211 $(REG_TITLE Pattern element, Semantics ) 212 $(REG_ROW Any atom, Has the same meaning as outside of a character class.) 213 $(REG_ROW a-z, Includes characters a, b, c, ..., z. ) 214 $(REG_ROW [a||b]$(COMMA) [a--b]$(COMMA) [a~~b]$(COMMA) [a$(AMP)$(AMP)b], 215 Where a, b are arbitrary classes, means union, set difference, 216 symmetric set difference, and intersection respectively. 
217 $(I Any sequence of character class elements implicitly forms a union.) ) 218 ) 219 220 $(REG_START Regex flags ) 221 $(REG_TABLE 222 $(REG_TITLE Flag, Semantics ) 223 $(REG_ROW g, Global regex, repeat over the whole input. ) 224 $(REG_ROW i, Case insensitive matching. ) 225 $(REG_ROW m, Multi-line mode, match ^, $ on start and end line separators 226 as well as start and end of input.) 227 $(REG_ROW s, Single-line mode, makes . match '\n' and '\r' as well. ) 228 $(REG_ROW x, Free-form syntax, ignores whitespace in pattern, 229 useful for formatting complex regular expressions. ) 230 ) 231 232 $(SECTION Unicode support) 233 234 This library provides full Level 1 support* according to 235 $(HTTP unicode.org/reports/tr18/, UTS 18). Specifically: 236 $(UL 237 $(LI 1.1 Hex notation via any of \uxxxx, \U00YYYYYY, \xZZ.) 238 $(LI 1.2 Unicode properties.) 239 $(LI 1.3 Character classes with set operations.) 240 $(LI 1.4 Word boundaries use the full set of "word" characters.) 241 $(LI 1.5 Using simple casefolding to match case 242 insensitively across the full range of codepoints.) 243 $(LI 1.6 Respecting line breaks as any of 244 \u000A | \u000B | \u000C | \u000D | \u0085 | \u2028 | \u2029 | \u000D\u000A.) 245 $(LI 1.7 Operating on codepoint level.) 246 ) 247 *With exception of point 1.1.1, as of yet, normalization of input 248 is expected to be enforced by user. 249 250 $(SECTION Replace format string) 251 252 A set of functions in this module that do the substitution rely 253 on a simple format to guide the process. In particular the table below 254 applies to the `format` argument of 255 $(LREF replaceFirst) and $(LREF replaceAll). 256 257 The format string can reference parts of match using the following notation. 258 $(REG_TABLE 259 $(REG_TITLE Format specifier, Replaced by ) 260 $(REG_ROW $(DOLLAR)$(AMP), the whole match. ) 261 $(REG_ROW $(DOLLAR)$(BACKTICK), part of input $(I preceding) the match. ) 262 $(REG_ROW $', part of input $(I following) the match. 
) 263 $(REG_ROW $$, '$' character. ) 264 $(REG_ROW \c $(COMMA) where c is any character, the character c itself. ) 265 $(REG_ROW \\\\, '\\' character. ) 266 $(REG_ROW $(DOLLAR)1 .. $(DOLLAR)99, submatch number 1 to 99 respectively. ) 267 ) 268 269 $(SECTION Slicing and zero memory allocations orientation) 270 271 All matches returned by pattern matching functionality in this library 272 are slices of the original input. The notable exception is the `replace` 273 family of functions that generate a new string from the input. 274 275 In cases where producing the replacement is the ultimate goal 276 $(LREF replaceFirstInto) and $(LREF replaceAllInto) could come in handy 277 as functions that avoid allocations even for replacement. 278 279 Copyright: Copyright Dmitry Olshansky, 2011- 280 281 License: $(HTTP boost.org/LICENSE_1_0.txt, Boost License 1.0). 282 283 Authors: Dmitry Olshansky, 284 285 API and utility constructs are modeled after the original `std.regex` 286 by Walter Bright and Andrei Alexandrescu. 287 288 Source: $(PHOBOSSRC std/regex/package.d) 289 290 Macros: 291 REG_ROW = $(TR $(TD $(I $1 )) $(TD $+) ) 292 REG_TITLE = $(TR $(TD $(B $1)) $(TD $(B $2)) ) 293 REG_TABLE = <table border="1" cellspacing="0" cellpadding="5" > $0 </table> 294 REG_START = <h3><div align="center"> $0 </div></h3> 295 SECTION = <h3><a id="$1" href="#$1" class="anchor">$0</a></h3> 296 S_LINK = <a href="#$1">$+</a> 297 +/ 298 module std.regex; 299 300 import std.range.primitives, std.traits; 301 import std.regex.internal.ir; 302 import std.typecons : Flag, Yes, No; 303 304 /++ 305 `Regex` object holds regular expression pattern in compiled form. 306 307 Instances of this object are constructed via calls to `regex`. 308 This is an intended form for caching and storage of frequently 309 used regular expressions. 310 311 Example: 312 313 Test if this object doesn't contain any compiled pattern. 
---
Regex!char r;
assert(r.empty);
r = regex(""); // Note: "" is a valid regex pattern.
assert(!r.empty);
---

Getting a range of all the named captures in the regex.
----
import std.range;
import std.algorithm;

auto re = regex(`(?P<name>\w+) = (?P<var>\d+)`);
auto nc = re.namedCaptures;
static assert(isRandomAccessRange!(typeof(nc)));
assert(!nc.empty);
assert(nc.length == 2);
assert(nc.equal(["name", "var"]));
assert(nc[0] == "name");
assert(nc[1..$].equal(["var"]));
----
+/
public alias Regex(Char) = std.regex.internal.ir.Regex!(Char);

/++
    A `StaticRegex` is a `Regex` object that contains D code specially
    generated at compile-time to speed up matching.

    No longer used, kept as an alias to `Regex` for backwards compatibility.
+/
public alias StaticRegex = Regex;

/++
    Compile a regular expression pattern for later execution.
    Returns: `Regex` object that works on inputs having
    the same character width as `pattern`.

    Params:
    pattern = A single regular expression to match.
    patterns = An array of regular expression strings.
      The resulting `Regex` object will match any expression;
      use $(LREF whichPattern) to know which.
    flags = The _attributes (g, i, m, s and x accepted)

    Throws: `RegexException` if there were any errors during compilation.
+/
@trusted public auto regex(S : C[], C)(const S[] patterns, const(char)[] flags="")
if (isSomeString!(S))
{
    import std.array : appender;
    import std.functional : memoize;
    enum cacheSize = 8; //TODO: invent nice interface to control regex caching
    const(C)[] pat;
    if (patterns.length > 1)
    {
        // Fold every alternative into a single pattern of the shape
        //   (?:p0\T0)\T0|(?:p1\T1)\T1|...
        // where Ti is the code point privateUseStart + i.
        auto app = appender!S();
        foreach (i, p; patterns)
        {
            if (i != 0)
                app.put("|");
            app.put("(?:");
            app.put(p);
            // terminator for the pattern
            // to detect if the pattern unexpectedly ends
            app.put("\\");
            app.put(cast(dchar)(privateUseStart+i));
            app.put(")");
            // another one to return correct whichPattern
            // for all of potential alternatives in the patterns[i]
            app.put("\\");
            app.put(cast(dchar)(privateUseStart+i));
        }
        pat = app.data;
    }
    else
        pat = patterns[0]; // NOTE: an empty `patterns` array is not supported

    // memoize cannot be used during CTFE, so compile directly there;
    // at run time the compiled pattern is memoized on (pat, flags),
    // with cacheSize passed as memoize's size limit.
    if (__ctfe)
        return regexImpl(pat, flags);
    return memoize!(regexImpl!S, cacheSize)(pat, flags);
}

///ditto
@trusted public auto regex(S)(S pattern, const(char)[] flags="")
if (isSomeString!(S))
{
    // Single-pattern form: forwards to the array overload.
    return regex([pattern], flags);
}

///
@system unittest
{
    void test(S)()
    {
        // multi-pattern regex example
        S[] arr = [`([a-z]+):(\d+)`, `(\d+),\d+`];
        auto multi = regex(arr); // multi regex
        S str = "abc:43 12,34";
        auto m = str.matchAll(multi);
        assert(m.front.whichPattern == 1);
        assert(m.front[1] == "abc");
        assert(m.front[2] == "43");
        m.popFront();
        assert(m.front.whichPattern == 2);
        assert(m.front[1] == "12");
    }

    import std.meta : AliasSeq;
    static foreach (C; AliasSeq!(string, wstring, dstring))
        // Test with const array of patterns - see https://issues.dlang.org/show_bug.cgi?id=20301
        static foreach (S; AliasSeq!(C, const C, immutable C))
            test!S();
}

@system unittest
{
    import std.conv : to;
    import std.string : indexOf;

    immutable pattern = "s+";
    auto regexString = to!string(regex(pattern, "U"));
    assert(regexString.length <= pattern.length + 100, "String representation shouldn't be unreasonably bloated.");
    assert(indexOf(regexString, "s+") >= 0, "String representation should include pattern.");
    assert(indexOf(regexString, 'U') >= 0, "String representation should include flags.");
}

// Uncached compilation step used by `regex`: runs the parser with the
// CodeGen back end over `pattern` and returns the resulting program.
public auto regexImpl(S)(const S pattern, const(char)[] flags="")
if (isSomeString!(typeof(pattern)))
{
    import std.regex.internal.parser : Parser, CodeGen;
    auto parser = Parser!(Unqual!(typeof(pattern)), CodeGen)(pattern, flags);
    auto r = parser.program;
    return r;
}


// Value produced by `ctRegex`: holds a pointer to an immutable Regex and
// exposes it as a mutable Regex reference through `alias this`.
private struct CTRegexWrapper(Char)
{
    private immutable(Regex!Char)* re;

    // allow code that expects mutable Regex to still work
    // we stay "logically const"
    @property @trusted ref getRe() const { return *cast(Regex!Char*) re; }
    alias getRe this;
}

// Implementation of `ctRegex`: compiles the pattern at compile time,
// generates matcher source with ctGenRegExCode, mixes it into `func`, and
// publishes a Regex whose factory is a CtfeFactory built around `func`.
template ctRegexImpl(alias pattern, string flags=[])
{
    import std.regex.internal.backtracking, std.regex.internal.parser;
    static immutable r = cast(immutable) regex(pattern, flags);
    alias Char = BasicElementOf!(typeof(pattern));
    enum source = ctGenRegExCode(r);
    @trusted bool func(BacktrackingMatcher!Char matcher)
    {
        // with -debug=std_regex_ctr the generated source is printed at compile time
        debug(std_regex_ctr) pragma(msg, source);
        cast(void) matcher;
        mixin(source);
    }
    static immutable staticRe =
        cast(immutable) r.withFactory(new CtfeFactory!(BacktrackingMatcher, Char, func));
    enum wrapper = CTRegexWrapper!Char(&staticRe);
}

@safe unittest
{
    // test compat for logical const workaround
    static void test(StaticRegex!char)
    {
    }
    enum re = ctRegex!``;
    test(re);
}

@safe unittest
{
    auto re = ctRegex!`foo`;
    assert(matchFirst("foo", re));

    // test reassignment
    re = ctRegex!`bar`;
    assert(matchFirst("bar", re));
    assert(!matchFirst("bar", ctRegex!`foo`));
}

/++
Compile regular expression using CTFE
and generate optimized native machine code for matching it.

Returns: StaticRegex object for faster matching.

Params:
pattern = Regular expression
flags = The _attributes (g, i, m, s and x accepted)
+/
public enum ctRegex(alias pattern, alias flags=[]) = ctRegexImpl!(pattern, flags).wrapper;

// True when RegEx can serve as the pattern argument for input type R:
// it is Regex or StaticRegex over R's element type (ignoring qualifiers),
// or it implicitly converts to a const Regex over that element type.
enum isRegexFor(RegEx, R) = is(immutable RegEx == immutable Regex!(BasicElementOf!R))
     || is(RegEx : const(Regex!(BasicElementOf!R)))
     || is(immutable RegEx == immutable StaticRegex!(BasicElementOf!R));


/++
    `Captures` object contains submatches captured during a call
    to `match` or iteration over `RegexMatch` range.

    First element of range is the whole match.
+/
@trusted public struct Captures(R)
if (isSomeString!R)
{//@trusted because of union inside
    alias DataIndex = size_t;
    alias String = R;
    alias Store = SmallFixedArray!(Group!DataIndex, 3);
private:
    import std.conv : text;
    Store matches;              // one Group per submatch; matches[0] is the whole match
    const(NamedGroup)[] _names; // table used to resolve named groups
    R _input;                   // the input that was searched; results are slices of it
    int _nMatch;                // 0 when nothing matched, otherwise the value reported by whichPattern
    uint _f, _b;                // window [_f, _b) of submatches still exposed by the range interface

    // Builds an empty capture set with room for n groups over `input`.
    this(R input, uint n, const(NamedGroup)[] named)
    {
        _input = input;
        _names = named;
        matches = Store(n);
        _b = n;
        _f = 0;
    }

    // Same, taking input, names and group count from a RegexMatch and its engine's pattern.
    this(ref RegexMatch!R rmatch)
    {
        _input = rmatch._input;
        _names = rmatch._engine.pattern.dict;
        immutable n = rmatch._engine.pattern.ngroup;
        matches = Store(n);
        _b = n;
        _f = 0;
    }

    // Slice of the input covered by group `index`, or null when that group
    // tests false (the unittest below shows unmatched captures are null).
    inout(R) getMatch(size_t index) inout
    {
        auto m = &matches[index];
        return *m ? _input[m.begin .. m.end] : null;
    }

public:
    ///Slice of input prior to the match.
    @property R pre()
    {
        return _nMatch == 0 ? _input[] : _input[0 .. matches[0].begin];
    }

    ///Slice of input immediately after the match.
    @property R post()
    {
        return _nMatch == 0 ? _input[] : _input[matches[0].end .. $];
    }

    ///Slice of matched portion of input.
    @property R hit()
    {
        assert(_nMatch, "attempted to get hit of an empty match");
        return _input[matches[0].begin .. matches[0].end];
    }

    ///Range interface.
    @property R front()
    {
        assert(_nMatch, "attempted to get front of an empty match");
        return getMatch(_f);
    }

    ///ditto
    @property R back()
    {
        assert(_nMatch, "attempted to get back of an empty match");
        return getMatch(_b - 1);
    }

    ///ditto
    void popFront()
    {
        assert(!empty);
        ++_f;
    }

    ///ditto
    void popBack()
    {
        assert(!empty);
        --_b;
    }

    ///ditto
    @property bool empty() const { return _nMatch == 0 || _f >= _b; }

    ///ditto
    inout(R) opIndex()(size_t i) inout
    {
        // the index is relative to the current front of the range
        assert(_f + i < _b,text("requested submatch number ", i," is out of range"));
        return getMatch(_f + i);
    }

    /++
        Explicit cast to bool.
        Useful as a shorthand for !(x.empty) in if and assert statements.

        ---
        import std.regex;

        assert(!matchFirst("nothing", "something"));
        ---
    +/

    @safe bool opCast(T:bool)() const nothrow { return _nMatch != 0; }

    /++
        Number of the pattern that matched, counting from 1 (1 is the first pattern).
        Returns 0 on no match.
    +/

    @safe @property int whichPattern() const nothrow { return _nMatch; }

    ///
    @system unittest
    {
        import std.regex;
        assert(matchFirst("abc", "[0-9]+", "[a-z]+").whichPattern == 2);
    }

    /++
        Lookup named submatch.

        ---
        import std.regex;
        import std.range;

        auto c = matchFirst("a = 42;", regex(`(?P<var>\w+)\s*=\s*(?P<value>\d+);`));
        assert(c["var"] == "a");
        assert(c["value"] == "42");
        popFrontN(c, 2);
        //named groups are unaffected by range primitives
        assert(c["var"] =="a");
        assert(c.front == "42");
        ---
    +/
    R opIndex(String)(String i) /*const*/ //@@@BUG@@@
    if (isSomeString!String)
    {
        // the resolved index is absolute, i.e. not offset by _f
        size_t index = lookupNamedGroup(_names, i);
        return getMatch(index);
    }

    ///Number of matches in this object.
    @property size_t length() const { return _nMatch == 0 ? 0 : _b - _f; }

    ///A hook for compatibility with original std.regex.
    @property ref captures(){ return this; }
}

///
@system unittest
{
    import std.range.primitives : popFrontN;

    auto c = matchFirst("@abc#", regex(`(\w)(\w)(\w)`));
    assert(c.pre == "@"); // Part of input preceding match
    assert(c.post == "#"); // Immediately after match
    assert(c.hit == c[0] && c.hit == "abc"); // The whole match
    assert(c[2] == "b");
    assert(c.front == "abc");
    c.popFront();
    assert(c.front == "a");
    assert(c.back == "c");
    c.popBack();
    assert(c.back == "b");
    popFrontN(c, 2);
    assert(c.empty);

    assert(!matchFirst("nothing", "something"));

    // Captures that are not matched will be null.
    c = matchFirst("ac", regex(`a(b)?c`));
    assert(c);
    assert(!c[1]);
}

@system unittest
{
    Captures!string c;
    string s = "abc";
    assert(cast(bool)(c = matchFirst(s, regex("d")))
        || cast(bool)(c = matchFirst(s, regex("a"))));
}

// https://issues.dlang.org/show_bug.cgi?id=19979
@system unittest
{
    auto c = matchFirst("bad", regex(`(^)(not )?bad($)`));
    assert(c[0] && c[0].length == "bad".length);
    assert(c[1] && !c[1].length);
    assert(!c[2]);
    assert(c[3] && !c[3].length);
}

/++
    A regex engine state, as returned by `match` family of functions.

    Effectively it's a forward range of Captures!R, produced
    by lazily searching for matches in a given input.
+/
@trusted public struct RegexMatch(R)
if (isSomeString!R)
{
    import std.typecons : Rebindable;
private:
    alias Char = BasicElementOf!R;
    Matcher!Char _engine;                            // reference-counted through _factory
    Rebindable!(const MatcherFactory!Char) _factory; // creates, duplicates and releases engines
    R _input;
    Captures!R _captures;                            // the current match, i.e. `front`

    // Creates an engine for `prog` over `input` and runs the first search.
    this(RegEx)(R input, RegEx prog)
    {
        import std.exception : enforce;
        _input = input;
        // use the factory attached to the pattern, or the default one if none is attached
        if (prog.factory is null) _factory = defaultFactory!Char(prog);
        else _factory = prog.factory;
        _engine = _factory.create(prog, input);
        assert(_engine.refCount == 1);
        _captures = Captures!R(this);
        // the engine fills the group storage and returns the number stored in _nMatch
        _captures.matches.mutate((slice) { _captures._nMatch = _engine.match(slice); });
    }

public:
    // Copies share the engine: the postblit takes a reference on it...
    this(this)
    {
        if (_engine) _factory.incRef(_engine);
    }

    // ...and the destructor gives it back.
    ~this()
    {
        if (_engine) _factory.decRef(_engine);
    }

    ///Shorthands for front.pre, front.post, front.hit.
    @property R pre()
    {
        return _captures.pre;
    }

    ///ditto
    @property R post()
    {
        return _captures.post;
    }

    ///ditto
    @property R hit()
    {
        return _captures.hit;
    }

    /++
        Functionality for processing subsequent matches of global regexes via range interface:
        ---
        import std.regex;
        auto m = matchAll("Hello, world!", regex(`\w+`));
        assert(m.front.hit == "Hello");
        m.popFront();
        assert(m.front.hit == "world");
        m.popFront();
        assert(m.empty);
        ---
    +/
    @property inout(Captures!R) front() inout
    {
        return _captures;
    }

    ///ditto
    void popFront()
    {
        import std.exception : enforce;
        // CoW - if refCount is not 1, we are aliased by somebody else
        if (_engine.refCount != 1)
        {
            // we create a new engine & abandon this reference
            auto old = _engine;
            _engine = _factory.dup(old, _input);
            _factory.decRef(old);
        }
        // search for the next match, overwriting the current captures
        _captures.matches.mutate((slice) { _captures._nMatch = _engine.match(slice); });
    }

    ///ditto
    auto save(){ return this; }

    ///Test if this match object is empty.
    @property bool empty() const { return _captures._nMatch == 0; }

    ///Same as !(x.empty), provided for its convenience in conditional statements.
    T opCast(T:bool)(){ return !empty; }

    /// Same as .front, provided for compatibility with original std.regex.
    @property inout(Captures!R) captures() inout { return _captures; }
}

// Performs a single search of `input` with `prog` and returns the captures.
// A function-static matcher is kept between calls, keyed on (pattern, flags):
// when the key is unchanged the cached engine is re-armed with the new input
// instead of building a new one.
private @trusted auto matchOnce(RegEx, R)(R input, const auto ref RegEx prog)
{
    alias Char = BasicElementOf!R;
    static struct Key
    {
        immutable(Char)[] pattern;
        uint flags;
    }
    // initial key; presumably a sentinel that no real (pattern, flags) pair equals
    static Key cacheKey = Key("", -1);
    static Matcher!Char cache;
    auto factory = prog.factory is null ? defaultFactory!Char(prog) : prog.factory;
    auto key = Key(prog.pattern, prog.flags);
    Matcher!Char engine;
    if (cacheKey == key)
    {
        engine = cache;
        engine.rearm(input);
    }
    else
    {
        engine = factory.create(prog, input);
        if (cache) factory.decRef(cache); // destroy cached engine *after* building a new one
        cache = engine;
        cacheKey = key;
    }
    auto captures = Captures!R(input, prog.ngroup, prog.dict);
    captures.matches.mutate((slice){ captures._nMatch = engine.match(slice); });
    return captures;
}

// Builds a RegexMatch over `input` with the global option forced on.
private auto matchMany(RegEx, R)(R input, auto ref RegEx re) @safe
{
    return RegexMatch!R(input, re.withFlags(re.flags | RegexOption.global));
}

@system unittest
{
    //sanity checks for new API
    auto re = regex("abc");
    assert(!"abc".matchOnce(re).empty);
    assert("abc".matchOnce(re)[0] == "abc");
}

// https://issues.dlang.org/show_bug.cgi?id=18135
@system unittest
{
    static struct MapResult { RegexMatch!string m; }
    MapResult m;
    m = MapResult();
    assert(m == m);
}

// True when `fun` can be called with just a Captures!R (the "mutator" form
// whose result is written to the sink), as opposed to the form that takes
// the sink as a second argument.
private enum isReplaceFunctor(alias fun, R) =
    __traits(compiles, (Captures!R c) { fun(c); });

// the lowest level - just stuff replacements into the sink
private @trusted void replaceCapturesInto(alias output, Sink, R, T)
        (ref Sink sink, R input, T captures)
if (isOutputRange!(Sink, dchar) && isSomeString!R)
{
    // no match: the input is copied through unchanged
    if (captures.empty)
    {
        sink.put(input);
        return;
    }
    sink.put(captures.pre);
    // a hack to get around bogus errors, should be simply output(captures, sink)
    // "is a nested function and cannot be accessed from"
    static if (isReplaceFunctor!(output, R))
        sink.put(output(captures)); //"mutator" type of function
    else
        output(captures, sink); //"output" type of function
    sink.put(captures.post);
}

// ditto for a range of captures
private void replaceMatchesInto(alias output, Sink, R, T)
        (ref Sink sink, R input, T matches)
if (isOutputRange!(Sink, dchar) && isSomeString!R)
{
    // offset is where the previous match ended in the input; cap.pre is
    // everything before the current match, so cap.pre[offset .. $] is
    // the text between the two matches
    size_t offset = 0;
    foreach (cap; matches)
    {
        sink.put(cap.pre[offset .. $]);
        // same hack, see replaceCapturesInto
        static if (isReplaceFunctor!(output, R))
            sink.put(output(cap)); //"mutator" type of function
        else
            output(cap, sink); //"output" type of function
        offset = cap.pre.length + cap.hit.length;
    }
    // whatever follows the last match
    sink.put(input[offset .. $]);
}

// a general skeleton of replaceFirst
private R replaceFirstWith(alias output, R, RegEx)(R input, RegEx re)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    import std.array : appender;
    auto data = matchFirst(input, re);
    // no match: return the input itself without building a new string
    if (data.empty)
        return input;
    auto app = appender!(R)();
    replaceCapturesInto!output(app, input, data);
    return app.data;
}

// ditto for replaceAll
// the method parameter allows old API to ride on the back of the new one
private R replaceAllWith(alias output,
        alias method=matchAll, R, RegEx)(R input, RegEx re)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    import std.array : appender;
    auto matches = method(input, re); //inout(C)[] fails
    // no match: return the input itself without building a new string
    if (matches.empty)
        return input;
    auto app = appender!(R)();
    replaceMatchesInto!output(app, input, matches);
    return app.data;
}


/++
    Start matching `input` to regex pattern `re`,
    using Thompson NFA matching scheme.

    The use of this function is $(RED discouraged) - use either of
    $(LREF matchAll) or $(LREF matchFirst).

    Delegating the kind of operation
    to "g" flag is soon to be phased out along with the
    ability to choose the exact matching scheme. The choice of
    matching scheme to use depends highly on the pattern kind and
    can be done automatically on case by case basis.

    Returns: a `RegexMatch` object holding engine state after first match.
+/

public auto match(R, RegEx)(R input, RegEx re)
if (isSomeString!R && isRegexFor!(RegEx,R))
{
    // the range type is parameterised on the unqualified input type
    alias Matches = RegexMatch!(Unqual!(typeof(input)));
    return Matches(input, re);
}

///ditto
public auto match(R, String)(R input, String re)
if (isSomeString!R && isSomeString!String)
{
    // the string pattern is compiled first, then handled like a Regex
    alias Matches = RegexMatch!(Unqual!(typeof(input)));
    return Matches(input, regex(re));
}

/++
    Search `input` for the first (leftmost) slice that matches
    the pattern `re`. The most suitable regular expression engine
    is picked depending on the pattern properties.

    The `re` parameter can be one of three types:
    $(UL
      $(LI Plain string(s), which are compiled to bytecode before matching. )
      $(LI Regex!char (wchar/dchar), holding a pattern as compiled bytecode. )
      $(LI StaticRegex!char (wchar/dchar), holding a pattern as
      compiled native machine code. )
    )

    Returns:
    $(LREF Captures) with the extent of the match and all of its submatches
    when there was a match, otherwise an empty $(LREF Captures) object.
+/
public auto matchFirst(R, RegEx)(R input, RegEx re)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    auto caps = matchOnce(input, re);
    return caps;
}

///ditto
public auto matchFirst(R, String)(R input, String re)
if (isSomeString!R && isSomeString!String)
{
    auto caps = matchOnce(input, regex(re));
    return caps;
}

///ditto
public auto matchFirst(R, String)(R input, String[] re...)
if (isSomeString!R && isSomeString!String)
{
    // several patterns: compiled together as one multi-pattern regex
    auto caps = matchOnce(input, regex(re));
    return caps;
}

/++
    Initiate a search for all non-overlapping matches to the pattern `re`
    in the given `input`. The result is a lazy range of matches generated
    as they are encountered in the input going left to right.

    This function picks the most suitable regular expression engine
    depending on the pattern properties.
`re` parameter can be one of three types:
    $(UL
      $(LI Plain string(s), in which case it's compiled to bytecode before matching. )
      $(LI Regex!char (wchar/dchar) that contains a pattern in the form of
        compiled bytecode. )
      $(LI StaticRegex!char (wchar/dchar) that contains a pattern in the form of
        compiled native machine code. )
    )

    Returns:
    $(LREF RegexMatch) object that represents matcher state
    after the first match was found or an empty one if not present.
+/
public auto matchAll(R, RegEx)(R input, RegEx re)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    return input.matchMany(re);
}

///ditto
public auto matchAll(R, String)(R input, String re)
if (isSomeString!R && isSomeString!String)
{
    // single pattern given as a string: compile, then match
    return input.matchMany(regex(re));
}

///ditto
public auto matchAll(R, String)(R input, String[] re...)
if (isSomeString!R && isSomeString!String)
{
    // several patterns given as strings: compiled into one multi-pattern regex
    return input.matchMany(regex(re));
}

// another set of tests just to cover the new API
@system unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.iteration : map;
    import std.conv : to;

    static foreach (String; AliasSeq!(string, wstring, const(dchar)[]))
    {{
        // patterns passed as plain strings
        auto str1 = "blah-bleh".to!String();
        auto pat1 = "bl[ae]h".to!String();
        auto mf = matchFirst(str1, pat1);
        assert(mf.equal(["blah".to!String()]));
        auto mAll = matchAll(str1, pat1);
        assert(mAll.equal!((a,b) => a.equal(b))
            ([["blah".to!String()], ["bleh".to!String()]]));

        // multi-pattern regex compiled at run time
        auto str2 = "1/03/12 - 3/03/12".to!String();
        auto pat2 = regex([r"(\d+)/(\d+)/(\d+)".to!String(), "abc".to!String]);
        auto mf2 = matchFirst(str2, pat2);
        assert(mf2.equal(["1/03/12", "1", "03", "12"].map!(to!String)()));
        auto mAll2 = matchAll(str2, pat2);
        assert(mAll2.front.equal(mf2));
        mAll2.popFront();
        assert(mAll2.front.equal(["3/03/12", "3", "03", "12"].map!(to!String)()));
        mf2.popFrontN(3);
        assert(mf2.equal(["12".to!String()]));

        // compile-time regex with named groups
        auto ctPat = ctRegex!(`(?P<Quot>\d+)/(?P<Denom>\d+)`.to!String());
        auto str = "2 + 34/56 - 6/1".to!String();
        auto cmf = matchFirst(str, ctPat);
        assert(cmf.equal(["34/56", "34", "56"].map!(to!String)()));
        assert(cmf["Quot"] == "34".to!String());
        assert(cmf["Denom"] == "56".to!String());

        auto cmAll = matchAll(str, ctPat);
        assert(cmAll.front.equal(cmf));
        cmAll.popFront();
        assert(cmAll.front.equal(["6/1", "6", "1"].map!(to!String)()));
    }}
}

/++
    Start matching of `input` to regex pattern `re`,
    using traditional $(LINK2 https://en.wikipedia.org/wiki/Backtracking,
    backtracking) matching scheme.

    The use of this function is $(RED discouraged) - use either of
    $(LREF matchAll) or $(LREF matchFirst).

    Delegating the kind of operation
    to "g" flag is soon to be phased out along with the
    ability to choose the exact matching scheme. The choice of
    matching scheme to use depends highly on the pattern kind and
    can be done automatically on a case by case basis.

    Returns: a `RegexMatch` object holding engine
    state after first match.
+/
public auto bmatch(R, RegEx)(R input, RegEx re)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    return RegexMatch!(Unqual!(typeof(input)))(input, re);
}

///ditto
public auto bmatch(R, String)(R input, String re)
if (isSomeString!R && isSomeString!String)
{
    return RegexMatch!(Unqual!(typeof(input)))(input, regex(re));
}

// Produces replacement string from format using captures for substitution.
//
// Recognized sequences in `format`:
//   $N       - submatch number N (decimal)
//   ${name}  - named submatch; the name may consist of ASCII letters,
//              digits and '_' and must not start with a digit
//   $&       - the whole match
//   $`       - input preceding the match
//   $'       - input following the match
//   $$       - a literal '$'
// A '$' followed by any other character is dropped and that character is
// then copied as ordinary text. A '$' at the very end of `format` is an error.
//
// Params:
//   format = the replacement format string
//   captures = captures of the match being replaced
//   sink = output range receiving the generated text
//   ignoreBadSubs = if true, an out of range `$N` produces no output
//                   instead of throwing
package void replaceFmt(R, Capt, OutR)
    (R format, Capt captures, OutR sink, bool ignoreBadSubs = false)
if (isOutputRange!(OutR, ElementEncodingType!R[]) &&
    isOutputRange!(OutR, ElementEncodingType!(Capt.String)[]))
{
    import std.algorithm.searching : find;
    import std.ascii : isDigit, isAlphaNum;
    import std.conv : text, parse;
    import std.exception : enforce;
    enum State { Normal, Dollar }
    auto state = State.Normal;
    size_t offset;
L_Replace_Loop:
    while (!format.empty)
        final switch (state)
        {
        case State.Normal:
            // copy plain text up to the next '$' (or to the end)
            for (offset = 0; offset < format.length; offset++)//no decoding
            {
                if (format[offset] == '$')
                {
                    state = State.Dollar;
                    sink.put(format[0 .. offset]);
                    format = format[offset+1 .. $];//ditto
                    continue L_Replace_Loop;
                }
            }
            sink.put(format[0 .. offset]);
            format = format[offset .. $];
            break;
        case State.Dollar:
            if (isDigit(format[0]))
            {
                // parse consumes the digits from the front of format
                uint digit = parse!uint(format);
                enforce(ignoreBadSubs || digit < captures.length, text("invalid submatch number ", digit));
                if (digit < captures.length)
                    sink.put(captures[digit]);
            }
            else if (format[0] == '{')
            {
                // scan the name up to the first character that can't be a part
                // of it - that character has to be the closing brace
                auto x = find!(a => !(isAlphaNum(a) || a == '_'))(format[1..$]);
                enforce(!x.empty && x[0] == '}', "no matching '}' in replacement format");
                auto name = format[1 .. $ - x.length];
                format = x[1..$];
                enforce(!name.empty && !isDigit(name[0]),
                    "invalid name in ${...} replacement format");
                sink.put(captures[name]);
            }
            else if (format[0] == '&')
            {
                sink.put(captures[0]);
                format = format[1 .. $];
            }
            else if (format[0] == '`')
            {
                sink.put(captures.pre);
                format = format[1 .. $];
            }
            else if (format[0] == '\'')
            {
                sink.put(captures.post);
                format = format[1 .. $];
            }
            else if (format[0] == '$')
            {
                sink.put(format[0 .. 1]);
                format = format[1 .. $];
            }
            state = State.Normal;
            break;
        }
    // still in Dollar state means format ended right after a '$'
    enforce(state == State.Normal, "invalid format string in regex replace");
}

/++
    Construct a new string from `input` by replacing the first match with
    a string generated from it according to the `format` specifier.

    To replace all matches use $(LREF replaceAll).

    Params:
    input = string to search
    re = compiled regular expression to use
    format = _format string to generate replacements from,
    see $(S_LINK Replace _format string, the _format string).

    Returns:
    A string of the same type with the first match (if any) replaced.
    If no match is found returns the input string itself.
+/
public R replaceFirst(R, C, RegEx)(R input, RegEx re, const(C)[] format)
if (isSomeString!R && is(C : dchar) && isRegexFor!(RegEx, R))
{
    return replaceFirstWith!((m, sink) => replaceFmt(format, m, sink))(input, re);
}

///
@system unittest
{
    assert(replaceFirst("noon", regex("n"), "[$&]") == "[n]oon");
}

/++
    This is a general replacement tool that constructs a new string by replacing
    matches of pattern `re` in the `input`. Unlike the other overload
    there is no format string, instead captures are passed
    to a user-defined functor `fun` that returns a new string
    to use as replacement.
This version replaces the first match in `input`,
    see $(LREF replaceAll) to replace all of the matches.

    Returns:
    A new string of the same type as `input` with the first match
    replaced by the return value of `fun`. If no match is found
    returns the `input` itself.
+/
public R replaceFirst(alias fun, R, RegEx)(R input, RegEx re)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    // adapt `fun` (captures -> string) to the (captures, sink) form
    return replaceFirstWith!((caps, dest) => dest.put(fun(caps)))(input, re);
}

///
@system unittest
{
    import std.conv : to;
    string list = "#21 out of 46";
    string newList = replaceFirst!(cap => to!string(to!int(cap.hit)+1))
        (list, regex(`[0-9]+`));
    assert(newList == "#22 out of 46");
}

/++
    A variation on $(LREF replaceFirst) that instead of allocating a new string
    on each call outputs the result piece-wise to the `sink`. In particular
    this enables efficient construction of a final output incrementally.

    Like in $(LREF replaceFirst) family of functions there is an overload
    for the substitution guided by the `format` string
    and the one with the user defined callback.
+/
public @trusted void replaceFirstInto(Sink, R, C, RegEx)
        (ref Sink sink, R input, RegEx re, const(C)[] format)
if (isOutputRange!(Sink, dchar) && isSomeString!R
    && is(C : dchar) && isRegexFor!(RegEx, R))
{
    auto firstMatch = matchFirst(input, re);
    replaceCapturesInto!((caps, dest) => replaceFmt(format, caps, dest))
        (sink, input, firstMatch);
}

///ditto
public @trusted void replaceFirstInto(alias fun, Sink, R, RegEx)
    (Sink sink, R input, RegEx re)
if (isOutputRange!(Sink, dchar) && isSomeString!R && isRegexFor!(RegEx, R))
{
    // note: unlike the format overload this one takes `sink` by value
    auto firstMatch = matchFirst(input, re);
    replaceCapturesInto!fun(sink, input, firstMatch);
}

///
@system unittest
{
    import std.array;
    string m1 = "first message\n";
    string m2 = "second message\n";
    auto result = appender!string();
    replaceFirstInto(result, m1, regex(`([a-z]+) message`), "$1");
    //equivalent of the above with user-defined callback
    replaceFirstInto!(cap=>cap[1])(result, m2, regex(`([a-z]+) message`));
    assert(result.data == "first\nsecond\n");
}

//examples for replaceFirst
@system unittest
{
    import std.conv;
    string list = "#21 out of 46";
    string newList = replaceFirst!(cap => to!string(to!int(cap.hit)+1))
        (list, regex(`[0-9]+`));
    assert(newList == "#22 out of 46");
    import std.array;
    string m1 = "first message\n";
    string m2 = "second message\n";
    auto result = appender!string();
    replaceFirstInto(result, m1, regex(`([a-z]+) message`), "$1");
    //equivalent of the above with user-defined callback
    replaceFirstInto!(cap=>cap[1])(result, m2, regex(`([a-z]+) message`));
    assert(result.data == "first\nsecond\n");
}

/++
    Construct a new string from `input` by replacing all of the
    fragments that match a pattern `re` with a string generated
    from the match according to the `format` specifier.
To replace only the first match use $(LREF replaceFirst).

    Params:
    input = string to search
    re = compiled regular expression to use
    format = _format string to generate replacements from,
    see $(S_LINK Replace _format string, the _format string).

    Returns:
    A string of the same type as `input` with all
    of the matches (if any) replaced.
    If no match is found returns the input string itself.
+/
public @trusted R replaceAll(R, C, RegEx)(R input, RegEx re, const(C)[] format)
if (isSomeString!R && is(C : dchar) && isRegexFor!(RegEx, R))
{
    // each match is expanded through the format string
    return replaceAllWith!((caps, dest) => replaceFmt(format, caps, dest))(input, re);
}

///
@system unittest
{
    // insert comma as thousands delimiter
    auto re = regex(r"(?<=\d)(?=(\d\d\d)+\b)","g");
    assert(replaceAll("12000 + 42100 = 54100", re, ",") == "12,000 + 42,100 = 54,100");
}

/++
    This is a general replacement tool that constructs a new string by replacing
    matches of pattern `re` in the `input`. Unlike the other overload
    there is no format string, instead captures are passed
    to a user-defined functor `fun` that returns a new string
    to use as replacement.

    This version replaces all of the matches found in `input`,
    see $(LREF replaceFirst) to replace the first match only.

    Returns:
    A new string of the same type as `input` with all matches
    replaced by return values of `fun`. If no matches found
    returns the `input` itself.
Params:
    input = string to search
    re = compiled regular expression
    fun = delegate to use
+/
public @trusted R replaceAll(alias fun, R, RegEx)(R input, RegEx re)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    // adapt `fun` (captures -> string) to the (captures, sink) form
    return replaceAllWith!((m, sink) => sink.put(fun(m)))(input, re);
}

///
@system unittest
{
    string baz(Captures!(string) m)
    {
        import std.string : toUpper;
        return toUpper(m.hit);
    }
    // Capitalize the letters 'a' and 'r':
    auto s = replaceAll!(baz)("Strap a rocket engine on a chicken.",
            regex("[ar]"));
    assert(s == "StRAp A Rocket engine on A chicken.");
}

/++
    A variation on $(LREF replaceAll) that instead of allocating a new string
    on each call outputs the result piece-wise to the `sink`. In particular
    this enables efficient construction of a final output incrementally.

    As with $(LREF replaceAll) there are 2 overloads - one with a format string,
    the other one with a user defined functor.
+/
public @trusted void replaceAllInto(Sink, R, C, RegEx)
        (Sink sink, R input, RegEx re, const(C)[] format)
if (isOutputRange!(Sink, dchar) && isSomeString!R
    && is(C : dchar) && isRegexFor!(RegEx, R))
{
    replaceMatchesInto!((m, sink) => replaceFmt(format, m, sink))
        (sink, input, matchAll(input, re));
}

///ditto
public @trusted void replaceAllInto(alias fun, Sink, R, RegEx)
        (Sink sink, R input, RegEx re)
if (isOutputRange!(Sink, dchar) && isSomeString!R && isRegexFor!(RegEx, R))
{
    replaceMatchesInto!fun(sink, input, matchAll(input, re));
}

///
@system unittest
{
    // insert comma as thousands delimiter in fifty randomly produced big numbers
    import std.array, std.conv, std.random, std.range;
    static re = regex(`(?<=\d)(?=(\d\d\d)+\b)`, "g");
    auto sink = appender!(char [])();
    enum ulong min = 10UL ^^ 10, max = 10UL ^^ 19;
    foreach (i; 0 .. 50)
    {
        sink.clear();
        replaceAllInto(sink, text(uniform(min, max)), re, ",");
        // walking back from the end every 4th character has to be a comma
        foreach (pos; iota(sink.data.length - 4, 0, -4))
            assert(sink.data[pos] == ',');
    }
}

// exercise all of the replace APIs
@system unittest
{
    import std.array : appender;
    import std.conv;
    // try and check first/all simple substitution
    static foreach (S; AliasSeq!(string, wstring, dstring, char[], wchar[], dchar[]))
    {{
        S s1 = "curt trial".to!S();
        S s2 = "round dome".to!S();
        S t1F = "court trial".to!S();
        S t2F = "hound dome".to!S();
        S t1A = "court trial".to!S();
        S t2A = "hound home".to!S();
        auto re1 = regex("curt".to!S());
        auto re2 = regex("[dr]o".to!S());

        assert(replaceFirst(s1, re1, "court") == t1F);
        assert(replaceFirst(s2, re2, "ho") == t2F);
        assert(replaceAll(s1, re1, "court") == t1A);
        assert(replaceAll(s2, re2, "ho") == t2A);

        auto rep1 = replaceFirst!(cap => cap[0][0]~"o".to!S()~cap[0][1..$])(s1, re1);
        assert(rep1 == t1F);
        assert(replaceFirst!(cap => "ho".to!S())(s2, re2) == t2F);
        auto rep1A = replaceAll!(cap => cap[0][0]~"o".to!S()~cap[0][1..$])(s1, re1);
        assert(rep1A == t1A);
        assert(replaceAll!(cap => "ho".to!S())(s2, re2) == t2A);

        // the *Into variants keep appending to the same sink
        auto sink = appender!S();
        replaceFirstInto(sink, s1, re1, "court");
        assert(sink.data == t1F);
        replaceFirstInto(sink, s2, re2, "ho");
        assert(sink.data == t1F~t2F);
        replaceAllInto(sink, s1, re1, "court");
        assert(sink.data == t1F~t2F~t1A);
        replaceAllInto(sink, s2, re2, "ho");
        assert(sink.data == t1F~t2F~t1A~t2A);
    }}
}

/++
    Old API for replacement, operation depends on flags of pattern `re`.
    With "g" flag it performs the equivalent of $(LREF replaceAll) otherwise it
    works the same as $(LREF replaceFirst).

    The use of this function is $(RED discouraged), please use $(LREF replaceAll)
    or $(LREF replaceFirst) explicitly.
+/
public R replace(alias scheme = match, R, C, RegEx)(R input, RegEx re, const(C)[] format)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    // honor the matching function selected by the caller
    // (it used to be ignored in favor of a hard-coded `match`)
    return replaceAllWith!((m, sink) => replaceFmt(format, m, sink), scheme)(input, re);
}

///ditto
public R replace(alias fun, R, RegEx)(R input, RegEx re)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    return replaceAllWith!(fun, match)(input, re);
}

/**
Splits a string `r` using a regular expression `pat` as a separator.
Params:
    keepSeparators = flag to specify if the matches should be in the resulting range
    r = the string to split
    pat = the pattern to split on
Returns:
    A lazy range of strings
*/
public struct Splitter(Flag!"keepSeparators" keepSeparators = No.keepSeparators, Range, alias RegEx = Regex)
if (isSomeString!Range && isRegexFor!(RegEx, Range))
{
private:
    Range _input;
    // start of the current piece; a value past _input.length marks exhaustion
    size_t _offset;
    alias Rx = typeof(match(Range.init,RegEx.init));
    Rx _match;

    // true while the current element is a separator rather than a piece
    static if (keepSeparators) bool onMatch = false;

    @trusted this(Range input, RegEx separator)
    {//@@@BUG@@@ generated opAssign of RegexMatch is not @trusted
        _input = input;
        // separators are always searched for globally
        const re = separator.withFlags(separator.flags | RegexOption.global);
        if (_input.empty)
        {
            //there is nothing to match at all, make _offset > 0
            _offset = 1;
            return;
        }

        _match = Rx(_input, re);

        // input starts with a separator: step over the empty leading piece
        static if (keepSeparators)
            if (_match.pre.empty)
                popFront();
    }

public:
    auto ref opSlice()
    {
        return this.save;
    }

    ///Forward range primitives.
    @property Range front()
    {
        import std.algorithm.comparison : min;

        assert(!empty && _offset <= _match.pre.length
                && _match.pre.length <= _input.length);

        static if (keepSeparators)
        {
            // positioned on a separator: yield the matched text itself
            if (onMatch)
                return _match.hit();
        }
        // otherwise the piece runs up to the next separator (or to the end)
        return _input[_offset .. min($, _match.pre.length)];
    }

    ///ditto
    @property bool empty()
    {
        return keepSeparators
            ? _offset >= _input.length
            : _offset > _input.length;
    }

    ///ditto
    void popFront()
    {
        assert(!empty);
        if (_match.empty)
        {
            //No more separators, work is done here
            _offset = _input.length + 1;
            return;
        }

        static if (keepSeparators)
        {
            if (onMatch)
            {
                // leaving a separator: step over it, look for the next one
                _offset += _match.hit.length;
                _match.popFront();
            }
            else
            {
                // leaving a piece: move to where the separator starts
                _offset = _match.pre.length;
            }

            onMatch = !onMatch;
        }
        else
        {
            //skip past the separator
            _offset = _match.pre.length + _match.hit.length;
            _match.popFront();
        }
    }

    ///ditto
    @property auto save()
    {
        return this;
    }
}

/// ditto
public Splitter!(keepSeparators, Range, RegEx) splitter(
    Flag!"keepSeparators" keepSeparators = No.keepSeparators, Range, RegEx)(Range r, RegEx pat)
if (
    is(BasicElementOf!Range : dchar) && isRegexFor!(RegEx, Range))
{
    alias Result = Splitter!(keepSeparators, Range, RegEx);
    return Result(r, pat);
}

///
@system unittest
{
    import std.algorithm.comparison : equal;
    auto s1 = ", abc, de,  fg, hi, ";
    assert(equal(splitter(s1, regex(", *")),
        ["", "abc", "de", "fg", "hi", ""]));
}

/// Split on a pattern, but keep the matches in the resulting range
@system unittest
{
    import std.algorithm.comparison : equal;
    import std.typecons : Yes;

    auto pattern = regex(`([\.,])`);

    assert("2003.04.05"
        .splitter!(Yes.keepSeparators)(pattern)
        .equal(["2003", ".", "04", ".", "05"]));

    assert(",1,2,3"
        .splitter!(Yes.keepSeparators)(pattern)
        .equal([",", "1", ",", "2", ",", "3"]));
}

///An eager version of `splitter` that creates an
/// array with split slices of `input`.
public @trusted String[] split(String, RegEx)(String input, RegEx rx)
if (isSomeString!String && isRegexFor!(RegEx, String))
{
    import std.array : appender;
    auto a = appender!(String[])();
    foreach (e; splitter(input, rx))
        a.put(e);
    return a.data;
}

///Exception object thrown in case of errors during regex compilation.
public alias RegexException = std.regex.internal.ir.RegexException;

/++
  A range that lazily produces a string output escaped
  to be used inside of a regular expression.

  Params:
  r = range of characters to escape

  Returns:
  A range yielding the elements of `r` where each element listed
  in the set of escapable characters is preceded by a backslash.
+/
auto escaper(Range)(Range r)
{
    import std.algorithm.searching : find;
    static immutable escapables = [Escapables];
    static struct Escaper // template to deduce attributes
    {
        Range r;
        // true when the backslash for r.front hasn't been yielded yet
        bool escaped;

        @property ElementType!Range front(){
          if (escaped)
              return '\\';
          else
              return r.front;
        }

        @property bool empty(){ return r.empty; }

        void popFront(){
          // the backslash is consumed first, the character itself follows
          if (escaped) escaped = false;
          else
          {
              r.popFront();
              if (!r.empty && !escapables.find(r.front).empty)
                  escaped = true;
          }
        }

        @property auto save(){ return Escaper(r.save, escaped); }
    }

    // the very first element may need escaping as well
    bool escaped = !r.empty && !escapables.find(r.front).empty;
    return Escaper(r, escaped);
}

///
@system unittest
{
    import std.algorithm.comparison;
    import std.regex;
    string s = `This is {unfriendly} to *regex*`;
    assert(s.escaper.equal(`This is \{unfriendly\} to \*regex\*`));
}

@system unittest
{
    import std.algorithm.comparison;
    import std.conv;
    static foreach (S; AliasSeq!(string, wstring, dstring))
    {{
        auto s = "^".to!S;
        assert(s.escaper.equal(`\^`));
        // empty input, converted so that every string type is exercised
        auto s2 = "".to!S;
        assert(s2.escaper.equal(""));
    }}
}