1 // Written in the D programming language. 2 3 /++ 4 Encode and decode UTF-8, UTF-16 and UTF-32 strings. 5 6 UTF character support is restricted to 7 $(D '\u0000' <= character <= '\U0010FFFF'). 8 9 $(SCRIPT inhibitQuickIndex = 1;) 10 $(DIVC quickindex, 11 $(BOOKTABLE, 12 $(TR $(TH Category) $(TH Functions)) 13 $(TR $(TD Decode) $(TD 14 $(LREF decode) 15 $(LREF decodeFront) 16 )) 17 $(TR $(TD Lazy decode) $(TD 18 $(LREF byCodeUnit) 19 $(LREF byChar) 20 $(LREF byWchar) 21 $(LREF byDchar) 22 $(LREF byUTF) 23 )) 24 $(TR $(TD Encode) $(TD 25 $(LREF encode) 26 $(LREF toUTF8) 27 $(LREF toUTF16) 28 $(LREF toUTF32) 29 $(LREF toUTFz) 30 $(LREF toUTF16z) 31 )) 32 $(TR $(TD Length) $(TD 33 $(LREF codeLength) 34 $(LREF count) 35 $(LREF stride) 36 $(LREF strideBack) 37 )) 38 $(TR $(TD Index) $(TD 39 $(LREF toUCSindex) 40 $(LREF toUTFindex) 41 )) 42 $(TR $(TD Validation) $(TD 43 $(LREF isValidDchar) 44 $(LREF validate) 45 )) 46 $(TR $(TD Miscellaneous) $(TD 47 $(LREF replacementDchar) 48 $(LREF UseReplacementDchar) 49 $(LREF UTFException) 50 )) 51 )) 52 See_Also: 53 $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br> 54 $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br> 55 $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335) 56 Copyright: Copyright The D Language Foundation 2000 - 2012. 57 License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0). 58 Authors: $(HTTP digitalmars.com, Walter Bright) and 59 $(HTTP jmdavisprog.com, Jonathan M Davis) 60 Source: $(PHOBOSSRC std/utf.d) 61 +/ 62 module std.utf; 63 64 import std.exception : basicExceptionCtors; 65 import core.exception : UnicodeException; 66 import std.meta : AliasSeq; 67 import std.range.primitives; 68 import std.traits : isAutodecodableString, isPointer, isSomeChar, 69 isSomeString, isStaticArray, Unqual, isConvertibleToString; 70 import std.typecons : Flag, Yes, No; 71 72 73 /++ 74 Exception thrown on errors in std.utf functions. 75 +/ 76 class UTFException : UnicodeException 77 { 78 import core.internal..string : unsignedToTempString, UnsignedStringBuf; 79 80 uint[4] sequence; 81 size_t len; 82 83 @safe pure nothrow @nogc 84 UTFException setSequence(scope uint[] data...) 85 { 86 assert(data.length <= 4); 87 88 len = data.length < 4 ? data.length : 4; 89 sequence[0 .. len] = data[0 .. len]; 90 91 return this; 92 } 93 94 // FIXME: Use std.exception.basicExceptionCtors here once 95 // https://issues.dlang.org/show_bug.cgi?id=11500 is fixed 96 97 /** 98 Standard exception constructors. 99 */ 100 this(string msg, string file = __FILE__, size_t line = __LINE__, 101 Throwable next = null) @nogc @safe pure nothrow 102 { 103 super(msg, 0, file, line, next); 104 } 105 /// ditto 106 this(string msg, size_t index, string file = __FILE__, 107 size_t line = __LINE__, Throwable next = null) @safe pure nothrow 108 { 109 UnsignedStringBuf buf = void; 110 msg ~= " (at index " ~ unsignedToTempString(index, buf) ~ ")"; 111 super(msg, index, file, line, next); 112 } 113 114 /** 115 Returns: 116 A `string` detailing the invalid UTF sequence. 117 */ 118 override string toString() const 119 { 120 if (len == 0) 121 { 122 /* Exception.toString() is not marked as const, although 123 * it is const-compatible. 124 */ 125 //return super.toString(); 126 auto e = () @trusted { return cast(Exception) super; } (); 127 return e.toString(); 128 } 129 130 string result = "Invalid UTF sequence:"; 131 132 foreach (i; sequence[0 .. len]) 133 { 134 UnsignedStringBuf buf = void; 135 result ~= ' '; 136 auto h = unsignedToTempString!16(i, buf); 137 if (h.length == 1) 138 result ~= '0'; 139 result ~= h; 140 result ~= 'x'; 141 } 142 143 if (super.msg.length > 0) 144 { 145 result ~= " - "; 146 result ~= super.msg; 147 } 148 149 return result; 150 } 151 } 152 153 /// 154 @safe unittest 155 { 156 import std.exception : assertThrown; 157 158 char[4] buf; 159 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800)); 160 assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF)); 161 assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00)); 162 assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF)); 163 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000)); 164 } 165 166 /* 167 Provide array of invalidly encoded UTF strings. Useful for testing. 168 169 Params: 170 Char = char, wchar, or dchar 171 172 Returns: 173 an array of invalidly encoded UTF strings 174 */ 175 176 package auto invalidUTFstrings(Char)() @safe pure @nogc nothrow 177 if (isSomeChar!Char) 178 { 179 static if (is(Char == char)) 180 { 181 enum x = 0xDC00; // invalid surrogate value 182 enum y = 0x110000; // out of range 183 184 static immutable string[8] result = 185 [ 186 "\x80", // not a start byte 187 "\xC0", // truncated 188 "\xC0\xC0", // invalid continuation 189 "\xF0\x82\x82\xAC", // overlong 190 [ 191 0xE0 | (x >> 12), 192 0x80 | ((x >> 6) & 0x3F), 193 0x80 | (x & 0x3F) 194 ], 195 [ 196 cast(char)(0xF0 | (y >> 18)), 197 cast(char)(0x80 | ((y >> 12) & 0x3F)), 198 cast(char)(0x80 | ((y >> 6) & 0x3F)), 199 cast(char)(0x80 | (y & 0x3F)) 200 ], 201 [ 202 cast(char)(0xF8 | 3), // 5 byte encoding 203 cast(char)(0x80 | 3), 204 cast(char)(0x80 | 3), 205 cast(char)(0x80 | 3), 206 cast(char)(0x80 | 3), 207 ], 208 [ 209 cast(char)(0xFC | 3), // 6 byte encoding 210 cast(char)(0x80 | 3), 211 cast(char)(0x80 | 3), 212 cast(char)(0x80 | 3), 213 cast(char)(0x80 | 3), 214 cast(char)(0x80 | 3), 215 ], 216 ]; 217 218 return result[]; 219 } 220 else static if (is(Char == wchar)) 221 { 222 static immutable wstring[5] result = 223 [ 224 [ 225 cast(wchar) 0xDC00, 226 ], 227 [ 228 cast(wchar) 0xDFFF, 229 ], 230 [ 231 cast(wchar) 0xDBFF, 232 cast(wchar) 0xDBFF, 233 ], 234 [ 235 cast(wchar) 0xDBFF, 236 cast(wchar) 0xE000, 237 ], 238 [ 239 cast(wchar) 0xD800, 240 ], 241 ]; 242 243 return result[]; 244 } 245 else static if (is(Char == dchar)) 246 { 247 static immutable dstring[3] result = 248 [ 249 [ cast(dchar) 0x110000 ], 250 [ cast(dchar) 0x00D800 ], 251 [ cast(dchar) 0x00DFFF ], 252 ]; 253 254 return result; 255 } 256 else 257 static assert(0); 258 } 259 260 /++ 261 Check whether the given Unicode code point is valid. 262 263 Params: 264 c = code point to check 265 266 Returns: 267 `true` if and only if `c` is a valid Unicode code point 268 269 Note: 270 `'\uFFFE'` and `'\uFFFF'` are considered valid by `isValidDchar`, 271 as they are permitted for internal use by an application, but they are 272 not allowed for interchange by the Unicode standard. 273 +/ 274 bool isValidDchar(dchar c) pure nothrow @safe @nogc 275 { 276 return c < 0xD800 || (c > 0xDFFF && c <= 0x10FFFF); 277 } 278 279 /// 280 @safe @nogc pure nothrow unittest 281 { 282 assert( isValidDchar(cast(dchar) 0x41)); 283 assert( isValidDchar(cast(dchar) 0x00)); 284 assert(!isValidDchar(cast(dchar) 0xD800)); 285 assert(!isValidDchar(cast(dchar) 0x11FFFF)); 286 } 287 288 pure nothrow @safe @nogc unittest 289 { 290 import std.exception; 291 292 assertCTFEable!( 293 { 294 assert( isValidDchar(cast(dchar)'a') == true); 295 assert( isValidDchar(cast(dchar) 0x1FFFFF) == false); 296 297 assert(!isValidDchar(cast(dchar) 0x00D800)); 298 assert(!isValidDchar(cast(dchar) 0x00DBFF)); 299 assert(!isValidDchar(cast(dchar) 0x00DC00)); 300 assert(!isValidDchar(cast(dchar) 0x00DFFF)); 301 assert( isValidDchar(cast(dchar) 0x00FFFE)); 302 assert( isValidDchar(cast(dchar) 0x00FFFF)); 303 assert( isValidDchar(cast(dchar) 0x01FFFF)); 304 assert( isValidDchar(cast(dchar) 0x10FFFF)); 305 assert(!isValidDchar(cast(dchar) 0x110000)); 306 }); 307 } 308 309 310 /++ 311 Calculate the length of the UTF sequence starting at `index` 312 in `str`. 313 314 Params: 315 str = $(REF_ALTTEXT input range, isInputRange, std,range,primitives) 316 of UTF code units. Must be random access if `index` is passed 317 index = starting index of UTF sequence (default: `0`) 318 319 Returns: 320 The number of code units in the UTF sequence. For UTF-8, this is a 321 value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)). 322 For UTF-16, it is either 1 or 2. For UTF-32, it is always 1. 323 324 Throws: 325 May throw a `UTFException` if `str[index]` is not the start of a 326 valid UTF sequence. 327 328 Note: 329 `stride` will only analyze the first `str[index]` element. It 330 will not fully verify the validity of the UTF sequence, nor even verify 331 the presence of the sequence: it will not actually guarantee that 332 $(D index + stride(str, index) <= str.length). 333 +/ 334 uint stride(S)(auto ref S str, size_t index) 335 if (is(S : const char[]) || 336 (isRandomAccessRange!S && is(immutable ElementType!S == immutable char))) 337 { 338 static if (is(typeof(str.length) : ulong)) 339 assert(index < str.length, "Past the end of the UTF-8 sequence"); 340 immutable c = str[index]; 341 342 if (c < 0x80) 343 return 1; 344 else 345 return strideImpl(c, index); 346 } 347 348 /// Ditto 349 uint stride(S)(auto ref S str) 350 if (is(S : const char[]) || 351 (isInputRange!S && is(immutable ElementType!S == immutable char))) 352 { 353 static if (is(S : const char[])) 354 immutable c = str[0]; 355 else 356 immutable c = str.front; 357 358 if (c < 0x80) 359 return 1; 360 else 361 return strideImpl(c, 0); 362 } 363 364 @system unittest 365 { 366 import core.exception : AssertError; 367 import std.conv : to; 368 import std.exception; 369 import std..string : format; 370 import std.traits : FunctionAttribute, functionAttributes, isSafe; 371 static void test(string s, dchar c, size_t i = 0, size_t line = __LINE__) 372 { 373 enforce(stride(s, i) == codeLength!char(c), 374 new AssertError(format("Unit test failure string: %s", s), __FILE__, line)); 375 376 enforce(stride(RandomCU!char(s), i) == codeLength!char(c), 377 new AssertError(format("Unit test failure range: %s", s), __FILE__, line)); 378 379 auto refRandom = new RefRandomCU!char(s); 380 immutable randLen = refRandom.length; 381 enforce(stride(refRandom, i) == codeLength!char(c), 382 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line)); 383 enforce(refRandom.length == randLen, 384 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line)); 385 386 if (i == 0) 387 { 388 enforce(stride(s) == codeLength!char(c), 389 new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line)); 390 391 enforce(stride(InputCU!char(s)) == codeLength!char(c), 392 new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line)); 393 394 auto refBidir = new RefBidirCU!char(s); 395 immutable bidirLen = refBidir.length; 396 enforce(stride(refBidir) == codeLength!char(c), 397 new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line)); 398 enforce(refBidir.length == bidirLen, 399 new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line)); 400 } 401 } 402 403 assertCTFEable!( 404 { 405 test("a", 'a'); 406 test(" ", ' '); 407 test("\u2029", '\u2029'); //paraSep 408 test("\u0100", '\u0100'); 409 test("\u0430", '\u0430'); 410 test("\U00010143", '\U00010143'); 411 test("abcdefcdef", 'a'); 412 test("hello\U00010143\u0100\U00010143", 'h', 0); 413 test("hello\U00010143\u0100\U00010143", 'e', 1); 414 test("hello\U00010143\u0100\U00010143", 'l', 2); 415 test("hello\U00010143\u0100\U00010143", 'l', 3); 416 test("hello\U00010143\u0100\U00010143", 'o', 4); 417 test("hello\U00010143\u0100\U00010143", '\U00010143', 5); 418 test("hello\U00010143\u0100\U00010143", '\u0100', 9); 419 test("hello\U00010143\u0100\U00010143", '\U00010143', 11); 420 421 foreach (S; AliasSeq!(char[], const char[], string)) 422 { 423 enum str = to!S("hello world"); 424 static assert(isSafe!({ stride(str, 0); })); 425 static assert(isSafe!({ stride(str); })); 426 static assert((functionAttributes!({ stride(str, 0); }) & FunctionAttribute.pure_) != 0); 427 static assert((functionAttributes!({ stride(str); }) & FunctionAttribute.pure_) != 0); 428 } 429 }); 430 } 431 432 @safe unittest // invalid start bytes 433 { 434 import std.exception : assertThrown; 435 immutable char[] invalidStartBytes = [ 436 0b1111_1000, // indicating a sequence length of 5 437 0b1111_1100, // 6 438 0b1111_1110, // 7 439 0b1111_1111, // 8 440 0b1000_0000, // continuation byte 441 ]; 442 foreach (c; invalidStartBytes) 443 assertThrown!UTFException(stride([c])); 444 } 445 446 /// Ditto 447 uint stride(S)(auto ref S str, size_t index) 448 if (is(S : const wchar[]) || 449 (isRandomAccessRange!S && is(immutable ElementType!S == immutable wchar))) 450 { 451 static if (is(typeof(str.length) : ulong)) 452 assert(index < str.length, "Past the end of the UTF-16 sequence"); 453 immutable uint u = str[index]; 454 return 1 + (u >= 0xD800 && u <= 0xDBFF); 455 } 456 457 /// Ditto 458 uint stride(S)(auto ref S str) @safe pure 459 if (is(S : const wchar[])) 460 { 461 return stride(str, 0); 462 } 463 464 /// Ditto 465 uint stride(S)(auto ref S str) 466 if (isInputRange!S && is(immutable ElementType!S == immutable wchar) && 467 !is(S : const wchar[])) 468 { 469 assert(!str.empty, "UTF-16 sequence is empty"); 470 immutable uint u = str.front; 471 return 1 + (u >= 0xD800 && u <= 0xDBFF); 472 } 473 474 @system unittest 475 { 476 import core.exception : AssertError; 477 import std.conv : to; 478 import std.exception; 479 import std..string : format; 480 import std.traits : FunctionAttribute, functionAttributes, isSafe; 481 static void test(wstring s, dchar c, size_t i = 0, size_t line = __LINE__) 482 { 483 enforce(stride(s, i) == codeLength!wchar(c), 484 new AssertError(format("Unit test failure string: %s", s), __FILE__, line)); 485 486 enforce(stride(RandomCU!wchar(s), i) == codeLength!wchar(c), 487 new AssertError(format("Unit test failure range: %s", s), __FILE__, line)); 488 489 auto refRandom = new RefRandomCU!wchar(s); 490 immutable randLen = refRandom.length; 491 enforce(stride(refRandom, i) == codeLength!wchar(c), 492 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line)); 493 enforce(refRandom.length == randLen, 494 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line)); 495 496 if (i == 0) 497 { 498 enforce(stride(s) == codeLength!wchar(c), 499 new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line)); 500 501 enforce(stride(InputCU!wchar(s)) == codeLength!wchar(c), 502 new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line)); 503 504 auto refBidir = new RefBidirCU!wchar(s); 505 immutable bidirLen = refBidir.length; 506 enforce(stride(refBidir) == codeLength!wchar(c), 507 new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line)); 508 enforce(refBidir.length == bidirLen, 509 new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line)); 510 } 511 } 512 513 assertCTFEable!( 514 { 515 test("a", 'a'); 516 test(" ", ' '); 517 test("\u2029", '\u2029'); //paraSep 518 test("\u0100", '\u0100'); 519 test("\u0430", '\u0430'); 520 test("\U00010143", '\U00010143'); 521 test("abcdefcdef", 'a'); 522 test("hello\U00010143\u0100\U00010143", 'h', 0); 523 test("hello\U00010143\u0100\U00010143", 'e', 1); 524 test("hello\U00010143\u0100\U00010143", 'l', 2); 525 test("hello\U00010143\u0100\U00010143", 'l', 3); 526 test("hello\U00010143\u0100\U00010143", 'o', 4); 527 test("hello\U00010143\u0100\U00010143", '\U00010143', 5); 528 test("hello\U00010143\u0100\U00010143", '\u0100', 7); 529 test("hello\U00010143\u0100\U00010143", '\U00010143', 8); 530 531 foreach (S; AliasSeq!(wchar[], const wchar[], wstring)) 532 { 533 enum str = to!S("hello world"); 534 static assert(isSafe!(() => stride(str, 0))); 535 static assert(isSafe!(() => stride(str) )); 536 static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0); 537 static assert((functionAttributes!(() => stride(str) ) & FunctionAttribute.pure_) != 0); 538 } 539 }); 540 } 541 542 /// Ditto 543 uint stride(S)(auto ref S str, size_t index = 0) 544 if (is(S : const dchar[]) || 545 (isInputRange!S && is(immutable ElementEncodingType!S == immutable dchar))) 546 { 547 static if (is(typeof(str.length) : ulong)) 548 assert(index < str.length, "Past the end of the UTF-32 sequence"); 549 else 550 assert(!str.empty, "UTF-32 sequence is empty."); 551 return 1; 552 } 553 554 /// 555 @safe unittest 556 { 557 assert("a".stride == 1); 558 assert("λ".stride == 2); 559 assert("aλ".stride == 1); 560 assert("aλ".stride(1) == 2); 561 assert("𐐷".stride == 4); 562 } 563 564 @system unittest 565 { 566 import core.exception : AssertError; 567 import std.conv : to; 568 import std.exception; 569 import std..string : format; 570 import std.traits : FunctionAttribute, functionAttributes, isSafe; 571 static void test(dstring s, dchar c, size_t i = 0, size_t line = __LINE__) 572 { 573 enforce(stride(s, i) == codeLength!dchar(c), 574 new AssertError(format("Unit test failure string: %s", s), __FILE__, line)); 575 576 enforce(stride(RandomCU!dchar(s), i) == codeLength!dchar(c), 577 new AssertError(format("Unit test failure range: %s", s), __FILE__, line)); 578 579 auto refRandom = new RefRandomCU!dchar(s); 580 immutable randLen = refRandom.length; 581 enforce(stride(refRandom, i) == codeLength!dchar(c), 582 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line)); 583 enforce(refRandom.length == randLen, 584 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line)); 585 586 if (i == 0) 587 { 588 enforce(stride(s) == codeLength!dchar(c), 589 new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line)); 590 591 enforce(stride(InputCU!dchar(s)) == codeLength!dchar(c), 592 new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line)); 593 594 auto refBidir = new RefBidirCU!dchar(s); 595 immutable bidirLen = refBidir.length; 596 enforce(stride(refBidir) == codeLength!dchar(c), 597 new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line)); 598 enforce(refBidir.length == bidirLen, 599 new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line)); 600 } 601 } 602 603 assertCTFEable!( 604 { 605 test("a", 'a'); 606 test(" ", ' '); 607 test("\u2029", '\u2029'); //paraSep 608 test("\u0100", '\u0100'); 609 test("\u0430", '\u0430'); 610 test("\U00010143", '\U00010143'); 611 test("abcdefcdef", 'a'); 612 test("hello\U00010143\u0100\U00010143", 'h', 0); 613 test("hello\U00010143\u0100\U00010143", 'e', 1); 614 test("hello\U00010143\u0100\U00010143", 'l', 2); 615 test("hello\U00010143\u0100\U00010143", 'l', 3); 616 test("hello\U00010143\u0100\U00010143", 'o', 4); 617 test("hello\U00010143\u0100\U00010143", '\U00010143', 5); 618 test("hello\U00010143\u0100\U00010143", '\u0100', 6); 619 test("hello\U00010143\u0100\U00010143", '\U00010143', 7); 620 621 foreach (S; AliasSeq!(dchar[], const dchar[], dstring)) 622 { 623 enum str = to!S("hello world"); 624 static assert(isSafe!(() => stride(str, 0))); 625 static assert(isSafe!(() => stride(str) )); 626 static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0); 627 static assert((functionAttributes!(() => stride(str) ) & FunctionAttribute.pure_) != 0); 628 } 629 }); 630 } 631 632 private uint strideImpl(char c, size_t index) @trusted pure 633 in { assert(c & 0x80); } 634 do 635 { 636 import core.bitop : bsr; 637 immutable msbs = 7 - bsr((~uint(c)) & 0xFF); 638 if (c == 0xFF || msbs < 2 || msbs > 4) 639 throw new UTFException("Invalid UTF-8 sequence", index); 640 return msbs; 641 } 642 643 /++ 644 Calculate the length of the UTF sequence ending one code unit before 645 `index` in `str`. 646 647 Params: 648 str = bidirectional range of UTF code units. Must be random access if 649 `index` is passed 650 index = index one past end of UTF sequence (default: `str.length`) 651 652 Returns: 653 The number of code units in the UTF sequence. For UTF-8, this is a 654 value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)). 655 For UTF-16, it is either 1 or 2. For UTF-32, it is always 1. 656 657 Throws: 658 May throw a `UTFException` if `str[index]` is not one past the 659 end of a valid UTF sequence. 660 661 Note: 662 `strideBack` will only analyze the element at $(D str[index - 1]) 663 element. It will not fully verify the validity of the UTF sequence, nor 664 even verify the presence of the sequence: it will not actually 665 guarantee that $(D strideBack(str, index) <= index). 666 +/ 667 uint strideBack(S)(auto ref S str, size_t index) 668 if (is(S : const char[]) || 669 (isRandomAccessRange!S && is(immutable ElementType!S == immutable char))) 670 { 671 static if (is(typeof(str.length) : ulong)) 672 assert(index <= str.length, "Past the end of the UTF-8 sequence"); 673 assert(index > 0, "Not the end of the UTF-8 sequence"); 674 675 if ((str[index-1] & 0b1100_0000) != 0b1000_0000) 676 return 1; 677 678 if (index >= 4) //single verification for most common case 679 { 680 static foreach (i; 2 .. 5) 681 { 682 if ((str[index-i] & 0b1100_0000) != 0b1000_0000) 683 return i; 684 } 685 } 686 else 687 { 688 static foreach (i; 2 .. 4) 689 { 690 if (index >= i && (str[index-i] & 0b1100_0000) != 0b1000_0000) 691 return i; 692 } 693 } 694 throw new UTFException("Not the end of the UTF sequence", index); 695 } 696 697 /// Ditto 698 uint strideBack(S)(auto ref S str) 699 if (is(S : const char[]) || 700 (isRandomAccessRange!S && hasLength!S && is(immutable ElementType!S == immutable char))) 701 { 702 return strideBack(str, str.length); 703 } 704 705 /// Ditto 706 uint strideBack(S)(auto ref S str) 707 if (isBidirectionalRange!S && is(immutable ElementType!S == immutable char) && !isRandomAccessRange!S) 708 { 709 assert(!str.empty, "Past the end of the UTF-8 sequence"); 710 auto temp = str.save; 711 foreach (i; AliasSeq!(1, 2, 3, 4)) 712 { 713 if ((temp.back & 0b1100_0000) != 0b1000_0000) 714 return i; 715 temp.popBack(); 716 if (temp.empty) 717 break; 718 } 719 throw new UTFException("The last code unit is not the end of the UTF-8 sequence"); 720 } 721 722 @system unittest 723 { 724 import core.exception : AssertError; 725 import std.conv : to; 726 import std.exception; 727 import std..string : format; 728 import std.traits : FunctionAttribute, functionAttributes, isSafe; 729 static void test(string s, dchar c, size_t i = size_t.max, size_t line = __LINE__) 730 { 731 enforce(strideBack(s, i == size_t.max ? s.length : i) == codeLength!char(c), 732 new AssertError(format("Unit test failure string: %s", s), __FILE__, line)); 733 734 enforce(strideBack(RandomCU!char(s), i == size_t.max ? s.length : i) == codeLength!char(c), 735 new AssertError(format("Unit test failure range: %s", s), __FILE__, line)); 736 737 auto refRandom = new RefRandomCU!char(s); 738 immutable randLen = refRandom.length; 739 enforce(strideBack(refRandom, i == size_t.max ? s.length : i) == codeLength!char(c), 740 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line)); 741 enforce(refRandom.length == randLen, 742 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line)); 743 744 if (i == size_t.max) 745 { 746 enforce(strideBack(s) == codeLength!char(c), 747 new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line)); 748 749 enforce(strideBack(BidirCU!char(s)) == codeLength!char(c), 750 new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line)); 751 752 auto refBidir = new RefBidirCU!char(s); 753 immutable bidirLen = refBidir.length; 754 enforce(strideBack(refBidir) == codeLength!char(c), 755 new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line)); 756 enforce(refBidir.length == bidirLen, 757 new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line)); 758 } 759 } 760 761 assertCTFEable!( 762 { 763 test("a", 'a'); 764 test(" ", ' '); 765 test("\u2029", '\u2029'); //paraSep 766 test("\u0100", '\u0100'); 767 test("\u0430", '\u0430'); 768 test("\U00010143", '\U00010143'); 769 test("abcdefcdef", 'f'); 770 test("\U00010143\u0100\U00010143hello", 'o', 15); 771 test("\U00010143\u0100\U00010143hello", 'l', 14); 772 test("\U00010143\u0100\U00010143hello", 'l', 13); 773 test("\U00010143\u0100\U00010143hello", 'e', 12); 774 test("\U00010143\u0100\U00010143hello", 'h', 11); 775 test("\U00010143\u0100\U00010143hello", '\U00010143', 10); 776 test("\U00010143\u0100\U00010143hello", '\u0100', 6); 777 test("\U00010143\u0100\U00010143hello", '\U00010143', 4); 778 779 foreach (S; AliasSeq!(char[], const char[], string)) 780 { 781 enum str = to!S("hello world"); 782 static assert(isSafe!({ strideBack(str, 0); })); 783 static assert(isSafe!({ strideBack(str); })); 784 static assert((functionAttributes!({ strideBack(str, 0); }) & FunctionAttribute.pure_) != 0); 785 static assert((functionAttributes!({ strideBack(str); }) & FunctionAttribute.pure_) != 0); 786 } 787 }); 788 } 789 790 //UTF-16 is self synchronizing: The length of strideBack can be found from 791 //the value of a single wchar 792 /// Ditto 793 uint strideBack(S)(auto ref S str, size_t index) 794 if (is(S : const wchar[]) || 795 (isRandomAccessRange!S && is(immutable ElementType!S == immutable wchar))) 796 { 797 static if (is(typeof(str.length) : ulong)) 798 assert(index <= str.length, "Past the end of the UTF-16 sequence"); 799 assert(index > 0, "Not the end of a UTF-16 sequence"); 800 801 immutable c2 = str[index-1]; 802 return 1 + (0xDC00 <= c2 && c2 < 0xE000); 803 } 804 805 /// Ditto 806 uint strideBack(S)(auto ref S str) 807 if (is(S : const wchar[]) || 808 (isBidirectionalRange!S && is(immutable ElementType!S == immutable wchar))) 809 { 810 assert(!str.empty, "UTF-16 sequence is empty"); 811 812 static if (is(S : const(wchar)[])) 813 immutable c2 = str[$ - 1]; 814 else 815 immutable c2 = str.back; 816 817 return 1 + (0xDC00 <= c2 && c2 <= 0xE000); 818 } 819 820 @system unittest 821 { 822 import core.exception : AssertError; 823 import std.conv : to; 824 import std.exception; 825 import std..string : format; 826 import std.traits : FunctionAttribute, functionAttributes, isSafe; 827 static void test(wstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__) 828 { 829 enforce(strideBack(s, i == size_t.max ? s.length : i) == codeLength!wchar(c), 830 new AssertError(format("Unit test failure string: %s", s), __FILE__, line)); 831 832 enforce(strideBack(RandomCU!wchar(s), i == size_t.max ? s.length : i) == codeLength!wchar(c), 833 new AssertError(format("Unit test failure range: %s", s), __FILE__, line)); 834 835 auto refRandom = new RefRandomCU!wchar(s); 836 immutable randLen = refRandom.length; 837 enforce(strideBack(refRandom, i == size_t.max ? s.length : i) == codeLength!wchar(c), 838 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line)); 839 enforce(refRandom.length == randLen, 840 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line)); 841 842 if (i == size_t.max) 843 { 844 enforce(strideBack(s) == codeLength!wchar(c), 845 new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line)); 846 847 enforce(strideBack(BidirCU!wchar(s)) == codeLength!wchar(c), 848 new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line)); 849 850 auto refBidir = new RefBidirCU!wchar(s); 851 immutable bidirLen = refBidir.length; 852 enforce(strideBack(refBidir) == codeLength!wchar(c), 853 new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line)); 854 enforce(refBidir.length == bidirLen, 855 new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line)); 856 } 857 } 858 859 assertCTFEable!( 860 { 861 test("a", 'a'); 862 test(" ", ' '); 863 test("\u2029", '\u2029'); //paraSep 864 test("\u0100", '\u0100'); 865 test("\u0430", '\u0430'); 866 test("\U00010143", '\U00010143'); 867 test("abcdefcdef", 'f'); 868 test("\U00010143\u0100\U00010143hello", 'o', 10); 869 test("\U00010143\u0100\U00010143hello", 'l', 9); 870 test("\U00010143\u0100\U00010143hello", 'l', 8); 871 test("\U00010143\u0100\U00010143hello", 'e', 7); 872 test("\U00010143\u0100\U00010143hello", 'h', 6); 873 test("\U00010143\u0100\U00010143hello", '\U00010143', 5); 874 test("\U00010143\u0100\U00010143hello", '\u0100', 3); 875 test("\U00010143\u0100\U00010143hello", '\U00010143', 2); 876 877 foreach (S; AliasSeq!(wchar[], const wchar[], wstring)) 878 { 879 enum str = to!S("hello world"); 880 static assert(isSafe!(() => strideBack(str, 0))); 881 static assert(isSafe!(() => strideBack(str) )); 882 static assert((functionAttributes!(() => strideBack(str, 0)) & FunctionAttribute.pure_) != 0); 883 static assert((functionAttributes!(() => strideBack(str) ) & FunctionAttribute.pure_) != 0); 884 } 885 }); 886 } 887 888 /// Ditto 889 uint strideBack(S)(auto ref S str, size_t index) 890 if (isRandomAccessRange!S && is(immutable ElementEncodingType!S == immutable dchar)) 891 { 892 static if (is(typeof(str.length) : ulong)) 893 assert(index <= str.length, "Past the end of the UTF-32 sequence"); 894 assert(index > 0, "Not the end of the UTF-32 sequence"); 895 return 1; 896 } 897 898 /// Ditto 899 uint strideBack(S)(auto ref S str) 900 if (isBidirectionalRange!S && is(immutable ElementEncodingType!S == immutable dchar)) 901 { 902 assert(!str.empty, "Empty UTF-32 sequence"); 903 return 1; 904 } 905 906 /// 907 @safe unittest 908 { 909 assert("a".strideBack == 1); 910 assert("λ".strideBack == 2); 911 assert("aλ".strideBack == 2); 912 assert("aλ".strideBack(1) == 1); 913 assert("𐐷".strideBack == 4); 914 } 915 916 @system unittest 917 { 918 import core.exception : AssertError; 919 import std.conv : to; 920 import std.exception; 921 import std..string : format; 922 import std.traits : FunctionAttribute, functionAttributes, isSafe; 923 static void test(dstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__) 924 { 925 enforce(strideBack(s, i == size_t.max ? s.length : i) == codeLength!dchar(c), 926 new AssertError(format("Unit test failure string: %s", s), __FILE__, line)); 927 928 enforce(strideBack(RandomCU!dchar(s), i == size_t.max ? s.length : i) == codeLength!dchar(c), 929 new AssertError(format("Unit test failure range: %s", s), __FILE__, line)); 930 931 auto refRandom = new RefRandomCU!dchar(s); 932 immutable randLen = refRandom.length; 933 enforce(strideBack(refRandom, i == size_t.max ? s.length : i) == codeLength!dchar(c), 934 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line)); 935 enforce(refRandom.length == randLen, 936 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line)); 937 938 if (i == size_t.max) 939 { 940 enforce(strideBack(s) == codeLength!dchar(c), 941 new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line)); 942 943 enforce(strideBack(BidirCU!dchar(s)) == codeLength!dchar(c), 944 new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line)); 945 946 auto refBidir = new RefBidirCU!dchar(s); 947 immutable bidirLen = refBidir.length; 948 enforce(strideBack(refBidir) == codeLength!dchar(c), 949 new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line)); 950 enforce(refBidir.length == bidirLen, 951 new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line)); 952 } 953 } 954 955 assertCTFEable!( 956 { 957 test("a", 'a'); 958 test(" ", ' '); 959 test("\u2029", '\u2029'); //paraSep 960 test("\u0100", '\u0100'); 961 test("\u0430", '\u0430'); 962 test("\U00010143", '\U00010143'); 963 test("abcdefcdef", 'f'); 964 test("\U00010143\u0100\U00010143hello", 'o', 8); 965 test("\U00010143\u0100\U00010143hello", 'l', 7); 966 test("\U00010143\u0100\U00010143hello", 'l', 6); 967 test("\U00010143\u0100\U00010143hello", 'e', 5); 968 test("\U00010143\u0100\U00010143hello", 'h', 4); 969 test("\U00010143\u0100\U00010143hello", '\U00010143', 3); 970 test("\U00010143\u0100\U00010143hello", '\u0100', 2); 971 test("\U00010143\u0100\U00010143hello", '\U00010143', 1); 972 973 foreach (S; AliasSeq!(dchar[], const dchar[], dstring)) 974 { 975 enum str = to!S("hello world"); 976 static assert(isSafe!(() => strideBack(str, 0))); 977 static assert(isSafe!(() => strideBack(str) )); 978 static assert((functionAttributes!(() => strideBack(str, 0)) & FunctionAttribute.pure_) != 0); 979 static assert((functionAttributes!(() => strideBack(str) ) & FunctionAttribute.pure_) != 0); 980 } 981 }); 982 } 983 984 985 /++ 986 Given `index` into `str` and assuming that `index` is at the start 987 of a UTF sequence, `toUCSindex` determines the number of UCS characters 988 up to `index`. So, `index` is the index of a code unit at the 989 beginning of a code point, and the return value is how many code points into 990 the string that that code point is. 991 +/ 992 size_t toUCSindex(C)(const(C)[] str, size_t index) @safe pure 993 if (isSomeChar!C) 994 { 995 static if (is(immutable C == immutable dchar)) 996 return index; 997 else 998 { 999 size_t n = 0; 1000 size_t j = 0; 1001 1002 for (; j < index; ++n) 1003 j += stride(str, j); 1004 1005 if (j > index) 1006 { 1007 static if (is(immutable C == immutable char)) 1008 throw new UTFException("Invalid UTF-8 sequence", index); 1009 else 1010 throw new UTFException("Invalid UTF-16 sequence", index); 1011 } 1012 1013 return n; 1014 } 1015 } 1016 1017 /// 1018 @safe unittest 1019 { 1020 assert(toUCSindex(`hello world`, 7) == 7); 1021 assert(toUCSindex(`hello world`w, 7) == 7); 1022 assert(toUCSindex(`hello world`d, 7) == 7); 1023 1024 assert(toUCSindex(`Ma Chérie`, 7) == 6); 1025 assert(toUCSindex(`Ma Chérie`w, 7) == 7); 1026 assert(toUCSindex(`Ma Chérie`d, 7) == 7); 1027 1028 assert(toUCSindex(`さいごの果実 / ミツバチと科学者`, 9) == 3); 1029 assert(toUCSindex(`さいごの果実 / ミツバチと科学者`w, 9) == 9); 1030 assert(toUCSindex(`さいごの果実 / ミツバチと科学者`d, 9) == 9); 1031 } 1032 1033 1034 /++ 1035 Given a UCS index `n` into `str`, returns the UTF index. 1036 So, `n` is how many code points into the string the code point is, and 1037 the array index of the code unit is returned. 1038 +/ 1039 size_t toUTFindex(C)(const(C)[] str, size_t n) @safe pure 1040 if (isSomeChar!C) 1041 { 1042 static if (is(immutable C == immutable dchar)) 1043 { 1044 return n; 1045 } 1046 else 1047 { 1048 size_t i; 1049 while (n--) 1050 { 1051 i += stride(str, i); 1052 } 1053 return i; 1054 } 1055 } 1056 1057 /// 1058 @safe unittest 1059 { 1060 assert(toUTFindex(`hello world`, 7) == 7); 1061 assert(toUTFindex(`hello world`w, 7) == 7); 1062 assert(toUTFindex(`hello world`d, 7) == 7); 1063 1064 assert(toUTFindex(`Ma Chérie`, 6) == 7); 1065 assert(toUTFindex(`Ma Chérie`w, 7) == 7); 1066 assert(toUTFindex(`Ma Chérie`d, 7) == 7); 1067 1068 assert(toUTFindex(`さいごの果実 / ミツバチと科学者`, 3) == 9); 1069 assert(toUTFindex(`さいごの果実 / ミツバチと科学者`w, 9) == 9); 1070 assert(toUTFindex(`さいごの果実 / ミツバチと科学者`d, 9) == 9); 1071 } 1072 1073 1074 /* =================== Decode ======================= */ 1075 1076 /// Whether or not to replace invalid UTF with $(LREF replacementDchar) 1077 alias UseReplacementDchar = Flag!"useReplacementDchar"; 1078 1079 // Reduce distinct instantiations of decodeImpl. 1080 private template TypeForDecode(T) 1081 { 1082 import std.traits : isDynamicArray; 1083 static if (isDynamicArray!T && is(T : E[], E) && __traits(isArithmetic, E) && !is(E == shared)) 1084 alias TypeForDecode = const(Unqual!E)[]; 1085 else 1086 alias TypeForDecode = T; 1087 } 1088 1089 /++ 1090 Decodes and returns the code point starting at `str[index]`. `index` 1091 is advanced to one past the decoded code point. If the code point is not 1092 well-formed, then a `UTFException` is thrown and `index` remains 1093 unchanged. 1094 1095 decode will only work with strings and random access ranges of code units 1096 with length and slicing, whereas $(LREF decodeFront) will work with any 1097 input range of code units. 1098 1099 Params: 1100 useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing 1101 str = input string or indexable Range 1102 index = starting index into s[]; incremented by number of code units processed 1103 1104 Returns: 1105 decoded character 1106 1107 Throws: 1108 $(LREF UTFException) if `str[index]` is not the start of a valid UTF 1109 sequence and useReplacementDchar is `No.useReplacementDchar` 1110 +/ 1111 dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(auto ref S str, ref size_t index) 1112 if (!isSomeString!S && 1113 isRandomAccessRange!S && hasSlicing!S && hasLength!S && isSomeChar!(ElementType!S)) 1114 in 1115 { 1116 assert(index < str.length, "Attempted to decode past the end of a string"); 1117 } 1118 out (result) 1119 { 1120 assert(isValidDchar(result)); 1121 } 1122 do 1123 { 1124 if (str[index] < codeUnitLimit!S) 1125 return str[index++]; 1126 else 1127 return decodeImpl!(true, useReplacementDchar)(cast(TypeForDecode!S) str, index); 1128 } 1129 1130 /// ditto 1131 dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)( 1132 auto ref S str, ref size_t index) @trusted pure 1133 if (isSomeString!S) 1134 in 1135 { 1136 assert(index < str.length, "Attempted to decode past the end of a string"); 1137 } 1138 out (result) 1139 { 1140 assert(isValidDchar(result)); 1141 } 1142 do 1143 { 1144 if (str[index] < codeUnitLimit!S) 1145 return str[index++]; 1146 else 1147 return decodeImpl!(true, useReplacementDchar)(cast(TypeForDecode!S) str, index); 1148 } 1149 1150 /// 1151 @safe pure unittest 1152 { 1153 size_t i; 1154 1155 assert("a".decode(i) == 'a' && i == 1); 1156 i = 0; 1157 assert("å".decode(i) == 'å' && i == 2); 1158 i = 1; 1159 assert("aå".decode(i) == 'å' && i == 3); 1160 i = 0; 1161 assert("å"w.decode(i) == 'å' && i == 1); 1162 1163 // ë as a multi-code point grapheme 1164 i = 0; 1165 assert("e\u0308".decode(i) == 'e' && i == 1); 1166 // ë as a single code point grapheme 1167 i = 0; 1168 assert("ë".decode(i) == 'ë' && i == 2); 1169 i = 0; 1170 assert("ë"w.decode(i) == 'ë' && i == 1); 1171 } 1172 1173 /++ 1174 `decodeFront` is a variant of $(LREF decode) which specifically decodes 1175 the first code point. Unlike $(LREF decode), `decodeFront` accepts any 1176 $(REF_ALTTEXT input range, isInputRange, std,range,primitives) 1177 of code units (rather than just a string or random access 1178 range). It also takes the range by `ref` and pops off the elements as it 1179 decodes them. If `numCodeUnits` is passed in, it gets set to the number 1180 of code units which were in the code point which was decoded. 1181 1182 Params: 1183 useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing 1184 str = input string or indexable Range 1185 numCodeUnits = set to number of code units processed 1186 1187 Returns: 1188 decoded character 1189 1190 Throws: 1191 $(LREF UTFException) if `str.front` is not the start of a valid UTF 1192 sequence. If an exception is thrown, then there is no guarantee as to 1193 the number of code units which were popped off, as it depends on the 1194 type of range being used and how many code units had to be popped off 1195 before the code point was determined to be invalid. 1196 +/ 1197 dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)( 1198 ref S str, out size_t numCodeUnits) 1199 if (!isSomeString!S && isInputRange!S && isSomeChar!(ElementType!S)) 1200 in 1201 { 1202 assert(!str.empty); 1203 } 1204 out (result) 1205 { 1206 assert(isValidDchar(result)); 1207 } 1208 do 1209 { 1210 immutable fst = str.front; 1211 1212 if (fst < codeUnitLimit!S) 1213 { 1214 str.popFront(); 1215 numCodeUnits = 1; 1216 return fst; 1217 } 1218 else 1219 { 1220 // https://issues.dlang.org/show_bug.cgi?id=14447 forces canIndex to be 1221 // done outside of decodeImpl, which is undesirable, since not all 1222 // overloads of decodeImpl need it. So, it should be moved back into 1223 // decodeImpl once https://issues.dlang.org/show_bug.cgi?id=8521 1224 // has been fixed. 1225 enum canIndex = is(S : const char[]) || isRandomAccessRange!S && hasSlicing!S && hasLength!S; 1226 immutable retval = decodeImpl!(canIndex, useReplacementDchar)(cast(TypeForDecode!S) str, numCodeUnits); 1227 1228 // The other range types were already popped by decodeImpl. 1229 static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S) 1230 str = str[numCodeUnits .. str.length]; 1231 1232 return retval; 1233 } 1234 } 1235 1236 /// ditto 1237 dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)( 1238 ref S str, out size_t numCodeUnits) @trusted pure 1239 if (isSomeString!S) 1240 in 1241 { 1242 assert(!str.empty); 1243 } 1244 out (result) 1245 { 1246 assert(isValidDchar(result)); 1247 } 1248 do 1249 { 1250 if (str[0] < codeUnitLimit!S) 1251 { 1252 numCodeUnits = 1; 1253 immutable retval = str[0]; 1254 str = str[1 .. $]; 1255 return retval; 1256 } 1257 else 1258 { 1259 immutable retval = decodeImpl!(true, useReplacementDchar)(cast(TypeForDecode!S) str, numCodeUnits); 1260 str = str[numCodeUnits .. $]; 1261 return retval; 1262 } 1263 } 1264 1265 /++ Ditto +/ 1266 dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str) 1267 if (isInputRange!S && isSomeChar!(ElementType!S)) 1268 { 1269 size_t numCodeUnits; 1270 return decodeFront!useReplacementDchar(str, numCodeUnits); 1271 } 1272 1273 /// 1274 @safe pure unittest 1275 { 1276 import std.range.primitives; 1277 string str = "Hello, World!"; 1278 1279 assert(str.decodeFront == 'H' && str == "ello, World!"); 1280 str = "å"; 1281 assert(str.decodeFront == 'å' && str.empty); 1282 str = "å"; 1283 size_t i; 1284 assert(str.decodeFront(i) == 'å' && i == 2 && str.empty); 1285 } 1286 1287 /++ 1288 `decodeBack` is a variant of $(LREF decode) which specifically decodes 1289 the last code point. Unlike $(LREF decode), `decodeBack` accepts any 1290 bidirectional range of code units (rather than just a string or random access 1291 range). It also takes the range by `ref` and pops off the elements as it 1292 decodes them. If `numCodeUnits` is passed in, it gets set to the number 1293 of code units which were in the code point which was decoded. 1294 1295 Params: 1296 useReplacementDchar = if invalid UTF, return `replacementDchar` rather than throwing 1297 str = input string or bidirectional Range 1298 numCodeUnits = gives the number of code units processed 1299 1300 Returns: 1301 A decoded UTF character. 1302 1303 Throws: 1304 $(LREF UTFException) if `str.back` is not the end of a valid UTF 1305 sequence. If an exception is thrown, the `str` itself remains unchanged, 1306 but there is no guarantee as to the value of `numCodeUnits` (when passed). 1307 +/ 1308 dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)( 1309 ref S str, out size_t numCodeUnits) 1310 if (isSomeString!S) 1311 in 1312 { 1313 assert(!str.empty); 1314 } 1315 out (result) 1316 { 1317 assert(isValidDchar(result)); 1318 } 1319 do 1320 { 1321 if (str[$ - 1] < codeUnitLimit!S) 1322 { 1323 numCodeUnits = 1; 1324 immutable retval = str[$ - 1]; 1325 str = str[0 .. $ - 1]; 1326 return retval; 1327 } 1328 else 1329 { 1330 numCodeUnits = strideBack(str); 1331 immutable newLength = str.length - numCodeUnits; 1332 size_t index = newLength; 1333 immutable retval = decodeImpl!(true, useReplacementDchar)(cast(TypeForDecode!S) str, index); 1334 str = str[0 .. newLength]; 1335 return retval; 1336 } 1337 } 1338 1339 /++ Ditto +/ 1340 dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)( 1341 ref S str, out size_t numCodeUnits) 1342 if (!isSomeString!S && isSomeChar!(ElementType!S) && isBidirectionalRange!S 1343 && ((isRandomAccessRange!S && hasLength!S) || !isRandomAccessRange!S)) 1344 in 1345 { 1346 assert(!str.empty); 1347 } 1348 out (result) 1349 { 1350 assert(isValidDchar(result)); 1351 } 1352 do 1353 { 1354 if (str.back < codeUnitLimit!S) 1355 { 1356 numCodeUnits = 1; 1357 immutable retval = str.back; 1358 str.popBack(); 1359 return retval; 1360 } 1361 else 1362 { 1363 numCodeUnits = strideBack(str); 1364 static if (isRandomAccessRange!S) 1365 { 1366 size_t index = str.length - numCodeUnits; 1367 immutable retval = decodeImpl!(true, useReplacementDchar)(cast(TypeForDecode!S) str, index); 1368 str.popBackExactly(numCodeUnits); 1369 return retval; 1370 } 1371 else 1372 { 1373 alias Char = Unqual!(ElementType!S); 1374 Char[4] codeUnits; 1375 S tmp = str.save; 1376 for (size_t i = numCodeUnits; i > 0; ) 1377 { 1378 codeUnits[--i] = tmp.back; 1379 tmp.popBack(); 1380 } 1381 const Char[] codePoint = codeUnits[0 .. numCodeUnits]; 1382 size_t index = 0; 1383 immutable retval = decodeImpl!(true, useReplacementDchar)( 1384 cast(TypeForDecode!(typeof(codePoint))) codePoint, index); 1385 str = tmp; 1386 return retval; 1387 } 1388 } 1389 } 1390 1391 /++ Ditto +/ 1392 dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str) 1393 if (isSomeString!S 1394 || (isRandomAccessRange!S && hasLength!S && isSomeChar!(ElementType!S)) 1395 || (!isRandomAccessRange!S && isBidirectionalRange!S && isSomeChar!(ElementType!S))) 1396 in 1397 { 1398 assert(!str.empty); 1399 } 1400 out (result) 1401 { 1402 assert(isValidDchar(result)); 1403 } 1404 do 1405 { 1406 size_t numCodeUnits; 1407 return decodeBack!useReplacementDchar(str, numCodeUnits); 1408 } 1409 1410 /// 1411 @system pure unittest 1412 { 1413 import std.range.primitives; 1414 string str = "Hello, World!"; 1415 1416 assert(str.decodeBack == '!' && str == "Hello, World"); 1417 str = "å"; 1418 assert(str.decodeBack == 'å' && str.empty); 1419 str = "å"; 1420 size_t i; 1421 assert(str.decodeBack(i) == 'å' && i == 2 && str.empty); 1422 } 1423 1424 // Gives the maximum value that a code unit for the given range type can hold. 1425 package template codeUnitLimit(S) 1426 if (isSomeChar!(ElementEncodingType!S)) 1427 { 1428 static if (is(immutable ElementEncodingType!S == immutable char)) 1429 enum char codeUnitLimit = 0x80; 1430 else static if (is(immutable ElementEncodingType!S == immutable wchar)) 1431 enum wchar codeUnitLimit = 0xD800; 1432 else 1433 enum dchar codeUnitLimit = 0xD800; 1434 } 1435 1436 /* 1437 * For strings, this function does its own bounds checking to give a 1438 * more useful error message when attempting to decode past the end of a string. 1439 * Subsequently it uses a pointer instead of an array to avoid 1440 * redundant bounds checking. 1441 * 1442 * The three overloads of this operate on chars, wchars, and dchars. 1443 * 1444 * Params: 1445 * canIndex = if S is indexable 1446 * useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing 1447 * str = input string or Range 1448 * index = starting index into s[]; incremented by number of code units processed 1449 * 1450 * Returns: 1451 * decoded character 1452 */ 1453 private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)( 1454 auto ref S str, ref size_t index) 1455 if ( 1456 is(S : const char[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable char))) 1457 { 1458 /* The following encodings are valid, except for the 5 and 6 byte 1459 * combinations: 1460 * 0xxxxxxx 1461 * 110xxxxx 10xxxxxx 1462 * 1110xxxx 10xxxxxx 10xxxxxx 1463 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 1464 * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 1465 * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 1466 */ 1467 1468 /* Dchar bitmask for different numbers of UTF-8 code units. 1469 */ 1470 alias bitMask = AliasSeq!((1 << 7) - 1, (1 << 11) - 1, (1 << 16) - 1, (1 << 21) - 1); 1471 1472 static if (is(S : const char[])) 1473 auto pstr = str.ptr + index; // this is what makes decodeImpl() @system code 1474 else static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S) 1475 auto pstr = str[index .. str.length]; 1476 else 1477 alias pstr = str; 1478 1479 // https://issues.dlang.org/show_bug.cgi?id=14447 forces this to be done 1480 // outside of decodeImpl 1481 //enum canIndex = is(S : const char[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S); 1482 1483 static if (canIndex) 1484 { 1485 immutable length = str.length - index; 1486 ubyte fst = pstr[0]; 1487 } 1488 else 1489 { 1490 ubyte fst = pstr.front; 1491 pstr.popFront(); 1492 } 1493 1494 static if (!useReplacementDchar) 1495 { 1496 static if (canIndex) 1497 { 1498 static UTFException exception(S)(S str, string msg) 1499 { 1500 uint[4] sequence = void; 1501 size_t i; 1502 1503 do 1504 { 1505 sequence[i] = str[i]; 1506 } while (++i < str.length && i < 4 && (str[i] & 0xC0) == 0x80); 1507 1508 return new UTFException(msg, i).setSequence(sequence[0 .. i]); 1509 } 1510 } 1511 1512 UTFException invalidUTF() 1513 { 1514 static if (canIndex) 1515 return exception(pstr[0 .. length], "Invalid UTF-8 sequence"); 1516 else 1517 { 1518 //We can't include the invalid sequence with input strings without 1519 //saving each of the code units along the way, and we can't do it with 1520 //forward ranges without saving the entire range. Both would incur a 1521 //cost for the decoding of every character just to provide a better 1522 //error message for the (hopefully) rare case when an invalid UTF-8 1523 //sequence is encountered, so we don't bother trying to include the 1524 //invalid sequence here, unlike with strings and sliceable ranges. 1525 return new UTFException("Invalid UTF-8 sequence"); 1526 } 1527 } 1528 1529 UTFException outOfBounds() 1530 { 1531 static if (canIndex) 1532 return exception(pstr[0 .. length], "Attempted to decode past the end of a string"); 1533 else 1534 return new UTFException("Attempted to decode past the end of a string"); 1535 } 1536 } 1537 1538 if ((fst & 0b1100_0000) != 0b1100_0000) 1539 { 1540 static if (useReplacementDchar) 1541 { 1542 ++index; // always consume bad input to avoid infinite loops 1543 return replacementDchar; 1544 } 1545 else 1546 throw invalidUTF(); // starter must have at least 2 first bits set 1547 } 1548 ubyte tmp = void; 1549 dchar d = fst; // upper control bits are masked out later 1550 fst <<= 1; 1551 1552 foreach (i; AliasSeq!(1, 2, 3)) 1553 { 1554 1555 static if (canIndex) 1556 { 1557 if (i == length) 1558 { 1559 static if (useReplacementDchar) 1560 { 1561 index += i; 1562 return replacementDchar; 1563 } 1564 else 1565 throw outOfBounds(); 1566 } 1567 } 1568 else 1569 { 1570 if (pstr.empty) 1571 { 1572 static if (useReplacementDchar) 1573 { 1574 index += i; 1575 return replacementDchar; 1576 } 1577 else 1578 throw outOfBounds(); 1579 } 1580 } 1581 1582 static if (canIndex) 1583 tmp = pstr[i]; 1584 else 1585 { 1586 tmp = pstr.front; 1587 pstr.popFront(); 1588 } 1589 1590 if ((tmp & 0xC0) != 0x80) 1591 { 1592 static if (useReplacementDchar) 1593 { 1594 index += i + 1; 1595 return replacementDchar; 1596 } 1597 else 1598 throw invalidUTF(); 1599 } 1600 1601 d = (d << 6) | (tmp & 0x3F); 1602 fst <<= 1; 1603 1604 if (!(fst & 0x80)) // no more bytes 1605 { 1606 d &= bitMask[i]; // mask out control bits 1607 1608 // overlong, could have been encoded with i bytes 1609 if ((d & ~bitMask[i - 1]) == 0) 1610 { 1611 static if (useReplacementDchar) 1612 { 1613 index += i + 1; 1614 return replacementDchar; 1615 } 1616 else 1617 throw invalidUTF(); 1618 } 1619 1620 // check for surrogates only needed for 3 bytes 1621 static if (i == 2) 1622 { 1623 if (!isValidDchar(d)) 1624 { 1625 static if (useReplacementDchar) 1626 { 1627 index += i + 1; 1628 return replacementDchar; 1629 } 1630 else 1631 throw invalidUTF(); 1632 } 1633 } 1634 1635 index += i + 1; 1636 static if (i == 3) 1637 { 1638 if (d > dchar.max) 1639 { 1640 static if (useReplacementDchar) 1641 d = replacementDchar; 1642 else 1643 throw invalidUTF(); 1644 } 1645 } 1646 return d; 1647 } 1648 } 1649 1650 static if (useReplacementDchar) 1651 { 1652 index += 4; // read 4 chars by now 1653 return replacementDchar; 1654 } 1655 else 1656 throw invalidUTF(); 1657 } 1658 1659 @safe pure @nogc nothrow 1660 unittest 1661 { 1662 // Add tests for useReplacemendDchar == yes path 1663 1664 static struct R 1665 { 1666 @safe pure @nogc nothrow: 1667 this(string s) { this.s = s; } 1668 @property bool empty() { return idx == s.length; } 1669 @property char front() { return s[idx]; } 1670 void popFront() { ++idx; } 1671 size_t idx; 1672 string s; 1673 } 1674 1675 foreach (s; invalidUTFstrings!char()) 1676 { 1677 auto r = R(s); 1678 size_t index; 1679 dchar dc = decodeImpl!(false, Yes.useReplacementDchar)(r, index); 1680 assert(dc == replacementDchar); 1681 assert(1 <= index && index <= s.length); 1682 } 1683 } 1684 1685 private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S) 1686 (auto ref S str, ref size_t index) 1687 if (is(S : const wchar[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable wchar))) 1688 { 1689 static if (is(S : const wchar[])) 1690 auto pstr = str.ptr + index; 1691 else static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S) 1692 auto pstr = str[index .. str.length]; 1693 else 1694 alias pstr = str; 1695 1696 // https://issues.dlang.org/show_bug.cgi?id=14447 forces this to be done 1697 // outside of decodeImpl 1698 //enum canIndex = is(S : const wchar[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S); 1699 1700 static if (canIndex) 1701 { 1702 immutable length = str.length - index; 1703 uint u = pstr[0]; 1704 } 1705 else 1706 { 1707 uint u = pstr.front; 1708 pstr.popFront(); 1709 } 1710 1711 static if (!useReplacementDchar) 1712 { 1713 UTFException exception(string msg) 1714 { 1715 static if (canIndex) 1716 return new UTFException(msg).setSequence(pstr[0]); 1717 else 1718 return new UTFException(msg); 1719 } 1720 } 1721 1722 // The < case must be taken care of before decodeImpl is called. 1723 assert(u >= 0xD800); 1724 1725 if (u <= 0xDBFF) 1726 { 1727 static if (canIndex) 1728 immutable onlyOneCodeUnit = length == 1; 1729 else 1730 immutable onlyOneCodeUnit = pstr.empty; 1731 1732 if (onlyOneCodeUnit) 1733 { 1734 static if (useReplacementDchar) 1735 { 1736 ++index; 1737 return replacementDchar; 1738 } 1739 else 1740 throw exception("surrogate UTF-16 high value past end of string"); 1741 } 1742 1743 static if (canIndex) 1744 immutable uint u2 = pstr[1]; 1745 else 1746 { 1747 immutable uint u2 = pstr.front; 1748 pstr.popFront(); 1749 } 1750 1751 if (u2 < 0xDC00 || u2 > 0xDFFF) 1752 { 1753 static if (useReplacementDchar) 1754 u = replacementDchar; 1755 else 1756 throw exception("surrogate UTF-16 low value out of range"); 1757 } 1758 else 1759 u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00); 1760 ++index; 1761 } 1762 else if (u >= 0xDC00 && u <= 0xDFFF) 1763 { 1764 static if (useReplacementDchar) 1765 u = replacementDchar; 1766 else 1767 throw exception("unpaired surrogate UTF-16 value"); 1768 } 1769 ++index; 1770 1771 // Note: u+FFFE and u+FFFF are specifically permitted by the 1772 // Unicode standard for application internal use (see isValidDchar) 1773 1774 return cast(dchar) u; 1775 } 1776 1777 @safe pure @nogc nothrow 1778 unittest 1779 { 1780 // Add tests for useReplacemendDchar == true path 1781 1782 static struct R 1783 { 1784 @safe pure @nogc nothrow: 1785 this(wstring s) { this.s = s; } 1786 @property bool empty() { return idx == s.length; } 1787 @property wchar front() { return s[idx]; } 1788 void popFront() { ++idx; } 1789 size_t idx; 1790 wstring s; 1791 } 1792 1793 foreach (s; invalidUTFstrings!wchar()) 1794 { 1795 auto r = R(s); 1796 size_t index; 1797 dchar dc = decodeImpl!(false, Yes.useReplacementDchar)(r, index); 1798 assert(dc == replacementDchar); 1799 assert(1 <= index && index <= s.length); 1800 } 1801 } 1802 1803 private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)( 1804 auto ref S str, ref size_t index) 1805 if (is(S : const dchar[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable dchar))) 1806 { 1807 static if (is(S : const dchar[])) 1808 auto pstr = str.ptr; 1809 else 1810 alias pstr = str; 1811 1812 static if (is(S : const dchar[]) || isRandomAccessRange!S) 1813 { 1814 dchar dc = pstr[index]; 1815 if (!isValidDchar(dc)) 1816 { 1817 static if (useReplacementDchar) 1818 dc = replacementDchar; 1819 else 1820 throw new UTFException("Invalid UTF-32 value").setSequence(dc); 1821 } 1822 ++index; 1823 return dc; 1824 } 1825 else 1826 { 1827 dchar dc = pstr.front; 1828 if (!isValidDchar(dc)) 1829 { 1830 static if (useReplacementDchar) 1831 dc = replacementDchar; 1832 else 1833 throw new UTFException("Invalid UTF-32 value").setSequence(dc); 1834 } 1835 ++index; 1836 pstr.popFront(); 1837 return dc; 1838 } 1839 } 1840 1841 @safe pure @nogc nothrow 1842 unittest 1843 { 1844 // Add tests for useReplacemendDchar == true path 1845 1846 static struct R 1847 { 1848 @safe pure @nogc nothrow: 1849 this(dstring s) { this.s = s; } 1850 @property bool empty() { return idx == s.length; } 1851 @property dchar front() { return s[idx]; } 1852 void popFront() { ++idx; } 1853 size_t idx; 1854 dstring s; 1855 } 1856 1857 foreach (s; invalidUTFstrings!dchar()) 1858 { 1859 auto r = R(s); 1860 size_t index; 1861 dchar dc = decodeImpl!(false, Yes.useReplacementDchar)(r, index); 1862 assert(dc == replacementDchar); 1863 assert(1 <= index && index <= s.length); 1864 } 1865 } 1866 1867 1868 version (StdUnittest) private void testDecode(R)(R range, 1869 size_t index, 1870 dchar expectedChar, 1871 size_t expectedIndex, 1872 size_t line = __LINE__) 1873 { 1874 import core.exception : AssertError; 1875 import std.exception : enforce; 1876 import std..string : format; 1877 import std.traits : isNarrowString; 1878 1879 static if (hasLength!R) 1880 immutable lenBefore = range.length; 1881 1882 static if (isRandomAccessRange!R && !isNarrowString!R) 1883 { 1884 { 1885 immutable result = decode(range, index); 1886 enforce(result == expectedChar, 1887 new AssertError(format("decode: Wrong character: %s", result), __FILE__, line)); 1888 enforce(index == expectedIndex, 1889 new AssertError(format("decode: Wrong index: %s", index), __FILE__, line)); 1890 static if (hasLength!R) 1891 { 1892 enforce(range.length == lenBefore, 1893 new AssertError(format("decode: length changed: %s", range.length), __FILE__, line)); 1894 } 1895 } 1896 } 1897 } 1898 1899 version (StdUnittest) private void testDecodeFront(R)(ref R range, 1900 dchar expectedChar, 1901 size_t expectedNumCodeUnits, 1902 size_t line = __LINE__) 1903 { 1904 import core.exception : AssertError; 1905 import std.exception : enforce; 1906 import std..string : format; 1907 1908 static if (hasLength!R) 1909 immutable lenBefore = range.length; 1910 1911 size_t numCodeUnits; 1912 immutable result = decodeFront(range, numCodeUnits); 1913 enforce(result == expectedChar, 1914 new AssertError(format("decodeFront: Wrong character: %s", result), __FILE__, line)); 1915 enforce(numCodeUnits == expectedNumCodeUnits, 1916 new AssertError(format("decodeFront: Wrong numCodeUnits: %s", numCodeUnits), __FILE__, line)); 1917 1918 static if (hasLength!R) 1919 { 1920 enforce(range.length == lenBefore - numCodeUnits, 1921 new AssertError(format("decodeFront: wrong length: %s", range.length), __FILE__, line)); 1922 } 1923 } 1924 1925 version (StdUnittest) private void testDecodeBack(R)(ref R range, 1926 dchar expectedChar, 1927 size_t expectedNumCodeUnits, 1928 size_t line = __LINE__) 1929 { 1930 // This condition is to allow unit testing all `decode` functions together 1931 static if (!isBidirectionalRange!R) 1932 return; 1933 else 1934 { 1935 import core.exception : AssertError; 1936 import std.exception : enforce; 1937 import std..string : format; 1938 1939 static if (hasLength!R) 1940 immutable lenBefore = range.length; 1941 1942 size_t numCodeUnits; 1943 immutable result = decodeBack(range, numCodeUnits); 1944 enforce(result == expectedChar, 1945 new AssertError(format("decodeBack: Wrong character: %s", result), __FILE__, line)); 1946 enforce(numCodeUnits == expectedNumCodeUnits, 1947 new AssertError(format("decodeBack: Wrong numCodeUnits: %s", numCodeUnits), __FILE__, line)); 1948 1949 static if (hasLength!R) 1950 { 1951 enforce(range.length == lenBefore - numCodeUnits, 1952 new AssertError(format("decodeBack: wrong length: %s", range.length), __FILE__, line)); 1953 } 1954 } 1955 } 1956 1957 version (StdUnittest) private void testAllDecode(R)(R range, 1958 dchar expectedChar, 1959 size_t expectedIndex, 1960 size_t line = __LINE__) 1961 { 1962 testDecode(range, 0, expectedChar, expectedIndex, line); 1963 static if (isBidirectionalRange!R) 1964 { 1965 auto rangeCopy = range.save; 1966 testDecodeBack(rangeCopy, expectedChar, expectedIndex, line); 1967 } 1968 testDecodeFront(range, expectedChar, expectedIndex, line); 1969 } 1970 1971 version (StdUnittest) private void testBadDecode(R)(R range, size_t index, size_t line = __LINE__) 1972 { 1973 import core.exception : AssertError; 1974 import std.exception : assertThrown, enforce; 1975 import std..string : format; 1976 1977 immutable initialIndex = index; 1978 1979 static if (hasLength!R) 1980 immutable lenBefore = range.length; 1981 1982 static if (isRandomAccessRange!R) 1983 { 1984 assertThrown!UTFException(decode(range, index), null, __FILE__, line); 1985 enforce(index == initialIndex, 1986 new AssertError(format("decode: Wrong index: %s", index), __FILE__, line)); 1987 static if (hasLength!R) 1988 { 1989 enforce(range.length == lenBefore, 1990 new AssertError(format("decode: length changed:", range.length), __FILE__, line)); 1991 } 1992 } 1993 1994 if (initialIndex == 0) 1995 assertThrown!UTFException(decodeFront(range, index), null, __FILE__, line); 1996 } 1997 1998 version (StdUnittest) private void testBadDecodeBack(R)(R range, size_t line = __LINE__) 1999 { 2000 // This condition is to allow unit testing all `decode` functions together 2001 static if (!isBidirectionalRange!R) 2002 return; 2003 else 2004 { 2005 import core.exception : AssertError; 2006 import std.exception : assertThrown, enforce; 2007 import std..string : format; 2008 2009 static if (hasLength!R) 2010 immutable lenBefore = range.length; 2011 2012 static if (isRandomAccessRange!R) 2013 { 2014 assertThrown!UTFException(decodeBack(range), null, __FILE__, line); 2015 static if (hasLength!R) 2016 { 2017 enforce(range.length == lenBefore, 2018 new AssertError(format("decodeBack: length changed:", range.length), __FILE__, line)); 2019 } 2020 } 2021 } 2022 } 2023 2024 @system unittest 2025 { 2026 import std.conv : to; 2027 import std.exception; 2028 2029 assertCTFEable!( 2030 { 2031 foreach (S; AliasSeq!(to!string, InputCU!char, RandomCU!char, 2032 (string s) => new RefBidirCU!char(s), 2033 (string s) => new RefRandomCU!char(s))) 2034 { 2035 enum sHasLength = hasLength!(typeof(S("abcd"))); 2036 2037 { 2038 auto range = S("abcd"); 2039 testDecode(range, 0, 'a', 1); 2040 testDecode(range, 1, 'b', 2); 2041 testDecodeFront(range, 'a', 1); 2042 testDecodeFront(range, 'b', 1); 2043 assert(decodeFront(range) == 'c'); 2044 assert(decodeFront(range) == 'd'); 2045 } 2046 2047 { 2048 auto range = S("ウェブサイト"); 2049 testDecode(range, 0, 'ウ', 3); 2050 testDecode(range, 3, 'ェ', 6); 2051 testDecodeFront(range, 'ウ', 3); 2052 testDecodeFront(range, 'ェ', 3); 2053 assert(decodeFront(range) == 'ブ'); 2054 assert(decodeFront(range) == 'サ'); 2055 } 2056 2057 { 2058 auto range = S("abcd"); 2059 testDecodeBack(range, 'd', 1); 2060 testDecodeBack(range, 'c', 1); 2061 testDecodeBack(range, 'b', 1); 2062 testDecodeBack(range, 'a', 1); 2063 } 2064 2065 { 2066 auto range = S("ウェブサイト"); 2067 testDecodeBack(range, 'ト', 3); 2068 testDecodeBack(range, 'イ', 3); 2069 testDecodeBack(range, 'サ', 3); 2070 testDecodeBack(range, 'ブ', 3); 2071 } 2072 2073 testAllDecode(S("\xC2\xA9"), '\u00A9', 2); 2074 testAllDecode(S("\xE2\x89\xA0"), '\u2260', 3); 2075 2076 foreach (str; ["\xE2\x89", // too short 2077 "\xC0\x8A", 2078 "\xE0\x80\x8A", 2079 "\xF0\x80\x80\x8A", 2080 "\xF8\x80\x80\x80\x8A", 2081 "\xFC\x80\x80\x80\x80\x8A"]) 2082 { 2083 testBadDecode(S(str), 0); 2084 testBadDecode(S(str), 1); 2085 testBadDecodeBack(S(str)); 2086 } 2087 2088 //Invalid UTF-8 sequence where the first code unit is valid. 2089 testAllDecode(S("\xEF\xBF\xBE"), cast(dchar) 0xFFFE, 3); 2090 testAllDecode(S("\xEF\xBF\xBF"), cast(dchar) 0xFFFF, 3); 2091 2092 //Invalid UTF-8 sequence where the first code unit isn't valid. 2093 foreach (str; ["\xED\xA0\x80", 2094 "\xED\xAD\xBF", 2095 "\xED\xAE\x80", 2096 "\xED\xAF\xBF", 2097 "\xED\xB0\x80", 2098 "\xED\xBE\x80", 2099 "\xED\xBF\xBF"]) 2100 { 2101 testBadDecode(S(str), 0); 2102 testBadDecodeBack(S(str)); 2103 } 2104 } 2105 }); 2106 } 2107 2108 @system unittest 2109 { 2110 import std.exception; 2111 assertCTFEable!( 2112 { 2113 foreach (S; AliasSeq!((wstring s) => s, InputCU!wchar, RandomCU!wchar, 2114 (wstring s) => new RefBidirCU!wchar(s), 2115 (wstring s) => new RefRandomCU!wchar(s))) 2116 { 2117 testAllDecode(S([cast(wchar) 0x1111]), cast(dchar) 0x1111, 1); 2118 testAllDecode(S([cast(wchar) 0xD800, cast(wchar) 0xDC00]), cast(dchar) 0x10000, 2); 2119 testAllDecode(S([cast(wchar) 0xDBFF, cast(wchar) 0xDFFF]), cast(dchar) 0x10FFFF, 2); 2120 testAllDecode(S([cast(wchar) 0xFFFE]), cast(dchar) 0xFFFE, 1); 2121 testAllDecode(S([cast(wchar) 0xFFFF]), cast(dchar) 0xFFFF, 1); 2122 2123 testBadDecode(S([ cast(wchar) 0xD801 ]), 0); 2124 testBadDecode(S([ cast(wchar) 0xD800, cast(wchar) 0x1200 ]), 0); 2125 2126 testBadDecodeBack(S([ cast(wchar) 0xD801 ])); 2127 testBadDecodeBack(S([ cast(wchar) 0x0010, cast(wchar) 0xD800 ])); 2128 2129 { 2130 auto range = S("ウェブサイト"); 2131 testDecode(range, 0, 'ウ', 1); 2132 testDecode(range, 1, 'ェ', 2); 2133 testDecodeFront(range, 'ウ', 1); 2134 testDecodeFront(range, 'ェ', 1); 2135 assert(decodeFront(range) == 'ブ'); 2136 assert(decodeFront(range) == 'サ'); 2137 } 2138 2139 { 2140 auto range = S("ウェブサイト"); 2141 testDecodeBack(range, 'ト', 1); 2142 testDecodeBack(range, 'イ', 1); 2143 testDecodeBack(range, 'サ', 1); 2144 testDecodeBack(range, 'ブ', 1); 2145 } 2146 } 2147 2148 foreach (S; AliasSeq!((wchar[] s) => s.idup, RandomCU!wchar, (wstring s) => new RefRandomCU!wchar(s))) 2149 { 2150 auto str = S([cast(wchar) 0xD800, cast(wchar) 0xDC00, 2151 cast(wchar) 0x1400, 2152 cast(wchar) 0xDAA7, cast(wchar) 0xDDDE]); 2153 testDecode(str, 0, cast(dchar) 0x10000, 2); 2154 testDecode(str, 2, cast(dchar) 0x1400, 3); 2155 testDecode(str, 3, cast(dchar) 0xB9DDE, 5); 2156 testDecodeBack(str, cast(dchar) 0xB9DDE, 2); 2157 testDecodeBack(str, cast(dchar) 0x1400, 1); 2158 testDecodeBack(str, cast(dchar) 0x10000, 2); 2159 } 2160 }); 2161 } 2162 2163 @system unittest 2164 { 2165 import std.exception; 2166 assertCTFEable!( 2167 { 2168 foreach (S; AliasSeq!((dstring s) => s, RandomCU!dchar, InputCU!dchar, 2169 (dstring s) => new RefBidirCU!dchar(s), 2170 (dstring s) => new RefRandomCU!dchar(s))) 2171 { 2172 testAllDecode(S([cast(dchar) 0x1111]), cast(dchar) 0x1111, 1); 2173 testAllDecode(S([cast(dchar) 0x10000]), cast(dchar) 0x10000, 1); 2174 testAllDecode(S([cast(dchar) 0x10FFFF]), cast(dchar) 0x10FFFF, 1); 2175 testAllDecode(S([cast(dchar) 0xFFFE]), cast(dchar) 0xFFFE, 1); 2176 testAllDecode(S([cast(dchar) 0xFFFF]), cast(dchar) 0xFFFF, 1); 2177 2178 testBadDecode(S([cast(dchar) 0xD800]), 0); 2179 testBadDecode(S([cast(dchar) 0xDFFE]), 0); 2180 testBadDecode(S([cast(dchar) 0x110000]), 0); 2181 2182 testBadDecodeBack(S([cast(dchar) 0xD800])); 2183 testBadDecodeBack(S([cast(dchar) 0xDFFE])); 2184 testBadDecodeBack(S([cast(dchar) 0x110000])); 2185 2186 { 2187 auto range = S("ウェブサイト"); 2188 testDecode(range, 0, 'ウ', 1); 2189 testDecode(range, 1, 'ェ', 2); 2190 testDecodeFront(range, 'ウ', 1); 2191 testDecodeFront(range, 'ェ', 1); 2192 assert(decodeFront(range) == 'ブ'); 2193 assert(decodeFront(range) == 'サ'); 2194 } 2195 2196 { 2197 auto range = S("ウェブサイト"); 2198 testDecodeBack(range, 'ト', 1); 2199 testDecodeBack(range, 'イ', 1); 2200 testDecodeBack(range, 'サ', 1); 2201 testDecodeBack(range, 'ブ', 1); 2202 } 2203 } 2204 2205 foreach (S; AliasSeq!((dchar[] s) => s.idup, RandomCU!dchar, (dstring s) => new RefRandomCU!dchar(s))) 2206 { 2207 auto str = S([cast(dchar) 0x10000, cast(dchar) 0x1400, cast(dchar) 0xB9DDE]); 2208 testDecode(str, 0, 0x10000, 1); 2209 testDecode(str, 1, 0x1400, 2); 2210 testDecode(str, 2, 0xB9DDE, 3); 2211 testDecodeBack(str, cast(dchar) 0xB9DDE, 1); 2212 testDecodeBack(str, cast(dchar) 0x1400, 1); 2213 testDecodeBack(str, cast(dchar) 0x10000, 1); 2214 } 2215 }); 2216 } 2217 2218 @safe unittest 2219 { 2220 import std.exception; 2221 import std.traits : FunctionAttribute, functionAttributes, isSafe; 2222 assertCTFEable!( 2223 { 2224 foreach (S; AliasSeq!( char[], const( char)[], string, 2225 wchar[], const(wchar)[], wstring, 2226 dchar[], const(dchar)[], dstring)) 2227 { 2228 static assert(isSafe!({ S str; size_t i = 0; decode(str, i); })); 2229 static assert(isSafe!({ S str; size_t i = 0; decodeFront(str, i); })); 2230 static assert(isSafe!({ S str; decodeFront(str); })); 2231 static assert((functionAttributes!({ S str; size_t i = 0; decode(str, i); }) & FunctionAttribute.pure_) != 0); 2232 static assert((functionAttributes!({ 2233 S str; size_t i = 0; decodeFront(str, i); 2234 }) & FunctionAttribute.pure_) != 0); 2235 static assert((functionAttributes!({ S str; decodeFront(str); }) & FunctionAttribute.pure_) != 0); 2236 static assert((functionAttributes!({ 2237 S str; size_t i = 0; decodeBack(str, i); 2238 }) & FunctionAttribute.pure_) != 0); 2239 static assert((functionAttributes!({ S str; decodeBack(str); }) & FunctionAttribute.pure_) != 0); 2240 } 2241 }); 2242 } 2243 2244 @safe unittest 2245 { 2246 import std.exception; 2247 char[4] val; 2248 val[0] = 0b1111_0111; 2249 val[1] = 0b1011_1111; 2250 val[2] = 0b1011_1111; 2251 val[3] = 0b1011_1111; 2252 size_t i = 0; 2253 assertThrown!UTFException((){ dchar ch = decode(val[], i); }()); 2254 } 2255 /* =================== Encode ======================= */ 2256 2257 private dchar _utfException(UseReplacementDchar useReplacementDchar)(string msg, dchar c) 2258 { 2259 static if (useReplacementDchar) 2260 return replacementDchar; 2261 else 2262 throw new UTFException(msg).setSequence(c); 2263 } 2264 2265 /++ 2266 Encodes `c` into the static array, `buf`, and returns the actual 2267 length of the encoded character (a number between `1` and `4` for 2268 `char[4]` buffers and a number between `1` and `2` for 2269 `wchar[2]` buffers). 2270 2271 Throws: 2272 `UTFException` if `c` is not a valid UTF code point. 2273 +/ 2274 size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)( 2275 out char[4] buf, dchar c) @safe pure 2276 { 2277 if (c <= 0x7F) 2278 { 2279 assert(isValidDchar(c)); 2280 buf[0] = cast(char) c; 2281 return 1; 2282 } 2283 if (c <= 0x7FF) 2284 { 2285 assert(isValidDchar(c)); 2286 buf[0] = cast(char)(0xC0 | (c >> 6)); 2287 buf[1] = cast(char)(0x80 | (c & 0x3F)); 2288 return 2; 2289 } 2290 if (c <= 0xFFFF) 2291 { 2292 if (0xD800 <= c && c <= 0xDFFF) 2293 c = _utfException!useReplacementDchar("Encoding a surrogate code point in UTF-8", c); 2294 2295 assert(isValidDchar(c)); 2296 L3: 2297 buf[0] = cast(char)(0xE0 | (c >> 12)); 2298 buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F)); 2299 buf[2] = cast(char)(0x80 | (c & 0x3F)); 2300 return 3; 2301 } 2302 if (c <= 0x10FFFF) 2303 { 2304 assert(isValidDchar(c)); 2305 buf[0] = cast(char)(0xF0 | (c >> 18)); 2306 buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F)); 2307 buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F)); 2308 buf[3] = cast(char)(0x80 | (c & 0x3F)); 2309 return 4; 2310 } 2311 2312 assert(!isValidDchar(c)); 2313 c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-8", c); 2314 goto L3; 2315 } 2316 2317 /// 2318 @safe unittest 2319 { 2320 import std.exception : assertThrown; 2321 import std.typecons : Yes; 2322 2323 char[4] buf; 2324 2325 assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000"); 2326 assert(encode(buf, '\u007F') == 1 && buf[0 .. 1] == "\u007F"); 2327 assert(encode(buf, '\u0080') == 2 && buf[0 .. 2] == "\u0080"); 2328 assert(encode(buf, '\uE000') == 3 && buf[0 .. 3] == "\uE000"); 2329 assert(encode(buf, 0xFFFE) == 3 && buf[0 .. 3] == "\xEF\xBF\xBE"); 2330 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000)); 2331 2332 encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000); 2333 auto slice = buf[]; 2334 assert(slice.decodeFront == replacementDchar); 2335 } 2336 2337 /// 2338 @safe unittest 2339 { 2340 import std.exception : assertThrown; 2341 import std.typecons : Yes; 2342 2343 wchar[2] buf; 2344 2345 assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000"); 2346 assert(encode(buf, '\uD7FF') == 1 && buf[0 .. 1] == "\uD7FF"); 2347 assert(encode(buf, '\uE000') == 1 && buf[0 .. 1] == "\uE000"); 2348 assert(encode(buf, '\U00010000') == 2 && buf[0 .. 2] == "\U00010000"); 2349 assert(encode(buf, '\U0010FFFF') == 2 && buf[0 .. 2] == "\U0010FFFF"); 2350 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800)); 2351 2352 encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000); 2353 auto slice = buf[]; 2354 assert(slice.decodeFront == replacementDchar); 2355 } 2356 2357 /// 2358 @safe unittest 2359 { 2360 import std.exception : assertThrown; 2361 import std.typecons : Yes; 2362 2363 dchar[1] buf; 2364 2365 assert(encode(buf, '\u0000') == 1 && buf[0] == '\u0000'); 2366 assert(encode(buf, '\uD7FF') == 1 && buf[0] == '\uD7FF'); 2367 assert(encode(buf, '\uE000') == 1 && buf[0] == '\uE000'); 2368 assert(encode(buf, '\U0010FFFF') == 1 && buf[0] == '\U0010FFFF'); 2369 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800)); 2370 2371 encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000); 2372 assert(buf[0] == replacementDchar); 2373 } 2374 2375 @safe unittest 2376 { 2377 import std.exception; 2378 assertCTFEable!( 2379 { 2380 char[4] buf; 2381 2382 assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000"); 2383 assert(encode(buf, '\u007F') == 1 && buf[0 .. 1] == "\u007F"); 2384 assert(encode(buf, '\u0080') == 2 && buf[0 .. 2] == "\u0080"); 2385 assert(encode(buf, '\u07FF') == 2 && buf[0 .. 2] == "\u07FF"); 2386 assert(encode(buf, '\u0800') == 3 && buf[0 .. 3] == "\u0800"); 2387 assert(encode(buf, '\uD7FF') == 3 && buf[0 .. 3] == "\uD7FF"); 2388 assert(encode(buf, '\uE000') == 3 && buf[0 .. 3] == "\uE000"); 2389 assert(encode(buf, 0xFFFE) == 3 && buf[0 .. 3] == "\xEF\xBF\xBE"); 2390 assert(encode(buf, 0xFFFF) == 3 && buf[0 .. 3] == "\xEF\xBF\xBF"); 2391 assert(encode(buf, '\U00010000') == 4 && buf[0 .. 4] == "\U00010000"); 2392 assert(encode(buf, '\U0010FFFF') == 4 && buf[0 .. 4] == "\U0010FFFF"); 2393 2394 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800)); 2395 assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF)); 2396 assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00)); 2397 assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF)); 2398 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000)); 2399 2400 assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride); 2401 enum replacementDcharString = "\uFFFD"; 2402 assert(buf[0 .. replacementDcharString.length] == replacementDcharString); 2403 }); 2404 } 2405 2406 2407 /// Ditto 2408 size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)( 2409 out wchar[2] buf, dchar c) @safe pure 2410 { 2411 if (c <= 0xFFFF) 2412 { 2413 if (0xD800 <= c && c <= 0xDFFF) 2414 c = _utfException!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c); 2415 2416 assert(isValidDchar(c)); 2417 L1: 2418 buf[0] = cast(wchar) c; 2419 return 1; 2420 } 2421 if (c <= 0x10FFFF) 2422 { 2423 assert(isValidDchar(c)); 2424 buf[0] = cast(wchar)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800); 2425 buf[1] = cast(wchar)(((c - 0x10000) & 0x3FF) + 0xDC00); 2426 return 2; 2427 } 2428 2429 c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-16", c); 2430 goto L1; 2431 } 2432 2433 @safe unittest 2434 { 2435 import std.exception; 2436 assertCTFEable!( 2437 { 2438 wchar[2] buf; 2439 2440 assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000"); 2441 assert(encode(buf, '\uD7FF') == 1 && buf[0 .. 1] == "\uD7FF"); 2442 assert(encode(buf, '\uE000') == 1 && buf[0 .. 1] == "\uE000"); 2443 assert(encode(buf, 0xFFFE) == 1 && buf[0] == 0xFFFE); 2444 assert(encode(buf, 0xFFFF) == 1 && buf[0] == 0xFFFF); 2445 assert(encode(buf, '\U00010000') == 2 && buf[0 .. 2] == "\U00010000"); 2446 assert(encode(buf, '\U0010FFFF') == 2 && buf[0 .. 2] == "\U0010FFFF"); 2447 2448 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800)); 2449 assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF)); 2450 assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00)); 2451 assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF)); 2452 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000)); 2453 2454 assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride); 2455 assert(buf.front == replacementDchar); 2456 }); 2457 } 2458 2459 2460 /// Ditto 2461 size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)( 2462 out dchar[1] buf, dchar c) @safe pure 2463 { 2464 if ((0xD800 <= c && c <= 0xDFFF) || 0x10FFFF < c) 2465 c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c); 2466 else 2467 assert(isValidDchar(c)); 2468 buf[0] = c; 2469 return 1; 2470 } 2471 2472 @safe unittest 2473 { 2474 import std.exception; 2475 assertCTFEable!( 2476 { 2477 dchar[1] buf; 2478 2479 encode(buf, '\u0000'); assert(buf[0] == '\u0000'); 2480 encode(buf, '\uD7FF'); assert(buf[0] == '\uD7FF'); 2481 encode(buf, '\uE000'); assert(buf[0] == '\uE000'); 2482 encode(buf, 0xFFFE ); assert(buf[0] == 0xFFFE); 2483 encode(buf, 0xFFFF ); assert(buf[0] == 0xFFFF); 2484 encode(buf, '\U0010FFFF'); assert(buf[0] == '\U0010FFFF'); 2485 2486 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800)); 2487 assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF)); 2488 assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00)); 2489 assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF)); 2490 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000)); 2491 2492 assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride); 2493 assert(buf.front == replacementDchar); 2494 }); 2495 } 2496 2497 2498 /++ 2499 Encodes `c` in `str`'s encoding and appends it to `str`. 2500 2501 Throws: 2502 `UTFException` if `c` is not a valid UTF code point. 2503 +/ 2504 void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)( 2505 ref char[] str, dchar c) @safe pure 2506 { 2507 char[] r = str; 2508 2509 if (c <= 0x7F) 2510 { 2511 assert(isValidDchar(c)); 2512 r ~= cast(char) c; 2513 } 2514 else 2515 { 2516 char[4] buf; 2517 uint L; 2518 2519 if (c <= 0x7FF) 2520 { 2521 assert(isValidDchar(c)); 2522 buf[0] = cast(char)(0xC0 | (c >> 6)); 2523 buf[1] = cast(char)(0x80 | (c & 0x3F)); 2524 L = 2; 2525 } 2526 else if (c <= 0xFFFF) 2527 { 2528 if (0xD800 <= c && c <= 0xDFFF) 2529 c = _utfException!useReplacementDchar("Encoding a surrogate code point in UTF-8", c); 2530 2531 assert(isValidDchar(c)); 2532 L3: 2533 buf[0] = cast(char)(0xE0 | (c >> 12)); 2534 buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F)); 2535 buf[2] = cast(char)(0x80 | (c & 0x3F)); 2536 L = 3; 2537 } 2538 else if (c <= 0x10FFFF) 2539 { 2540 assert(isValidDchar(c)); 2541 buf[0] = cast(char)(0xF0 | (c >> 18)); 2542 buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F)); 2543 buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F)); 2544 buf[3] = cast(char)(0x80 | (c & 0x3F)); 2545 L = 4; 2546 } 2547 else 2548 { 2549 assert(!isValidDchar(c)); 2550 c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-8", c); 2551 goto L3; 2552 } 2553 r ~= buf[0 .. L]; 2554 } 2555 str = r; 2556 } 2557 2558 /// 2559 @safe unittest 2560 { 2561 char[] s = "abcd".dup; 2562 dchar d1 = 'a'; 2563 dchar d2 = 'ø'; 2564 2565 encode(s, d1); 2566 assert(s.length == 5); 2567 assert(s == "abcda"); 2568 encode(s, d2); 2569 assert(s.length == 7); 2570 assert(s == "abcdaø"); 2571 } 2572 2573 @safe unittest 2574 { 2575 import std.exception; 2576 2577 assertCTFEable!( 2578 { 2579 char[] s = "abcd".dup; 2580 encode(s, cast(dchar)'a'); 2581 assert(s.length == 5); 2582 assert(s == "abcda"); 2583 2584 encode(s, cast(dchar)'\u00A9'); 2585 assert(s.length == 7); 2586 assert(s == "abcda\xC2\xA9"); 2587 //assert(s == "abcda\u00A9"); // BUG: fix compiler 2588 2589 encode(s, cast(dchar)'\u2260'); 2590 assert(s.length == 10); 2591 assert(s == "abcda\xC2\xA9\xE2\x89\xA0"); 2592 }); 2593 } 2594 2595 @safe unittest 2596 { 2597 import std.exception; 2598 assertCTFEable!( 2599 { 2600 char[] buf; 2601 2602 encode(buf, '\u0000'); assert(buf[0 .. $] == "\u0000"); 2603 encode(buf, '\u007F'); assert(buf[1 .. $] == "\u007F"); 2604 encode(buf, '\u0080'); assert(buf[2 .. $] == "\u0080"); 2605 encode(buf, '\u07FF'); assert(buf[4 .. $] == "\u07FF"); 2606 encode(buf, '\u0800'); assert(buf[6 .. $] == "\u0800"); 2607 encode(buf, '\uD7FF'); assert(buf[9 .. $] == "\uD7FF"); 2608 encode(buf, '\uE000'); assert(buf[12 .. $] == "\uE000"); 2609 encode(buf, 0xFFFE); assert(buf[15 .. $] == "\xEF\xBF\xBE"); 2610 encode(buf, 0xFFFF); assert(buf[18 .. $] == "\xEF\xBF\xBF"); 2611 encode(buf, '\U00010000'); assert(buf[21 .. $] == "\U00010000"); 2612 encode(buf, '\U0010FFFF'); assert(buf[25 .. $] == "\U0010FFFF"); 2613 2614 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800)); 2615 assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF)); 2616 assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00)); 2617 assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF)); 2618 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000)); 2619 2620 enum replacementDcharString = "\uFFFD"; 2621 enum rdcslen = replacementDcharString.length; 2622 assert(buf[$ - rdcslen .. $] != replacementDcharString); 2623 encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000); 2624 assert(buf[$ - rdcslen .. $] == replacementDcharString); 2625 }); 2626 } 2627 2628 /// ditto 2629 void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)( 2630 ref wchar[] str, dchar c) @safe pure 2631 { 2632 wchar[] r = str; 2633 2634 if (c <= 0xFFFF) 2635 { 2636 if (0xD800 <= c && c <= 0xDFFF) 2637 c = _utfException!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c); 2638 2639 assert(isValidDchar(c)); 2640 L1: 2641 r ~= cast(wchar) c; 2642 } 2643 else if (c <= 0x10FFFF) 2644 { 2645 wchar[2] buf; 2646 2647 assert(isValidDchar(c)); 2648 buf[0] = cast(wchar)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800); 2649 buf[1] = cast(wchar)(((c - 0x10000) & 0x3FF) + 0xDC00); 2650 r ~= buf; 2651 } 2652 else 2653 { 2654 assert(!isValidDchar(c)); 2655 c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-16", c); 2656 goto L1; 2657 } 2658 2659 str = r; 2660 } 2661 2662 @safe unittest 2663 { 2664 import std.exception; 2665 assertCTFEable!( 2666 { 2667 wchar[] buf; 2668 2669 encode(buf, '\u0000'); assert(buf[0] == '\u0000'); 2670 encode(buf, '\uD7FF'); assert(buf[1] == '\uD7FF'); 2671 encode(buf, '\uE000'); assert(buf[2] == '\uE000'); 2672 encode(buf, 0xFFFE); assert(buf[3] == 0xFFFE); 2673 encode(buf, 0xFFFF); assert(buf[4] == 0xFFFF); 2674 encode(buf, '\U00010000'); assert(buf[5 .. $] == "\U00010000"); 2675 encode(buf, '\U0010FFFF'); assert(buf[7 .. $] == "\U0010FFFF"); 2676 2677 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800)); 2678 assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF)); 2679 assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00)); 2680 assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF)); 2681 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000)); 2682 2683 assert(buf.back != replacementDchar); 2684 encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000); 2685 assert(buf.back == replacementDchar); 2686 }); 2687 } 2688 2689 /// ditto 2690 void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)( 2691 ref dchar[] str, dchar c) @safe pure 2692 { 2693 if ((0xD800 <= c && c <= 0xDFFF) || 0x10FFFF < c) 2694 c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c); 2695 else 2696 assert(isValidDchar(c)); 2697 str ~= c; 2698 } 2699 2700 @safe unittest 2701 { 2702 import std.exception; 2703 assertCTFEable!( 2704 { 2705 dchar[] buf; 2706 2707 encode(buf, '\u0000'); assert(buf[0] == '\u0000'); 2708 encode(buf, '\uD7FF'); assert(buf[1] == '\uD7FF'); 2709 encode(buf, '\uE000'); assert(buf[2] == '\uE000'); 2710 encode(buf, 0xFFFE ); assert(buf[3] == 0xFFFE); 2711 encode(buf, 0xFFFF ); assert(buf[4] == 0xFFFF); 2712 encode(buf, '\U0010FFFF'); assert(buf[5] == '\U0010FFFF'); 2713 2714 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800)); 2715 assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF)); 2716 assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00)); 2717 assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF)); 2718 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000)); 2719 2720 assert(buf.back != replacementDchar); 2721 encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000); 2722 assert(buf.back == replacementDchar); 2723 }); 2724 } 2725 2726 2727 /++ 2728 Returns the number of code units that are required to encode the code point 2729 `c` when `C` is the character type used to encode it. 2730 +/ 2731 ubyte codeLength(C)(dchar c) @safe pure nothrow @nogc 2732 if (isSomeChar!C) 2733 { 2734 static if (C.sizeof == 1) 2735 { 2736 if (c <= 0x7F) return 1; 2737 if (c <= 0x7FF) return 2; 2738 if (c <= 0xFFFF) return 3; 2739 if (c <= 0x10FFFF) return 4; 2740 assert(false); 2741 } 2742 else static if (C.sizeof == 2) 2743 { 2744 return c <= 0xFFFF ? 1 : 2; 2745 } 2746 else 2747 { 2748 static assert(C.sizeof == 4); 2749 return 1; 2750 } 2751 } 2752 2753 /// 2754 @safe pure nothrow @nogc unittest 2755 { 2756 assert(codeLength!char('a') == 1); 2757 assert(codeLength!wchar('a') == 1); 2758 assert(codeLength!dchar('a') == 1); 2759 2760 assert(codeLength!char('\U0010FFFF') == 4); 2761 assert(codeLength!wchar('\U0010FFFF') == 2); 2762 assert(codeLength!dchar('\U0010FFFF') == 1); 2763 } 2764 2765 2766 /++ 2767 Returns the number of code units that are required to encode `str` 2768 in a string whose character type is `C`. This is particularly useful 2769 when slicing one string with the length of another and the two string 2770 types use different character types. 2771 2772 Params: 2773 C = the character type to get the encoding length for 2774 input = the $(REF_ALTTEXT input range, isInputRange, std,range,primitives) 2775 to calculate the encoding length from 2776 Returns: 2777 The number of code units in `input` when encoded to `C` 2778 +/ 2779 size_t codeLength(C, InputRange)(InputRange input) 2780 if (isInputRange!InputRange && !isInfinite!InputRange && isSomeChar!(ElementType!InputRange)) 2781 { 2782 alias EncType = Unqual!(ElementEncodingType!InputRange); 2783 static if (isSomeString!InputRange && is(EncType == C) && is(typeof(input.length))) 2784 return input.length; 2785 else 2786 { 2787 size_t total = 0; 2788 2789 foreach (c; input.byDchar) 2790 total += codeLength!C(c); 2791 2792 return total; 2793 } 2794 } 2795 2796 /// 2797 @safe unittest 2798 { 2799 assert(codeLength!char("hello world") == 2800 "hello world".length); 2801 assert(codeLength!wchar("hello world") == 2802 "hello world"w.length); 2803 assert(codeLength!dchar("hello world") == 2804 "hello world"d.length); 2805 2806 assert(codeLength!char(`プログラミング`) == 2807 `プログラミング`.length); 2808 assert(codeLength!wchar(`プログラミング`) == 2809 `プログラミング`w.length); 2810 assert(codeLength!dchar(`プログラミング`) == 2811 `プログラミング`d.length); 2812 2813 string haystack = `Être sans la verité, ça, ce ne serait pas bien.`; 2814 wstring needle = `Être sans la verité`; 2815 assert(haystack[codeLength!char(needle) .. $] == 2816 `, ça, ce ne serait pas bien.`); 2817 } 2818 2819 @safe unittest 2820 { 2821 import std.algorithm.iteration : filter; 2822 import std.conv : to; 2823 import std.exception; 2824 2825 assertCTFEable!( 2826 { 2827 foreach (S; AliasSeq!( char[], const char[], string, 2828 wchar[], const wchar[], wstring, 2829 dchar[], const dchar[], dstring)) 2830 { 2831 foreach (C; AliasSeq!(char, wchar, dchar)) 2832 { 2833 assert(codeLength!C(to!S("Walter Bright")) == to!(C[])("Walter Bright").length); 2834 assert(codeLength!C(to!S(`言語`)) == to!(C[])(`言語`).length); 2835 assert(codeLength!C(to!S(`ウェブサイト@La_Verité.com`)) == 2836 to!(C[])(`ウェブサイト@La_Verité.com`).length); 2837 assert(codeLength!C(to!S(`ウェブサイト@La_Verité.com`).filter!(x => true)()) == 2838 to!(C[])(`ウェブサイト@La_Verité.com`).length); 2839 } 2840 } 2841 }); 2842 } 2843 2844 /+ 2845 Internal helper function: 2846 2847 Returns true if it is safe to search for the Codepoint `c` inside 2848 code units, without decoding. 2849 2850 This is a runtime check that is used an optimization in various functions, 2851 particularly, in `std.string`. 2852 +/ 2853 package bool canSearchInCodeUnits(C)(dchar c) 2854 if (isSomeChar!C) 2855 { 2856 static if (C.sizeof == 1) 2857 return c <= 0x7F; 2858 else static if (C.sizeof == 2) 2859 return c <= 0xD7FF || (0xE000 <= c && c <= 0xFFFF); 2860 else static if (C.sizeof == 4) 2861 return true; 2862 else 2863 static assert(0); 2864 } 2865 @safe unittest 2866 { 2867 assert( canSearchInCodeUnits! char('a')); 2868 assert( canSearchInCodeUnits!wchar('a')); 2869 assert( canSearchInCodeUnits!dchar('a')); 2870 assert(!canSearchInCodeUnits! char('ö')); //Important test: ö <= 0xFF 2871 assert(!canSearchInCodeUnits! char(cast(char)'ö')); //Important test: ö <= 0xFF 2872 assert( canSearchInCodeUnits!wchar('ö')); 2873 assert( canSearchInCodeUnits!dchar('ö')); 2874 assert(!canSearchInCodeUnits! char('日')); 2875 assert( canSearchInCodeUnits!wchar('日')); 2876 assert( canSearchInCodeUnits!dchar('日')); 2877 assert(!canSearchInCodeUnits!wchar(cast(wchar) 0xDA00)); 2878 assert( canSearchInCodeUnits!dchar(cast(dchar) 0xDA00)); 2879 assert(!canSearchInCodeUnits! char('\U00010001')); 2880 assert(!canSearchInCodeUnits!wchar('\U00010001')); 2881 assert( canSearchInCodeUnits!dchar('\U00010001')); 2882 } 2883 2884 /* =================== Validation ======================= */ 2885 2886 /++ 2887 Checks to see if `str` is well-formed unicode or not. 2888 2889 Throws: 2890 `UTFException` if `str` is not well-formed. 2891 +/ 2892 void validate(S)(in S str) @safe pure 2893 if (isSomeString!S) 2894 { 2895 immutable len = str.length; 2896 for (size_t i = 0; i < len; ) 2897 { 2898 decode(str, i); 2899 } 2900 } 2901 2902 /// 2903 @safe unittest 2904 { 2905 import std.exception : assertThrown; 2906 char[] a = [167, 133, 175]; 2907 assertThrown!UTFException(validate(a)); 2908 } 2909 2910 // https://issues.dlang.org/show_bug.cgi?id=12923 2911 @safe unittest 2912 { 2913 import std.exception; 2914 assertThrown((){ 2915 char[3]a=[167, 133, 175]; 2916 validate(a[]); 2917 }()); 2918 } 2919 2920 /** 2921 * Encodes the elements of `s` to UTF-8 and returns a newly allocated 2922 * string of the elements. 2923 * 2924 * Params: 2925 * s = the string to encode 2926 * Returns: 2927 * A UTF-8 string 2928 * See_Also: 2929 * For a lazy, non-allocating version of these functions, see $(LREF byUTF). 2930 */ 2931 string toUTF8(S)(S s) 2932 if (isInputRange!S && !isInfinite!S && isSomeChar!(ElementEncodingType!S)) 2933 { 2934 return toUTFImpl!string(s); 2935 } 2936 2937 /// 2938 @safe pure unittest 2939 { 2940 import std.algorithm.comparison : equal; 2941 2942 // The ö is represented by two UTF-8 code units 2943 assert("Hellø"w.toUTF8.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8])); 2944 2945 // 𐐷 is four code units in UTF-8 2946 assert("𐐷"d.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7])); 2947 } 2948 2949 @system pure unittest 2950 { 2951 import std.algorithm.comparison : equal; 2952 import std.internal.test.dummyrange : ReferenceInputRange; 2953 2954 alias RT = ReferenceInputRange!(ElementType!(string)); 2955 auto r1 = new RT("Hellø"); 2956 auto r2 = new RT("𐐷"); 2957 2958 assert(r1.toUTF8.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8])); 2959 assert(r2.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7])); 2960 } 2961 2962 /** 2963 * Encodes the elements of `s` to UTF-16 and returns a newly GC allocated 2964 * `wstring` of the elements. 2965 * 2966 * Params: 2967 * s = the range to encode 2968 * Returns: 2969 * A UTF-16 string 2970 * See_Also: 2971 * For a lazy, non-allocating version of these functions, see $(LREF byUTF). 2972 */ 2973 wstring toUTF16(S)(S s) 2974 if (isInputRange!S && !isInfinite!S && isSomeChar!(ElementEncodingType!S)) 2975 { 2976 return toUTFImpl!wstring(s); 2977 } 2978 2979 /// 2980 @safe pure unittest 2981 { 2982 import std.algorithm.comparison : equal; 2983 2984 // these graphemes are two code units in UTF-16 and one in UTF-32 2985 assert("𤭢"d.length == 1); 2986 assert("𐐷"d.length == 1); 2987 2988 assert("𤭢"d.toUTF16.equal([0xD852, 0xDF62])); 2989 assert("𐐷"d.toUTF16.equal([0xD801, 0xDC37])); 2990 } 2991 2992 @system pure unittest 2993 { 2994 import std.algorithm.comparison : equal; 2995 import std.internal.test.dummyrange : ReferenceInputRange; 2996 2997 alias RT = ReferenceInputRange!(ElementType!(string)); 2998 auto r1 = new RT("𤭢"); 2999 auto r2 = new RT("𐐷"); 3000 3001 assert(r1.toUTF16.equal([0xD852, 0xDF62])); 3002 assert(r2.toUTF16.equal([0xD801, 0xDC37])); 3003 } 3004 3005 3006 /** 3007 * Encodes the elements of `s` to UTF-32 and returns a newly GC allocated 3008 * `dstring` of the elements. 3009 * 3010 * Params: 3011 * s = the range to encode 3012 * Returns: 3013 * A UTF-32 string 3014 * See_Also: 3015 * For a lazy, non-allocating version of these functions, see $(LREF byUTF). 3016 */ 3017 dstring toUTF32(S)(S s) 3018 if (isInputRange!S && !isInfinite!S && isSomeChar!(ElementEncodingType!S)) 3019 { 3020 return toUTFImpl!dstring(s); 3021 } 3022 3023 /// 3024 @safe pure unittest 3025 { 3026 import std.algorithm.comparison : equal; 3027 3028 // these graphemes are two code units in UTF-16 and one in UTF-32 3029 assert("𤭢"w.length == 2); 3030 assert("𐐷"w.length == 2); 3031 3032 assert("𤭢"w.toUTF32.equal([0x00024B62])); 3033 assert("𐐷"w.toUTF32.equal([0x00010437])); 3034 } 3035 3036 private T toUTFImpl(T, S)(S s) 3037 { 3038 static if (is(S : T)) 3039 { 3040 return s.idup; 3041 } 3042 else 3043 { 3044 import std.array : appender; 3045 auto app = appender!T(); 3046 3047 static if (hasLength!S || isSomeString!S) 3048 app.reserve(s.length); 3049 3050 foreach (c; s.byUTF!(Unqual!(ElementEncodingType!T))) 3051 app.put(c); 3052 3053 return app.data; 3054 } 3055 } 3056 3057 /* =================== toUTFz ======================= */ 3058 3059 /++ 3060 Returns a C-style zero-terminated string equivalent to `str`. `str` 3061 must not contain embedded `'\0'`'s as any C function will treat the first 3062 `'\0'` that it sees as the end of the string. If `str.empty` is 3063 `true`, then a string containing only `'\0'` is returned. 3064 3065 `toUTFz` accepts any type of string and is templated on the type of 3066 character pointer that you wish to convert to. It will avoid allocating a 3067 new string if it can, but there's a decent chance that it will end up having 3068 to allocate a new string - particularly when dealing with character types 3069 other than `char`. 3070 3071 $(RED Warning 1:) If the result of `toUTFz` equals `str.ptr`, then if 3072 anything alters the character one past the end of `str` (which is the 3073 `'\0'` character terminating the string), then the string won't be 3074 zero-terminated anymore. The most likely scenarios for that are if you 3075 append to `str` and no reallocation takes place or when `str` is a 3076 slice of a larger array, and you alter the character in the larger array 3077 which is one character past the end of `str`. Another case where it could 3078 occur would be if you had a mutable character array immediately after 3079 `str` in memory (for example, if they're member variables in a 3080 user-defined type with one declared right after the other) and that 3081 character array happened to start with `'\0'`. Such scenarios will never 3082 occur if you immediately use the zero-terminated string after calling 3083 `toUTFz` and the C function using it doesn't keep a reference to it. 3084 Also, they are unlikely to occur even if you save the zero-terminated string 3085 (the cases above would be among the few examples of where it could happen). 3086 However, if you save the zero-terminate string and want to be absolutely 3087 certain that the string stays zero-terminated, then simply append a 3088 `'\0'` to the string and use its `ptr` property rather than calling 3089 `toUTFz`. 3090 3091 $(RED Warning 2:) When passing a character pointer to a C function, and the 3092 C function keeps it around for any reason, make sure that you keep a 3093 reference to it in your D code. Otherwise, it may go away during a garbage 3094 collection cycle and cause a nasty bug when the C code tries to use it. 3095 +/ 3096 template toUTFz(P) 3097 { 3098 P toUTFz(S)(S str) @safe pure 3099 { 3100 return toUTFzImpl!(P, S)(str); 3101 } 3102 } 3103 3104 /// 3105 @safe pure unittest 3106 { 3107 auto p1 = toUTFz!(char*)("hello world"); 3108 auto p2 = toUTFz!(const(char)*)("hello world"); 3109 auto p3 = toUTFz!(immutable(char)*)("hello world"); 3110 auto p4 = toUTFz!(char*)("hello world"d); 3111 auto p5 = toUTFz!(const(wchar)*)("hello world"); 3112 auto p6 = toUTFz!(immutable(dchar)*)("hello world"w); 3113 } 3114 3115 private P toUTFzImpl(P, S)(S str) @safe pure 3116 if (isSomeString!S && isPointer!P && isSomeChar!(typeof(*P.init)) && 3117 is(immutable typeof(*P.init) == immutable ElementEncodingType!S) && 3118 is(immutable ElementEncodingType!S == ElementEncodingType!S)) 3119 //immutable(C)[] -> C*, const(C)*, or immutable(C)* 3120 { 3121 if (str.empty) 3122 { 3123 typeof(*P.init)[] retval = ['\0']; 3124 3125 auto trustedPtr() @trusted { return retval.ptr; } 3126 return trustedPtr(); 3127 } 3128 3129 alias C = Unqual!(ElementEncodingType!S); 3130 3131 //If the P is mutable, then we have to make a copy. 3132 static if (is(Unqual!(typeof(*P.init)) == typeof(*P.init))) 3133 { 3134 return toUTFzImpl!(P, const(C)[])(cast(const(C)[])str); 3135 } 3136 else 3137 { 3138 if (!__ctfe) 3139 { 3140 auto trustedPtrAdd(S s) @trusted { return s.ptr + s.length; } 3141 immutable p = trustedPtrAdd(str); 3142 3143 // Peek past end of str, if it's 0, no conversion necessary. 3144 // Note that the compiler will put a 0 past the end of static 3145 // strings, and the storage allocator will put a 0 past the end 3146 // of newly allocated char[]'s. 3147 // Is p dereferenceable? A simple test: if the p points to an 3148 // address multiple of 4, then conservatively assume the pointer 3149 // might be pointing to a new block of memory, which might be 3150 // unreadable. Otherwise, it's definitely pointing to valid 3151 // memory. 3152 if ((cast(size_t) p & 3) && *p == '\0') 3153 return &str[0]; 3154 } 3155 3156 return toUTFzImpl!(P, const(C)[])(cast(const(C)[])str); 3157 } 3158 } 3159 3160 private P toUTFzImpl(P, S)(S str) @safe pure 3161 if (isSomeString!S && isPointer!P && isSomeChar!(typeof(*P.init)) && 3162 is(immutable typeof(*P.init) == immutable ElementEncodingType!S) && 3163 !is(immutable ElementEncodingType!S == ElementEncodingType!S)) 3164 //C[] or const(C)[] -> C*, const(C)*, or immutable(C)* 3165 { 3166 alias InChar = ElementEncodingType!S; 3167 alias OutChar = typeof(*P.init); 3168 3169 //const(C)[] -> const(C)* or 3170 //C[] -> C* or const(C)* 3171 static if (( is(const(Unqual!InChar) == InChar) && is(const(Unqual!OutChar) == OutChar)) || 3172 (!is(const(Unqual!InChar) == InChar) && !is(immutable(Unqual!OutChar) == OutChar))) 3173 { 3174 if (!__ctfe) 3175 { 3176 auto trustedPtrAdd(S s) @trusted { return s.ptr + s.length; } 3177 auto p = trustedPtrAdd(str); 3178 3179 if ((cast(size_t) p & 3) && *p == '\0') 3180 return &str[0]; 3181 } 3182 3183 str ~= '\0'; 3184 return &str[0]; 3185 } 3186 //const(C)[] -> C* or immutable(C)* or 3187 //C[] -> immutable(C)* 3188 else 3189 { 3190 import std.array : uninitializedArray; 3191 auto copy = uninitializedArray!(Unqual!OutChar[])(str.length + 1); 3192 copy[0 .. $ - 1] = str[]; 3193 copy[$ - 1] = '\0'; 3194 3195 auto trustedCast(typeof(copy) c) @trusted { return cast(P) c.ptr; } 3196 return trustedCast(copy); 3197 } 3198 } 3199 3200 private P toUTFzImpl(P, S)(S str) @safe pure 3201 if (isSomeString!S && isPointer!P && isSomeChar!(typeof(*P.init)) && 3202 !is(immutable typeof(*P.init) == immutable ElementEncodingType!S)) 3203 //C1[], const(C1)[], or immutable(C1)[] -> C2*, const(C2)*, or immutable(C2)* 3204 { 3205 import std.array : appender; 3206 auto retval = appender!(typeof(*P.init)[])(); 3207 3208 foreach (dchar c; str) 3209 retval.put(c); 3210 retval.put('\0'); 3211 3212 return () @trusted { return cast(P) retval.data.ptr; } (); 3213 } 3214 3215 @safe pure unittest 3216 { 3217 import core.exception : AssertError; 3218 import std.algorithm; 3219 import std.conv : to; 3220 import std.exception; 3221 import std..string : format; 3222 3223 assertCTFEable!( 3224 { 3225 foreach (S; AliasSeq!(string, wstring, dstring)) 3226 { 3227 alias C = Unqual!(ElementEncodingType!S); 3228 3229 auto s1 = to!S("hello\U00010143\u0100\U00010143"); 3230 auto temp = new C[](s1.length + 1); 3231 temp[0 .. $ - 1] = s1[0 .. $]; 3232 temp[$ - 1] = '\n'; 3233 --temp.length; 3234 auto trustedAssumeUnique(T)(T t) @trusted { return assumeUnique(t); } 3235 auto s2 = trustedAssumeUnique(temp); 3236 assert(s1 == s2); 3237 3238 void trustedCStringAssert(P, S)(S s) @trusted 3239 { 3240 auto p = toUTFz!P(s); 3241 assert(p[0 .. s.length] == s); 3242 assert(p[s.length] == '\0'); 3243 } 3244 3245 foreach (P; AliasSeq!(C*, const(C)*, immutable(C)*)) 3246 { 3247 trustedCStringAssert!P(s1); 3248 trustedCStringAssert!P(s2); 3249 } 3250 } 3251 }); 3252 3253 static void test(P, S)(S s, size_t line = __LINE__) @trusted 3254 { 3255 static size_t zeroLen(C)(const(C)* ptr) @trusted 3256 { 3257 size_t len = 0; 3258 while (*ptr != '\0') { ++ptr; ++len; } 3259 return len; 3260 } 3261 3262 auto p = toUTFz!P(s); 3263 immutable len = zeroLen(p); 3264 enforce(cmp(s, p[0 .. len]) == 0, 3265 new AssertError(format("Unit test failed: %s %s", P.stringof, S.stringof), 3266 __FILE__, line)); 3267 } 3268 3269 assertCTFEable!( 3270 { 3271 foreach (P; AliasSeq!(wchar*, const(wchar)*, immutable(wchar)*, 3272 dchar*, const(dchar)*, immutable(dchar)*)) 3273 { 3274 test!P("hello\U00010143\u0100\U00010143"); 3275 } 3276 foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*, 3277 dchar*, const(dchar)*, immutable(dchar)*)) 3278 { 3279 test!P("hello\U00010143\u0100\U00010143"w); 3280 } 3281 foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*, 3282 wchar*, const(wchar)*, immutable(wchar)*)) 3283 { 3284 test!P("hello\U00010143\u0100\U00010143"d); 3285 } 3286 foreach (S; AliasSeq!( char[], const( char)[], 3287 wchar[], const(wchar)[], 3288 dchar[], const(dchar)[])) 3289 { 3290 auto s = to!S("hello\U00010143\u0100\U00010143"); 3291 3292 foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*, 3293 wchar*, const(wchar)*, immutable(wchar)*, 3294 dchar*, const(dchar)*, immutable(dchar)*)) 3295 { 3296 test!P(s); 3297 } 3298 } 3299 }); 3300 } 3301 3302 3303 /++ 3304 `toUTF16z` is a convenience function for `toUTFz!(const(wchar)*)`. 3305 3306 Encodes string `s` into UTF-16 and returns the encoded string. 3307 `toUTF16z` is suitable for calling the 'W' functions in the Win32 API 3308 that take an `LPCWSTR` argument. 3309 +/ 3310 const(wchar)* toUTF16z(C)(const(C)[] str) @safe pure 3311 if (isSomeChar!C) 3312 { 3313 return toUTFz!(const(wchar)*)(str); 3314 } 3315 3316 /// 3317 @system unittest 3318 { 3319 string str = "Hello, World!"; 3320 const(wchar)* p = str.toUTF16z; 3321 assert(p[str.length] == '\0'); 3322 } 3323 3324 @safe pure unittest 3325 { 3326 import std.conv : to; 3327 //toUTFz is already thoroughly tested, so this will just verify that 3328 //toUTF16z compiles properly for the various string types. 3329 foreach (S; AliasSeq!(string, wstring, dstring)) 3330 assert(toUTF16z(to!S("hello world")) !is null); 3331 } 3332 3333 3334 /* ================================ tests ================================== */ 3335 3336 @safe pure unittest 3337 { 3338 import std.exception; 3339 3340 assertCTFEable!( 3341 { 3342 assert(toUTF16("hello"c) == "hello"); 3343 assert(toUTF32("hello"c) == "hello"); 3344 assert(toUTF8 ("hello"w) == "hello"); 3345 assert(toUTF32("hello"w) == "hello"); 3346 assert(toUTF8 ("hello"d) == "hello"); 3347 assert(toUTF16("hello"d) == "hello"); 3348 3349 assert(toUTF16("hel\u1234o"c) == "hel\u1234o"); 3350 assert(toUTF32("hel\u1234o"c) == "hel\u1234o"); 3351 assert(toUTF8 ("hel\u1234o"w) == "hel\u1234o"); 3352 assert(toUTF32("hel\u1234o"w) == "hel\u1234o"); 3353 assert(toUTF8 ("hel\u1234o"d) == "hel\u1234o"); 3354 assert(toUTF16("hel\u1234o"d) == "hel\u1234o"); 3355 3356 assert(toUTF16("he\U0010AAAAllo"c) == "he\U0010AAAAllo"); 3357 assert(toUTF32("he\U0010AAAAllo"c) == "he\U0010AAAAllo"); 3358 assert(toUTF8 ("he\U0010AAAAllo"w) == "he\U0010AAAAllo"); 3359 assert(toUTF32("he\U0010AAAAllo"w) == "he\U0010AAAAllo"); 3360 assert(toUTF8 ("he\U0010AAAAllo"d) == "he\U0010AAAAllo"); 3361 assert(toUTF16("he\U0010AAAAllo"d) == "he\U0010AAAAllo"); 3362 }); 3363 } 3364 3365 3366 /++ 3367 Returns the total number of code points encoded in `str`. 3368 3369 Supercedes: This function supercedes $(LREF toUCSindex). 3370 3371 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 3372 3373 Throws: 3374 `UTFException` if `str` is not well-formed. 3375 +/ 3376 size_t count(C)(const(C)[] str) @safe pure nothrow @nogc 3377 if (isSomeChar!C) 3378 { 3379 return walkLength(str.byDchar); 3380 } 3381 3382 /// 3383 @safe pure nothrow @nogc unittest 3384 { 3385 assert(count("") == 0); 3386 assert(count("a") == 1); 3387 assert(count("abc") == 3); 3388 assert(count("\u20AC100") == 4); 3389 } 3390 3391 @safe pure nothrow @nogc unittest 3392 { 3393 import std.exception; 3394 assertCTFEable!( 3395 { 3396 assert(count("") == 0); 3397 assert(count("a") == 1); 3398 assert(count("abc") == 3); 3399 assert(count("\u20AC100") == 4); 3400 }); 3401 } 3402 3403 3404 // Ranges of code units for testing. 3405 version (StdUnittest) 3406 { 3407 private: 3408 struct InputCU(C) 3409 { 3410 import std.conv : to; 3411 @property bool empty() { return _str.empty; } 3412 @property C front() { return _str[0]; } 3413 void popFront() { _str = _str[1 .. $]; } 3414 3415 this(inout(C)[] str) 3416 { 3417 _str = to!(C[])(str); 3418 } 3419 3420 C[] _str; 3421 } 3422 3423 struct BidirCU(C) 3424 { 3425 import std.conv : to; 3426 @property bool empty() { return _str.empty; } 3427 @property C front() { return _str[0]; } 3428 void popFront() { _str = _str[1 .. $]; } 3429 @property C back() { return _str[$ - 1]; } 3430 void popBack() { _str = _str[0 .. $ - 1]; } 3431 @property auto save() { return BidirCU(_str); } 3432 @property size_t length() { return _str.length; } 3433 3434 this(inout(C)[] str) 3435 { 3436 _str = to!(C[])(str); 3437 } 3438 3439 C[] _str; 3440 } 3441 3442 struct RandomCU(C) 3443 { 3444 import std.conv : to; 3445 @property bool empty() { return _str.empty; } 3446 @property C front() { return _str[0]; } 3447 void popFront() { _str = _str[1 .. $]; } 3448 @property C back() { return _str[$ - 1]; } 3449 void popBack() { _str = _str[0 .. $ - 1]; } 3450 @property auto save() { return RandomCU(_str); } 3451 @property size_t length() { return _str.length; } 3452 C opIndex(size_t i) { return _str[i]; } 3453 auto opSlice(size_t i, size_t j) { return RandomCU(_str[i .. j]); } 3454 3455 this(inout(C)[] str) 3456 { 3457 _str = to!(C[])(str); 3458 } 3459 3460 C[] _str; 3461 } 3462 3463 class RefBidirCU(C) 3464 { 3465 import std.conv : to; 3466 @property bool empty() { return _str.empty; } 3467 @property C front() { return _str[0]; } 3468 void popFront() { _str = _str[1 .. $]; } 3469 @property C back() { return _str[$ - 1]; } 3470 void popBack() { _str = _str[0 .. $ - 1]; } 3471 @property auto save() { return new RefBidirCU(_str); } 3472 @property size_t length() { return _str.length; } 3473 3474 this(inout(C)[] str) 3475 { 3476 _str = to!(C[])(str); 3477 } 3478 3479 C[] _str; 3480 } 3481 3482 class RefRandomCU(C) 3483 { 3484 import std.conv : to; 3485 @property bool empty() { return _str.empty; } 3486 @property C front() { return _str[0]; } 3487 void popFront() { _str = _str[1 .. $]; } 3488 @property C back() { return _str[$ - 1]; } 3489 void popBack() { _str = _str[0 .. $ - 1]; } 3490 @property auto save() { return new RefRandomCU(_str); } 3491 @property size_t length() { return _str.length; } 3492 C opIndex(size_t i) { return _str[i]; } 3493 auto opSlice(size_t i, size_t j) { return new RefRandomCU(_str[i .. j]); } 3494 3495 this(inout(C)[] str) 3496 { 3497 _str = to!(C[])(str); 3498 } 3499 3500 C[] _str; 3501 } 3502 } 3503 3504 3505 /** 3506 * Inserted in place of invalid UTF sequences. 3507 * 3508 * References: 3509 * $(LINK http://en.wikipedia.org/wiki/Replacement_character#Replacement_character) 3510 */ 3511 enum dchar replacementDchar = '\uFFFD'; 3512 3513 /******************************************** 3514 * Iterate a range of char, wchar, or dchars by code unit. 3515 * 3516 * The purpose is to bypass the special case decoding that 3517 * $(REF front, std,range,primitives) does to character arrays. As a result, 3518 * using ranges with `byCodeUnit` can be `nothrow` while 3519 * $(REF front, std,range,primitives) throws when it encounters invalid Unicode 3520 * sequences. 3521 * 3522 * A code unit is a building block of the UTF encodings. Generally, an 3523 * individual code unit does not represent what's perceived as a full 3524 * character (a.k.a. a grapheme cluster in Unicode terminology). Many characters 3525 * are encoded with multiple code units. For example, the UTF-8 code units for 3526 * `ø` are `0xC3 0xB8`. That means, an individual element of `byCodeUnit` 3527 * often does not form a character on its own. Attempting to treat it as 3528 * one while iterating over the resulting range will give nonsensical results. 3529 * 3530 * Params: 3531 * r = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) 3532 * of characters (including strings) or a type that implicitly converts to a string type. 3533 * Returns: 3534 * If `r` is not an auto-decodable string (i.e. a narrow string or a 3535 * user-defined type that implicits converts to a string type), then `r` 3536 * is returned. 3537 * 3538 * Otherwise, `r` is converted to its corresponding string type (if it's 3539 * not already a string) and wrapped in a random-access range where the 3540 * element encoding type of the string (its code unit) is the element type 3541 * of the range, and that range returned. The range has slicing. 3542 * 3543 * If `r` is quirky enough to be a struct or class which is an input range 3544 * of characters on its own (i.e. it has the input range API as member 3545 * functions), $(I and) it's implicitly convertible to a string type, then 3546 * `r` is returned, and no implicit conversion takes place. 3547 * 3548 * If `r` is wrapped in a new range, then that range has a `source` 3549 * property for returning the string that's currently contained within that 3550 * range. 3551 * 3552 * See_Also: 3553 * Refer to the $(MREF std, uni) docs for a reference on Unicode 3554 * terminology. 3555 * 3556 * For a range that iterates by grapheme cluster (written character) see 3557 * $(REF byGrapheme, std,uni). 3558 */ 3559 auto byCodeUnit(R)(R r) 3560 if ((isConvertibleToString!R && !isStaticArray!R) || 3561 (isInputRange!R && isSomeChar!(ElementEncodingType!R))) 3562 { 3563 import std.traits : StringTypeOf; 3564 static if (// This would be cleaner if we had a way to check whether a type 3565 // was a range without any implicit conversions. 3566 (isAutodecodableString!R && !__traits(hasMember, R, "empty") && 3567 !__traits(hasMember, R, "front") && !__traits(hasMember, R, "popFront"))) 3568 { 3569 static struct ByCodeUnitImpl 3570 { 3571 @safe pure nothrow @nogc: 3572 3573 @property bool empty() const { return source.length == 0; } 3574 @property auto ref front() inout { return source[0]; } 3575 void popFront() { source = source[1 .. $]; } 3576 3577 @property auto save() { return ByCodeUnitImpl(source.save); } 3578 3579 @property auto ref back() inout { return source[$ - 1]; } 3580 void popBack() { source = source[0 .. $-1]; } 3581 3582 auto ref opIndex(size_t index) inout { return source[index]; } 3583 auto opSlice(size_t lower, size_t upper) { return ByCodeUnitImpl(source[lower .. upper]); } 3584 3585 @property size_t length() const { return source.length; } 3586 alias opDollar = length; 3587 3588 StringTypeOf!R source; 3589 } 3590 3591 static assert(isRandomAccessRange!ByCodeUnitImpl); 3592 3593 return ByCodeUnitImpl(r); 3594 } 3595 else static if (!isInputRange!R || 3596 (is(R : const dchar[]) && !__traits(hasMember, R, "empty") && 3597 !__traits(hasMember, R, "front") && !__traits(hasMember, R, "popFront"))) 3598 { 3599 return cast(StringTypeOf!R) r; 3600 } 3601 else 3602 { 3603 // byCodeUnit for ranges and dchar[] is a no-op 3604 return r; 3605 } 3606 } 3607 3608 /// 3609 @safe unittest 3610 { 3611 import std.range.primitives; 3612 import std.traits : isAutodecodableString; 3613 3614 auto r = "Hello, World!".byCodeUnit(); 3615 static assert(hasLength!(typeof(r))); 3616 static assert(hasSlicing!(typeof(r))); 3617 static assert(isRandomAccessRange!(typeof(r))); 3618 static assert(is(ElementType!(typeof(r)) == immutable char)); 3619 3620 // contrast with the range capabilities of standard strings (with or 3621 // without autodecoding enabled). 3622 auto s = "Hello, World!"; 3623 static assert(isBidirectionalRange!(typeof(r))); 3624 static if (isAutodecodableString!(typeof(s))) 3625 { 3626 // with autodecoding enabled, strings are non-random-access ranges of 3627 // dchar. 3628 static assert(is(ElementType!(typeof(s)) == dchar)); 3629 static assert(!isRandomAccessRange!(typeof(s))); 3630 static assert(!hasSlicing!(typeof(s))); 3631 static assert(!hasLength!(typeof(s))); 3632 } 3633 else 3634 { 3635 // without autodecoding, strings are normal arrays. 3636 static assert(is(ElementType!(typeof(s)) == immutable char)); 3637 static assert(isRandomAccessRange!(typeof(s))); 3638 static assert(hasSlicing!(typeof(s))); 3639 static assert(hasLength!(typeof(s))); 3640 } 3641 } 3642 3643 /// `byCodeUnit` does no Unicode decoding 3644 @safe unittest 3645 { 3646 string noel1 = "noe\u0308l"; // noël using e + combining diaeresis 3647 assert(noel1.byCodeUnit[2] != 'ë'); 3648 assert(noel1.byCodeUnit[2] == 'e'); 3649 3650 string noel2 = "no\u00EBl"; // noël using a precomposed ë character 3651 // Because string is UTF-8, the code unit at index 2 is just 3652 // the first of a sequence that encodes 'ë' 3653 assert(noel2.byCodeUnit[2] != 'ë'); 3654 } 3655 3656 /// `byCodeUnit` exposes a `source` property when wrapping narrow strings. 3657 @safe unittest 3658 { 3659 import std.algorithm.comparison : equal; 3660 import std.range : popFrontN; 3661 import std.traits : isAutodecodableString; 3662 { 3663 auto range = byCodeUnit("hello world"); 3664 range.popFrontN(3); 3665 assert(equal(range.save, "lo world")); 3666 static if (isAutodecodableString!string) // only enabled with autodecoding 3667 { 3668 string str = range.source; 3669 assert(str == "lo world"); 3670 } 3671 } 3672 // source only exists if the range was wrapped 3673 { 3674 auto range = byCodeUnit("hello world"d); 3675 static assert(!__traits(compiles, range.source)); 3676 } 3677 } 3678 3679 @safe pure nothrow @nogc unittest 3680 { 3681 import std.range; 3682 { 3683 enum testStr = "𐁄𐂌𐃯 hello ディラン"; 3684 char[testStr.length] s; 3685 int i; 3686 foreach (c; testStr.byCodeUnit().byCodeUnit()) 3687 { 3688 s[i++] = c; 3689 } 3690 assert(s == testStr); 3691 } 3692 { 3693 enum testStr = "𐁄𐂌𐃯 hello ディラン"w; 3694 wchar[testStr.length] s; 3695 int i; 3696 foreach (c; testStr.byCodeUnit().byCodeUnit()) 3697 { 3698 s[i++] = c; 3699 } 3700 assert(s == testStr); 3701 } 3702 { 3703 enum testStr = "𐁄𐂌𐃯 hello ディラン"d; 3704 dchar[testStr.length] s; 3705 int i; 3706 foreach (c; testStr.byCodeUnit().byCodeUnit()) 3707 { 3708 s[i++] = c; 3709 } 3710 assert(s == testStr); 3711 } 3712 { 3713 auto bcu = "hello".byCodeUnit(); 3714 assert(bcu.length == 5); 3715 assert(bcu[3] == 'l'); 3716 assert(bcu[2 .. 4][1] == 'l'); 3717 } 3718 { 3719 char[5] orig = "hello"; 3720 auto bcu = orig[].byCodeUnit(); 3721 bcu.front = 'H'; 3722 assert(bcu.front == 'H'); 3723 bcu[1] = 'E'; 3724 assert(bcu[1] == 'E'); 3725 } 3726 { 3727 auto bcu = "hello".byCodeUnit().byCodeUnit(); 3728 static assert(isForwardRange!(typeof(bcu))); 3729 static assert(is(typeof(bcu) == struct) == isAutodecodableString!string); 3730 auto s = bcu.save; 3731 bcu.popFront(); 3732 assert(s.front == 'h'); 3733 } 3734 { 3735 auto bcu = "hello".byCodeUnit(); 3736 static assert(hasSlicing!(typeof(bcu))); 3737 static assert(isBidirectionalRange!(typeof(bcu))); 3738 static assert(is(typeof(bcu) == struct) == isAutodecodableString!string); 3739 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit()))); 3740 auto ret = bcu.retro; 3741 assert(ret.front == 'o'); 3742 ret.popFront(); 3743 assert(ret.front == 'l'); 3744 } 3745 { 3746 auto bcu = "κόσμε"w.byCodeUnit(); 3747 static assert(hasSlicing!(typeof(bcu))); 3748 static assert(isBidirectionalRange!(typeof(bcu))); 3749 static assert(is(typeof(bcu) == struct) == isAutodecodableString!wstring); 3750 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit()))); 3751 auto ret = bcu.retro; 3752 assert(ret.front == 'ε'); 3753 ret.popFront(); 3754 assert(ret.front == 'μ'); 3755 } 3756 { 3757 static struct Stringish 3758 { 3759 string s; 3760 alias s this; 3761 } 3762 3763 auto orig = Stringish("\U0010fff8 𐁊 foo 𐂓"); 3764 auto bcu = orig.byCodeUnit(); 3765 static assert(is(typeof(bcu) == struct)); 3766 static assert(!is(typeof(bcu) == Stringish) == isAutodecodableString!Stringish); 3767 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit()))); 3768 static assert(is(ElementType!(typeof(bcu)) == immutable char)); 3769 assert(bcu.front == cast(char) 244); 3770 } 3771 { 3772 static struct WStringish 3773 { 3774 wstring s; 3775 alias s this; 3776 } 3777 3778 auto orig = WStringish("\U0010fff8 𐁊 foo 𐂓"w); 3779 auto bcu = orig.byCodeUnit(); 3780 static assert(is(typeof(bcu) == struct)); 3781 static assert(!is(typeof(bcu) == WStringish) == isAutodecodableString!WStringish); 3782 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit()))); 3783 static assert(is(ElementType!(typeof(bcu)) == immutable wchar)); 3784 assert(bcu.front == cast(wchar) 56319); 3785 } 3786 { 3787 static struct DStringish 3788 { 3789 dstring s; 3790 alias s this; 3791 } 3792 3793 auto orig = DStringish("\U0010fff8 𐁊 foo 𐂓"d); 3794 auto bcu = orig.byCodeUnit(); 3795 static assert(is(typeof(bcu) == dstring)); 3796 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit()))); 3797 static assert(is(ElementType!(typeof(bcu)) == immutable dchar)); 3798 assert(bcu.front == cast(dchar) 1114104); 3799 } 3800 { 3801 static struct FuncStringish 3802 { 3803 string str; 3804 string s() pure nothrow @nogc { return str; } 3805 alias s this; 3806 } 3807 3808 auto orig = FuncStringish("\U0010fff8 𐁊 foo 𐂓"); 3809 auto bcu = orig.byCodeUnit(); 3810 static if (isAutodecodableString!FuncStringish) 3811 static assert(is(typeof(bcu) == struct)); 3812 else 3813 static assert(is(typeof(bcu) == string)); 3814 static assert(!is(typeof(bcu) == FuncStringish)); 3815 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit()))); 3816 static assert(is(ElementType!(typeof(bcu)) == immutable char)); 3817 assert(bcu.front == cast(char) 244); 3818 } 3819 { 3820 static struct Range 3821 { 3822 string data; 3823 bool empty() pure nothrow @nogc { return data.empty; } 3824 char front() pure nothrow @nogc { return data[0]; } 3825 void popFront() pure nothrow @nogc { data = data[1 .. $]; } 3826 } 3827 3828 auto orig = Range("\U0010fff8 𐁊 foo 𐂓"); 3829 auto bcu = orig.byCodeUnit(); 3830 static assert(is(typeof(bcu) == Range)); 3831 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit()))); 3832 static assert(is(ElementType!(typeof(bcu)) == char)); 3833 assert(bcu.front == cast(char) 244); 3834 } 3835 { 3836 static struct WRange 3837 { 3838 wstring data; 3839 bool empty() pure nothrow @nogc { return data.empty; } 3840 wchar front() pure nothrow @nogc { return data[0]; } 3841 void popFront() pure nothrow @nogc { data = data[1 .. $]; } 3842 } 3843 3844 auto orig = WRange("\U0010fff8 𐁊 foo 𐂓"w); 3845 auto bcu = orig.byCodeUnit(); 3846 static assert(is(typeof(bcu) == WRange)); 3847 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit()))); 3848 static assert(is(ElementType!(typeof(bcu)) == wchar)); 3849 assert(bcu.front == 56319); 3850 } 3851 { 3852 static struct DRange 3853 { 3854 dstring data; 3855 bool empty() pure nothrow @nogc { return data.empty; } 3856 dchar front() pure nothrow @nogc { return data[0]; } 3857 void popFront() pure nothrow @nogc { data = data[1 .. $]; } 3858 } 3859 3860 auto orig = DRange("\U0010fff8 𐁊 foo 𐂓"d); 3861 auto bcu = orig.byCodeUnit(); 3862 static assert(is(typeof(bcu) == DRange)); 3863 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit()))); 3864 static assert(is(ElementType!(typeof(bcu)) == dchar)); 3865 assert(bcu.front == 1114104); 3866 } 3867 { 3868 static struct RangeAndStringish 3869 { 3870 bool empty() pure nothrow @nogc { return data.empty; } 3871 char front() pure nothrow @nogc { return data[0]; } 3872 void popFront() pure nothrow @nogc { data = data[1 .. $]; } 3873 3874 string data; 3875 string s; 3876 alias s this; 3877 } 3878 3879 auto orig = RangeAndStringish("test.d", "other"); 3880 auto bcu = orig.byCodeUnit(); 3881 static assert(is(typeof(bcu) == RangeAndStringish)); 3882 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit()))); 3883 static assert(is(ElementType!(typeof(bcu)) == char)); 3884 assert(bcu.front == 't'); 3885 } 3886 { 3887 static struct WRangeAndStringish 3888 { 3889 bool empty() pure nothrow @nogc { return data.empty; } 3890 wchar front() pure nothrow @nogc { return data[0]; } 3891 void popFront() pure nothrow @nogc { data = data[1 .. $]; } 3892 3893 wstring data; 3894 wstring s; 3895 alias s this; 3896 } 3897 3898 auto orig = WRangeAndStringish("test.d"w, "other"w); 3899 auto bcu = orig.byCodeUnit(); 3900 static assert(is(typeof(bcu) == WRangeAndStringish)); 3901 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit()))); 3902 static assert(is(ElementType!(typeof(bcu)) == wchar)); 3903 assert(bcu.front == 't'); 3904 } 3905 { 3906 static struct DRangeAndStringish 3907 { 3908 bool empty() pure nothrow @nogc { return data.empty; } 3909 dchar front() pure nothrow @nogc { return data[0]; } 3910 void popFront() pure nothrow @nogc { data = data[1 .. $]; } 3911 3912 dstring data; 3913 dstring s; 3914 alias s this; 3915 } 3916 3917 auto orig = DRangeAndStringish("test.d"d, "other"d); 3918 auto bcu = orig.byCodeUnit(); 3919 static assert(is(typeof(bcu) == DRangeAndStringish)); 3920 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit()))); 3921 static assert(is(ElementType!(typeof(bcu)) == dchar)); 3922 assert(bcu.front == 't'); 3923 } 3924 { 3925 enum Enum : string { a = "test.d" } 3926 3927 auto orig = Enum.a; 3928 auto bcu = orig.byCodeUnit(); 3929 static assert(!is(typeof(bcu) == Enum)); 3930 static if (isAutodecodableString!Enum) 3931 static assert(is(typeof(bcu) == struct)); 3932 else 3933 static assert(is(typeof(bcu) == string)); 3934 static assert(is(ElementType!(typeof(bcu)) == immutable char)); 3935 assert(bcu.front == 't'); 3936 } 3937 { 3938 enum WEnum : wstring { a = "test.d"w } 3939 3940 auto orig = WEnum.a; 3941 auto bcu = orig.byCodeUnit(); 3942 static assert(!is(typeof(bcu) == WEnum)); 3943 static if (isAutodecodableString!WEnum) 3944 static assert(is(typeof(bcu) == struct)); 3945 else 3946 static assert(is(typeof(bcu) == wstring)); 3947 static assert(is(ElementType!(typeof(bcu)) == immutable wchar)); 3948 assert(bcu.front == 't'); 3949 } 3950 { 3951 enum DEnum : dstring { a = "test.d"d } 3952 3953 auto orig = DEnum.a; 3954 auto bcu = orig.byCodeUnit(); 3955 static assert(is(typeof(bcu) == dstring)); 3956 static assert(is(ElementType!(typeof(bcu)) == immutable dchar)); 3957 assert(bcu.front == 't'); 3958 } 3959 3960 static if (autodecodeStrings) 3961 { 3962 static assert(!is(typeof(byCodeUnit("hello")) == string)); 3963 static assert(!is(typeof(byCodeUnit("hello"w)) == wstring)); 3964 } 3965 else 3966 { 3967 static assert(is(typeof(byCodeUnit("hello")) == string)); 3968 static assert(is(typeof(byCodeUnit("hello"w)) == wstring)); 3969 } 3970 static assert(is(typeof(byCodeUnit("hello"d)) == dstring)); 3971 3972 static assert(!__traits(compiles, byCodeUnit((char[5]).init))); 3973 static assert(!__traits(compiles, byCodeUnit((wchar[5]).init))); 3974 static assert(!__traits(compiles, byCodeUnit((dchar[5]).init))); 3975 3976 enum SEnum : char[5] { a = "hello" } 3977 enum WSEnum : wchar[5] { a = "hello"w } 3978 enum DSEnum : dchar[5] { a = "hello"d } 3979 3980 static assert(!__traits(compiles, byCodeUnit(SEnum.a))); 3981 static assert(!__traits(compiles, byCodeUnit(WSEnum.a))); 3982 static assert(!__traits(compiles, byCodeUnit(DSEnum.a))); 3983 } 3984 3985 /**************************** 3986 * Iterate an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) 3987 * of characters by char, wchar, or dchar. 3988 * These aliases simply forward to $(LREF byUTF) with the 3989 * corresponding C argument. 3990 * 3991 * Params: 3992 * r = input range of characters, or array of characters 3993 */ 3994 alias byChar = byUTF!char; 3995 3996 /// Ditto 3997 alias byWchar = byUTF!wchar; 3998 3999 /// Ditto 4000 alias byDchar = byUTF!dchar; 4001 4002 @safe pure nothrow @nogc unittest 4003 { 4004 { 4005 char[5] s; 4006 int i; 4007 foreach (c; "hello".byChar.byChar()) 4008 { 4009 //writefln("[%d] '%c'", i, c); 4010 s[i++] = c; 4011 } 4012 assert(s == "hello"); 4013 } 4014 { 4015 char[5+2+3+4+3+3] s; 4016 int i; 4017 dchar[10] a; 4018 a[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d; 4019 a[8] = 0xD800; // invalid 4020 a[9] = cast(dchar) 0x110000; // invalid 4021 foreach (c; a[].byChar()) 4022 { 4023 //writefln("[%d] '%c'", i, c); 4024 s[i++] = c; 4025 } 4026 assert(s == "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD"); 4027 } 4028 { 4029 auto r = "hello"w.byChar(); 4030 r.popFront(); 4031 r.popFront(); 4032 assert(r.front == 'l'); 4033 } 4034 { 4035 auto r = "hello"d.byChar(); 4036 r.popFront(); 4037 r.popFront(); 4038 assert(r.front == 'l'); 4039 } 4040 { 4041 auto r = "hello"d.byChar(); 4042 assert(isForwardRange!(typeof(r))); 4043 auto s = r.save; 4044 r.popFront(); 4045 assert(s.front == 'h'); 4046 } 4047 } 4048 4049 @safe pure nothrow @nogc unittest 4050 { 4051 { 4052 wchar[11] s; 4053 int i; 4054 dchar[10] a; 4055 a[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d; 4056 a[8] = 0xD800; // invalid 4057 a[9] = cast(dchar) 0x110000; // invalid 4058 foreach (c; a[].byWchar()) 4059 { 4060 //writefln("[%d] '%c' x%x", i, c, c); 4061 s[i++] = c; 4062 } 4063 foreach (j, wchar c; "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD"w) 4064 { 4065 //writefln("[%d] '%c' x%x", j, c, c); 4066 } 4067 assert(s == "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD"w); 4068 } 4069 4070 { 4071 auto r = "hello".byWchar(); 4072 r.popFront(); 4073 r.popFront(); 4074 assert(r.front == 'l'); 4075 } 4076 { 4077 auto r = "hello"d.byWchar(); 4078 r.popFront(); 4079 r.popFront(); 4080 assert(r.front == 'l'); 4081 } 4082 { 4083 auto r = "hello"d.byWchar(); 4084 assert(isForwardRange!(typeof(r))); 4085 auto s = r.save; 4086 r.popFront(); 4087 assert(s.front == 'h'); 4088 } 4089 } 4090 4091 @safe pure nothrow @nogc unittest 4092 { 4093 { 4094 dchar[9] s; 4095 int i; 4096 string a = "hello\u07FF\uD7FF\U00010000\U0010FFFF"; // 1,2,3,4 byte sequences 4097 foreach (c; a.byDchar()) 4098 { 4099 s[i++] = c; 4100 } 4101 assert(s == "hello\u07FF\uD7FF\U00010000\U0010FFFF"d); 4102 } 4103 { 4104 foreach (s; invalidUTFstrings!char()) 4105 { 4106 auto r = s.byDchar(); 4107 assert(!r.empty); 4108 assert(r.front == r.front); 4109 dchar c = r.front; 4110 assert(c == replacementDchar); 4111 } 4112 } 4113 { 4114 auto r = "hello".byDchar(); 4115 r.popFront(); 4116 r.popFront(); 4117 assert(r.front == 'l'); 4118 } 4119 4120 { 4121 dchar[8] s; 4122 int i; 4123 wstring a = "hello\u07FF\uD7FF\U0010FFFF"w; 4124 foreach (c; a.byDchar()) 4125 { 4126 //writefln("[%d] '%c' x%x", i, c, c); 4127 s[i++] = c; 4128 } 4129 assert(s == "hello\u07FF\uD7FF\U0010FFFF"d); 4130 } 4131 { 4132 foreach (s; invalidUTFstrings!wchar()) 4133 { 4134 auto r = s.byDchar(); 4135 assert(!r.empty); 4136 assert(r.front == r.front); 4137 dchar c = r.front; 4138 assert(c == replacementDchar); 4139 } 4140 } 4141 { 4142 wchar[2] ws; 4143 ws[0] = 0xD800; 4144 ws[1] = 0xDD00; // correct surrogate pair 4145 auto r = ws[].byDchar(); 4146 assert(!r.empty); 4147 assert(r.front == r.front); 4148 dchar c = r.front; 4149 assert(c == '\U00010100'); 4150 } 4151 { 4152 auto r = "hello"w.byDchar(); 4153 r.popFront(); 4154 r.popFront(); 4155 assert(r.front == 'l'); 4156 } 4157 4158 { 4159 dchar[5] s; 4160 int i; 4161 dstring a = "hello"d; 4162 foreach (c; a.byDchar.byDchar()) 4163 { 4164 //writefln("[%d] '%c' x%x", i, c, c); 4165 s[i++] = c; 4166 } 4167 assert(s == "hello"d); 4168 } 4169 { 4170 auto r = "hello".byDchar(); 4171 assert(isForwardRange!(typeof(r))); 4172 auto s = r.save; 4173 r.popFront(); 4174 assert(s.front == 'h'); 4175 } 4176 { 4177 auto r = "hello"w.byDchar(); 4178 assert(isForwardRange!(typeof(r))); 4179 auto s = r.save; 4180 r.popFront(); 4181 assert(s.front == 'h'); 4182 } 4183 } 4184 4185 // test pure, @safe, nothrow, @nogc correctness of byChar/byWchar/byDchar, 4186 // which needs to support ranges with and without those attributes 4187 4188 pure @safe nothrow @nogc unittest 4189 { 4190 dchar[5] s = "hello"d; 4191 foreach (c; s[].byChar()) { } 4192 foreach (c; s[].byWchar()) { } 4193 foreach (c; s[].byDchar()) { } 4194 } 4195 4196 version (StdUnittest) 4197 private int impureVariable; 4198 4199 @system unittest 4200 { 4201 static struct ImpureThrowingSystemRange(Char) 4202 { 4203 @property bool empty() const { return true; } 4204 @property Char front() const { return Char.init; } 4205 void popFront() 4206 { 4207 impureVariable++; 4208 throw new Exception("only for testing nothrow"); 4209 } 4210 } 4211 4212 foreach (Char; AliasSeq!(char, wchar, dchar)) 4213 { 4214 ImpureThrowingSystemRange!Char range; 4215 foreach (c; range.byChar()) { } 4216 foreach (c; range.byWchar()) { } 4217 foreach (c; range.byDchar()) { } 4218 } 4219 } 4220 4221 /**************************** 4222 * Iterate an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) 4223 * of characters by char type `C` by encoding the elements of the range. 4224 * 4225 * UTF sequences that cannot be converted to the specified encoding are either 4226 * replaced by U+FFFD per "5.22 Best Practice for U+FFFD Substitution" 4227 * of the Unicode Standard 6.2 or result in a thrown UTFException. 4228 * Hence byUTF is not symmetric. 4229 * This algorithm is lazy, and does not allocate memory. 4230 * `@nogc`, `pure`-ity, `nothrow`, and `@safe`-ty are inferred from the 4231 * `r` parameter. 4232 * 4233 * Params: 4234 * C = `char`, `wchar`, or `dchar` 4235 * useReplacementDchar = UseReplacementDchar.yes means replace invalid UTF with `replacementDchar`, 4236 * UseReplacementDchar.no means throw `UTFException` for invalid UTF 4237 * 4238 * Throws: 4239 * `UTFException` if invalid UTF sequence and `useReplacementDchar` is set to `UseReplacementDchar.yes` 4240 * 4241 * GC: 4242 * Does not use GC if `useReplacementDchar` is set to `UseReplacementDchar.no` 4243 * 4244 * Returns: 4245 * A forward range if `R` is a range and not auto-decodable, as defined by 4246 * $(REF isAutodecodableString, std, traits), and if the base range is 4247 * also a forward range. 4248 * 4249 * Or, if `R` is a range and it is auto-decodable and 4250 * `is(ElementEncodingType!typeof(r) == C)`, then the range is passed 4251 * to $(LREF byCodeUnit). 4252 * 4253 * Otherwise, an input range of characters. 4254 */ 4255 template byUTF(C, UseReplacementDchar useReplacementDchar = Yes.useReplacementDchar) 4256 if (isSomeChar!C) 4257 { 4258 static if (!is(Unqual!C == C)) 4259 alias byUTF = byUTF!(Unqual!C); 4260 else: 4261 4262 auto ref byUTF(R)(R r) 4263 if (isAutodecodableString!R && isInputRange!R && isSomeChar!(ElementEncodingType!R)) 4264 { 4265 return byUTF(r.byCodeUnit()); 4266 } 4267 4268 auto ref byUTF(R)(R r) 4269 if (!isAutodecodableString!R && isInputRange!R && isSomeChar!(ElementEncodingType!R)) 4270 { 4271 alias RC = Unqual!(ElementEncodingType!R); 4272 4273 static if (is(RC == C)) 4274 { 4275 return r.byCodeUnit(); 4276 } 4277 else static if (is(C == dchar)) 4278 { 4279 static struct Result 4280 { 4281 enum Empty = uint.max; // range is empty or just constructed 4282 4283 this(return R r) 4284 { 4285 this.r = r; 4286 } 4287 4288 this(return R r, uint buff) 4289 { 4290 this.r = r; 4291 this.buff = buff; 4292 } 4293 4294 4295 @property bool empty() 4296 { 4297 return buff == Empty && r.empty; 4298 } 4299 4300 @property dchar front() scope // 'scope' required by call to decodeFront() below 4301 { 4302 if (buff == Empty) 4303 { 4304 auto c = r.front; 4305 4306 static if (is(RC == wchar)) 4307 enum firstMulti = 0xD800; // First high surrogate. 4308 else 4309 enum firstMulti = 0x80; // First non-ASCII. 4310 if (c < firstMulti) 4311 { 4312 r.popFront; 4313 buff = cast(dchar) c; 4314 } 4315 else 4316 { 4317 buff = () @trusted { return decodeFront!(useReplacementDchar)(r); }(); 4318 } 4319 } 4320 return cast(dchar) buff; 4321 } 4322 4323 void popFront() 4324 { 4325 if (buff == Empty) 4326 front(); 4327 buff = Empty; 4328 } 4329 4330 static if (isForwardRange!R) 4331 { 4332 @property auto save() 4333 { 4334 return Result(r.save, buff); 4335 } 4336 } 4337 4338 private: 4339 4340 R r; 4341 uint buff = Empty; // one character lookahead buffer 4342 } 4343 4344 return Result(r); 4345 } 4346 else 4347 { 4348 static struct Result 4349 { 4350 this(return R r) 4351 { 4352 this.r = r; 4353 } 4354 4355 this(return R r, ushort pos, ushort fill, C[4 / C.sizeof] buf) 4356 { 4357 this.r = r; 4358 this.pos = pos; 4359 this.fill = fill; 4360 this.buf = buf; 4361 } 4362 4363 @property bool empty() 4364 { 4365 return pos == fill && r.empty; 4366 } 4367 4368 @property auto front() scope // 'scope' required by call to decodeFront() below 4369 { 4370 if (pos == fill) 4371 { 4372 pos = 0; 4373 auto c = r.front; 4374 4375 static if (C.sizeof >= 2 && RC.sizeof >= 2) 4376 enum firstMulti = 0xD800; // First high surrogate. 4377 else 4378 enum firstMulti = 0x80; // First non-ASCII. 4379 if (c < firstMulti) 4380 { 4381 fill = 1; 4382 r.popFront; 4383 buf[pos] = cast(C) c; 4384 } 4385 else 4386 { 4387 static if (is(RC == dchar)) 4388 { 4389 r.popFront; 4390 dchar dc = c; 4391 } 4392 else 4393 dchar dc = () @trusted { return decodeFront!(useReplacementDchar)(r); }(); 4394 fill = cast(ushort) encode!(useReplacementDchar)(buf, dc); 4395 } 4396 } 4397 return buf[pos]; 4398 } 4399 4400 void popFront() 4401 { 4402 if (pos == fill) 4403 front; 4404 ++pos; 4405 } 4406 4407 static if (isForwardRange!R) 4408 { 4409 @property auto save() 4410 { 4411 return Result(r.save, pos, fill, buf); 4412 } 4413 } 4414 4415 private: 4416 4417 R r; 4418 ushort pos, fill; 4419 C[4 / C.sizeof] buf = void; 4420 } 4421 4422 return Result(r); 4423 } 4424 } 4425 } 4426 4427 /// 4428 @safe pure nothrow unittest 4429 { 4430 import std.algorithm.comparison : equal; 4431 4432 // hellö as a range of `char`s, which are UTF-8 4433 assert("hell\u00F6".byUTF!char().equal(['h', 'e', 'l', 'l', 0xC3, 0xB6])); 4434 4435 // `wchar`s are able to hold the ö in a single element (UTF-16 code unit) 4436 assert("hell\u00F6".byUTF!wchar().equal(['h', 'e', 'l', 'l', 'ö'])); 4437 4438 // 𐐷 is four code units in UTF-8, two in UTF-16, and one in UTF-32 4439 assert("𐐷".byUTF!char().equal([0xF0, 0x90, 0x90, 0xB7])); 4440 assert("𐐷".byUTF!wchar().equal([0xD801, 0xDC37])); 4441 assert("𐐷".byUTF!dchar().equal([0x00010437])); 4442 } 4443 4444 /// 4445 @safe unittest 4446 { 4447 import std.algorithm.comparison : equal; 4448 import std.exception : assertThrown; 4449 4450 assert("hello\xF0betty".byChar.byUTF!(dchar, UseReplacementDchar.yes).equal("hello\uFFFDetty")); 4451 assertThrown!UTFException("hello\xF0betty".byChar.byUTF!(dchar, UseReplacementDchar.no).equal("hello betty")); 4452 }