std.utf source code

1 // Written in the D programming language.
2 
3 /++
4     Encode and decode UTF-8, UTF-16 and UTF-32 strings.
5 
6     UTF character support is restricted to
7     $(D '\u0000' &lt;= character &lt;= '\U0010FFFF').
8 
9 $(SCRIPT inhibitQuickIndex = 1;)
10 $(DIVC quickindex,
11 $(BOOKTABLE,
12 $(TR $(TH Category) $(TH Functions))
13 $(TR $(TD Decode) $(TD
14     $(LREF decode)
15     $(LREF decodeFront)
16 ))
17 $(TR $(TD Lazy decode) $(TD
18     $(LREF byCodeUnit)
19     $(LREF byChar)
20     $(LREF byWchar)
21     $(LREF byDchar)
22     $(LREF byUTF)
23 ))
24 $(TR $(TD Encode) $(TD
25     $(LREF encode)
26     $(LREF toUTF8)
27     $(LREF toUTF16)
28     $(LREF toUTF32)
29     $(LREF toUTFz)
30     $(LREF toUTF16z)
31 ))
32 $(TR $(TD Length) $(TD
33     $(LREF codeLength)
34     $(LREF count)
35     $(LREF stride)
36     $(LREF strideBack)
37 ))
38 $(TR $(TD Index) $(TD
39     $(LREF toUCSindex)
40     $(LREF toUTFindex)
41 ))
42 $(TR $(TD Validation) $(TD
43     $(LREF isValidDchar)
44     $(LREF validate)
45 ))
46 $(TR $(TD Miscellaneous) $(TD
47     $(LREF replacementDchar)
48     $(LREF UseReplacementDchar)
49     $(LREF UTFException)
50 ))
51 ))
52     See_Also:
53         $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
54         $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
55         $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
56     Copyright: Copyright The D Language Foundation 2000 - 2012.
57     License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
58     Authors:   $(HTTP digitalmars.com, Walter Bright) and
59                $(HTTP jmdavisprog.com, Jonathan M Davis)
60     Source:    $(PHOBOSSRC std/utf.d)
61    +/
62 module std.utf;
63 
64 import std.exception : basicExceptionCtors;
65 import core.exception : UnicodeException;
66 import std.meta : AliasSeq;
67 import std.range.primitives;
68 import std.traits : isAutodecodableString, isPointer, isSomeChar,
69     isSomeString, isStaticArray, Unqual, isConvertibleToString;
70 import std.typecons : Flag, Yes, No;
71 
72 
73 /++
74     Exception thrown on errors in std.utf functions.
75   +/
class UTFException : UnicodeException
{
    import core.internal.string : unsignedToTempString, UnsignedStringBuf;

    // Code units of the offending sequence; only the first `len` entries are
    // meaningful.
    uint[4] sequence;
    // Number of entries of `sequence` that have been set (0 if none were).
    size_t  len;

    // Stores up to four code units of the invalid sequence so that
    // `toString` can report them. Returns `this`.
    @safe pure nothrow @nogc
    UTFException setSequence(scope uint[] data...)
    {
        assert(data.length <= 4);

        // Clamp as well, so builds with asserts disabled never slice past
        // the end of `sequence`.
        len = data.length < 4 ? data.length : 4;
        sequence[0 .. len] = data[0 .. len];

        return this;
    }

    // FIXME: Use std.exception.basicExceptionCtors here once
    // https://issues.dlang.org/show_bug.cgi?id=11500 is fixed

    /**
    Standard exception constructors.

    The overload without `index` passes index 0 to the base class and leaves
    `msg` untouched; the overload with `index` appends `" (at index N)"` to
    `msg` before forwarding it.
     */
    this(string msg, string file = __FILE__, size_t line = __LINE__,
         Throwable next = null) @nogc @safe pure nothrow
    {
        super(msg, 0, file, line, next);
    }
    /// ditto
    this(string msg, size_t index, string file = __FILE__,
         size_t line = __LINE__, Throwable next = null) @safe pure nothrow
    {
        UnsignedStringBuf buf = void;
        msg ~= " (at index " ~ unsignedToTempString(index, buf) ~ ")";
        super(msg, index, file, line, next);
    }

    /**
    Returns:
        A `string` detailing the invalid UTF sequence. If no sequence was
        recorded via `setSequence`, the inherited `toString` output is
        returned instead.
     */
    override string toString() const
    {
        if (len == 0)
        {
            /* Exception.toString() is not marked as const, although
             * it is const-compatible.
             */
            //return super.toString();
            auto e = () @trusted { return cast(Exception) super; } ();
            return e.toString();
        }

        string result = "Invalid UTF sequence:";

        // Each recorded unit is printed as " " ~ hex ~ "x", with the hex part
        // zero-padded to at least two digits.
        foreach (i; sequence[0 .. len])
        {
            UnsignedStringBuf buf = void;
            result ~= ' ';
            auto h = unsignedToTempString!16(i, buf);
            if (h.length == 1)
                result ~= '0';
            result ~= h;
            result ~= 'x';
        }

        if (super.msg.length > 0)
        {
            result ~= " - ";
            result ~= super.msg;
        }

        return result;
    }
}
152 
153 ///
@safe unittest
{
    import std.exception : assertThrown;

    char[4] buf;

    // Both ends of the high- and low-surrogate ranges, plus the first value
    // beyond U+10FFFF, must all be rejected by encode.
    foreach (dchar invalid; [cast(dchar) 0xD800, cast(dchar) 0xDBFF,
                             cast(dchar) 0xDC00, cast(dchar) 0xDFFF,
                             cast(dchar) 0x110000])
    {
        assertThrown!UTFException(encode(buf, invalid));
    }
}
165 
166 /*
167    Provide array of invalidly encoded UTF strings. Useful for testing.
168 
169    Params:
170         Char = char, wchar, or dchar
171 
172    Returns:
173         an array of invalidly encoded UTF strings
174  */
175 
package auto invalidUTFstrings(Char)() @safe pure @nogc nothrow
if (isSomeChar!Char)
{
    // One table per code unit type; each entry is a deliberately malformed
    // sequence in that encoding.
    static if (is(Char == dchar))
    {
        static immutable dstring[3] table =
        [
            [ cast(dchar) 0x110000 ],   // beyond U+10FFFF
            [ cast(dchar) 0x00D800 ],   // surrogate code point
            [ cast(dchar) 0x00DFFF ],   // surrogate code point
        ];

        return table;
    }
    else static if (is(Char == wchar))
    {
        static immutable wstring[5] table =
        [
            [ cast(wchar) 0xDC00 ],                      // 0xDC00 on its own
            [ cast(wchar) 0xDFFF ],                      // 0xDFFF on its own
            [ cast(wchar) 0xDBFF, cast(wchar) 0xDBFF ],  // 0xDBFF followed by 0xDBFF
            [ cast(wchar) 0xDBFF, cast(wchar) 0xE000 ],  // 0xDBFF followed by 0xE000
            [ cast(wchar) 0xD800 ],                      // 0xD800 on its own
        ];

        return table[];
    }
    else static if (is(Char == char))
    {
        enum surrogate = 0xDC00;      // invalid surrogate value
        enum beyondMax = 0x110000;    // out of range

        static immutable string[8] table =
        [
            "\x80",             // not a start byte
            "\xC0",             // truncated
            "\xC0\xC0",         // invalid continuation
            "\xF0\x82\x82\xAC", // overlong
            [
              // three-byte layout applied to the surrogate value
              cast(char)(0xE0 | (surrogate >> 12)),
              cast(char)(0x80 | ((surrogate >> 6) & 0x3F)),
              cast(char)(0x80 | (surrogate & 0x3F))
            ],
            [
              // four-byte layout applied to the out-of-range value
              cast(char)(0xF0 | (beyondMax >> 18)),
              cast(char)(0x80 | ((beyondMax >> 12) & 0x3F)),
              cast(char)(0x80 | ((beyondMax >> 6) & 0x3F)),
              cast(char)(0x80 | (beyondMax & 0x3F))
            ],
            [
              cast(char)(0xF8 | 3),     // 5 byte encoding
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
            ],
            [
              cast(char)(0xFC | 3),     // 6 byte encoding
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
            ],
        ];

        return table[];
    }
    else
        static assert(0);
}
259 
260 /++
261     Check whether the given Unicode code point is valid.
262 
263     Params:
264         c = code point to check
265 
266     Returns:
267         `true` if and only if `c` is a valid Unicode code point
268 
269     Note:
270     `'\uFFFE'` and `'\uFFFF'` are considered valid by `isValidDchar`,
271     as they are permitted for internal use by an application, but they are
272     not allowed for interchange by the Unicode standard.
273   +/
bool isValidDchar(dchar c) pure nothrow @safe @nogc
{
    // Surrogate code points (0xD800 .. 0xDFFF) are never valid.
    if (c >= 0xD800 && c <= 0xDFFF)
        return false;

    // Everything else is valid up to and including U+10FFFF.
    return c <= 0x10FFFF;
}
278 
279 ///
@safe @nogc pure nothrow unittest
{
    // Static tables keep the example @nogc.
    static immutable dchar[2] accepted = [cast(dchar) 0x41, cast(dchar) 0x00];
    static immutable dchar[2] rejected = [cast(dchar) 0xD800, cast(dchar) 0x11FFFF];

    foreach (c; accepted)
        assert( isValidDchar(c));
    foreach (c; rejected)
        assert(!isValidDchar(c));
}
287 
pure nothrow @safe @nogc unittest
{
    import std.exception;

    assertCTFEable!(
    {
    // Accepted values, including U+FFFE / U+FFFF and the top of the range.
    assert( isValidDchar(cast(dchar)'a'));
    assert( isValidDchar(cast(dchar) 0x00FFFE));
    assert( isValidDchar(cast(dchar) 0x00FFFF));
    assert( isValidDchar(cast(dchar) 0x01FFFF));
    assert( isValidDchar(cast(dchar) 0x10FFFF));

    // Rejected: both ends of each surrogate half.
    assert(!isValidDchar(cast(dchar) 0x00D800));
    assert(!isValidDchar(cast(dchar) 0x00DBFF));
    assert(!isValidDchar(cast(dchar) 0x00DC00));
    assert(!isValidDchar(cast(dchar) 0x00DFFF));

    // Rejected: values above U+10FFFF.
    assert(!isValidDchar(cast(dchar) 0x110000));
    assert(!isValidDchar(cast(dchar) 0x1FFFFF));
    });
}
308 
309 
310 /++
311     Calculate the length of the UTF sequence starting at `index`
312     in `str`.
313 
314     Params:
315         str = $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
316         of UTF code units. Must be random access if `index` is passed
317         index = starting index of UTF sequence (default: `0`)
318 
319     Returns:
320         The number of code units in the UTF sequence. For UTF-8, this is a
321         value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
322         For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.
323 
324     Throws:
325         May throw a `UTFException` if `str[index]` is not the start of a
326         valid UTF sequence.
327 
328     Note:
329         `stride` will only analyze the first `str[index]` element. It
330         will not fully verify the validity of the UTF sequence, nor even verify
331         the presence of the sequence: it will not actually guarantee that
332         $(D index + stride(str, index) <= str.length).
333   +/
uint stride(S)(auto ref S str, size_t index)
if (is(S : const char[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable char)))
{
    // The bounds check is only compiled in when `str` exposes a length that
    // converts to ulong.
    static if (is(typeof(str.length) : ulong))
        assert(index < str.length, "Past the end of the UTF-8 sequence");

    immutable lead = str[index];

    // Values below 0x80 are a single code unit; anything else is classified
    // by strideImpl from the lead byte alone.
    return lead < 0x80 ? 1 : strideImpl(lead, index);
}
347 
348 /// Ditto
uint stride(S)(auto ref S str)
if (is(S : const char[]) ||
    (isInputRange!S && is(immutable ElementType!S == immutable char)))
{
    // Arrays are indexed directly; other ranges are read through `front`.
    static if (is(S : const char[]))
        immutable lead = str[0];
    else
        immutable lead = str.front;

    // Values below 0x80 are a single code unit; anything else is classified
    // by strideImpl, reported at index 0.
    return lead < 0x80 ? 1 : strideImpl(lead, 0);
}
363 
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;

    // Verifies that every UTF-8 `stride` overload reports, for the sequence
    // starting at index `i` of `s`, the same number of code units as
    // `codeLength!char(c)`. `line` defaults to the caller's line so that a
    // failure points at the individual test case.
    static void test(string s, dchar c, size_t i = 0, size_t line = __LINE__)
    {
        immutable expected = codeLength!char(c);

        // Throws an AssertError carrying the caller's line when `ok` is false.
        void check(bool ok, string what)
        {
            enforce(ok, new AssertError(format("Unit test failure %s: %s", what, s), __FILE__, line));
        }

        check(stride(s, i) == expected, "string");
        check(stride(RandomCU!char(s), i) == expected, "range");

        // The range's length must be unchanged after the call.
        auto refRandom = new RefRandomCU!char(s);
        immutable randLen = refRandom.length;
        check(stride(refRandom, i) == expected, "rand ref range");
        check(refRandom.length == randLen, "rand ref range length");

        // The index-less overloads can only be exercised for index 0.
        if (i == 0)
        {
            check(stride(s) == expected, "string 0");
            check(stride(InputCU!char(s)) == expected, "range 0");

            auto refBidir = new RefBidirCU!char(s);
            immutable bidirLen = refBidir.length;
            check(stride(refBidir) == expected, "bidir ref range code length");
            check(refBidir.length == bidirLen, "bidir ref range length");
        }
    }

    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'a');
    // Indices are UTF-8 code unit offsets: "hello" = 5, U+10143 = 4, U+0100 = 2.
    test("hello\U00010143\u0100\U00010143", 'h', 0);
    test("hello\U00010143\u0100\U00010143", 'e', 1);
    test("hello\U00010143\u0100\U00010143", 'l', 2);
    test("hello\U00010143\u0100\U00010143", 'l', 3);
    test("hello\U00010143\u0100\U00010143", 'o', 4);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
    test("hello\U00010143\u0100\U00010143", '\u0100', 9);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 11);

    // stride on built-in strings must be inferred @safe and pure.
    foreach (S; AliasSeq!(char[], const char[], string))
    {
        enum str = to!S("hello world");
        static assert(isSafe!({ stride(str, 0); }));
        static assert(isSafe!({ stride(str);    }));
        static assert((functionAttributes!({ stride(str, 0); }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({ stride(str);    }) & FunctionAttribute.pure_) != 0);
    }
    });
}
431 
@safe unittest // invalid start bytes
{
    import std.exception : assertThrown;

    // None of these bytes may begin a UTF-8 sequence.
    immutable char[] badLeadBytes = [
        0b1111_1000, // indicating a sequence length of 5
        0b1111_1100, // 6
        0b1111_1110, // 7
        0b1111_1111, // 8
        0b1000_0000, // continuation byte
    ];

    foreach (lead; badLeadBytes)
    {
        assertThrown!UTFException(stride([lead]));
    }
}
445 
446 /// Ditto
uint stride(S)(auto ref S str, size_t index)
if (is(S : const wchar[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable wchar)))
{
    // The bounds check is only compiled in when `str` exposes a length that
    // converts to ulong.
    static if (is(typeof(str.length) : ulong))
        assert(index < str.length, "Past the end of the UTF-16 sequence");

    immutable uint unit = str[index];

    // A unit in 0xD800 .. 0xDBFF starts a two-unit sequence.
    immutable bool startsPair = unit >= 0xD800 && unit <= 0xDBFF;
    return startsPair ? 2 : 1;
}
456 
457 /// Ditto
uint stride(S)(auto ref S str) @safe pure
if (is(S : const wchar[]))
{
    // Delegate to the indexed overload at position 0.
    enum size_t firstUnit = 0;
    return stride(str, firstUnit);
}
463 
464 /// Ditto
uint stride(S)(auto ref S str)
if (isInputRange!S && is(immutable ElementType!S == immutable wchar) &&
    !is(S : const wchar[]))
{
    assert(!str.empty, "UTF-16 sequence is empty");

    immutable uint unit = str.front;

    // A unit in 0xD800 .. 0xDBFF starts a two-unit sequence.
    immutable bool startsPair = unit >= 0xD800 && unit <= 0xDBFF;
    return startsPair ? 2 : 1;
}
473 
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;

    // Verifies that every UTF-16 `stride` overload reports, for the sequence
    // starting at index `i` of `s`, the same number of code units as
    // `codeLength!wchar(c)`. `line` defaults to the caller's line so that a
    // failure points at the individual test case.
    static void test(wstring s, dchar c, size_t i = 0, size_t line = __LINE__)
    {
        immutable expected = codeLength!wchar(c);

        // Throws an AssertError carrying the caller's line when `ok` is false.
        void check(bool ok, string what)
        {
            enforce(ok, new AssertError(format("Unit test failure %s: %s", what, s), __FILE__, line));
        }

        check(stride(s, i) == expected, "string");
        check(stride(RandomCU!wchar(s), i) == expected, "range");

        // The range's length must be unchanged after the call.
        auto refRandom = new RefRandomCU!wchar(s);
        immutable randLen = refRandom.length;
        check(stride(refRandom, i) == expected, "rand ref range");
        check(refRandom.length == randLen, "rand ref range length");

        // The index-less overloads can only be exercised for index 0.
        if (i == 0)
        {
            check(stride(s) == expected, "string 0");
            check(stride(InputCU!wchar(s)) == expected, "range 0");

            auto refBidir = new RefBidirCU!wchar(s);
            immutable bidirLen = refBidir.length;
            check(stride(refBidir) == expected, "bidir ref range code length");
            check(refBidir.length == bidirLen, "bidir ref range length");
        }
    }

    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'a');
    // Indices are UTF-16 code unit offsets: "hello" = 5, U+10143 = 2, U+0100 = 1.
    test("hello\U00010143\u0100\U00010143", 'h', 0);
    test("hello\U00010143\u0100\U00010143", 'e', 1);
    test("hello\U00010143\u0100\U00010143", 'l', 2);
    test("hello\U00010143\u0100\U00010143", 'l', 3);
    test("hello\U00010143\u0100\U00010143", 'o', 4);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
    test("hello\U00010143\u0100\U00010143", '\u0100', 7);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 8);

    // stride on built-in wide strings must be inferred @safe and pure.
    foreach (S; AliasSeq!(wchar[], const wchar[], wstring))
    {
        enum str = to!S("hello world");
        static assert(isSafe!(() => stride(str, 0)));
        static assert(isSafe!(() => stride(str)   ));
        static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!(() => stride(str)   ) & FunctionAttribute.pure_) != 0);
    }
    });
}
541 
542 /// Ditto
uint stride(S)(auto ref S str, size_t index = 0)
if (is(S : const dchar[]) ||
    (isInputRange!S && is(immutable ElementEncodingType!S == immutable dchar)))
{
    // Sanity-check the position: by length when one is available, otherwise
    // by emptiness.
    enum bool lengthIsKnown = is(typeof(str.length) : ulong);

    static if (!lengthIsKnown)
        assert(!str.empty, "UTF-32 sequence is empty.");
    else
        assert(index < str.length, "Past the end of the UTF-32 sequence");

    // The UTF-32 overload always reports a single code unit.
    return 1;
}
553 
554 ///
@safe unittest
{
    // For string (UTF-8) input the result is the number of bytes in the
    // first encoded character.
    assert("a".stride == 1);
    assert("λ".stride == 2);
    assert("𐐷".stride == 4);

    // With an index, the sequence starting at that code unit is measured.
    string mixed = "aλ";
    assert(mixed.stride == 1);
    assert(mixed.stride(1) == 2);
}
563 
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;

    // Verifies that every UTF-32 `stride` overload reports, for the sequence
    // starting at index `i` of `s`, the same number of code units as
    // `codeLength!dchar(c)`. `line` defaults to the caller's line so that a
    // failure points at the individual test case.
    static void test(dstring s, dchar c, size_t i = 0, size_t line = __LINE__)
    {
        immutable expected = codeLength!dchar(c);

        // Throws an AssertError carrying the caller's line when `ok` is false.
        void check(bool ok, string what)
        {
            enforce(ok, new AssertError(format("Unit test failure %s: %s", what, s), __FILE__, line));
        }

        check(stride(s, i) == expected, "string");
        check(stride(RandomCU!dchar(s), i) == expected, "range");

        // The range's length must be unchanged after the call.
        auto refRandom = new RefRandomCU!dchar(s);
        immutable randLen = refRandom.length;
        check(stride(refRandom, i) == expected, "rand ref range");
        check(refRandom.length == randLen, "rand ref range length");

        // The index-less overloads can only be exercised for index 0.
        if (i == 0)
        {
            check(stride(s) == expected, "string 0");
            check(stride(InputCU!dchar(s)) == expected, "range 0");

            auto refBidir = new RefBidirCU!dchar(s);
            immutable bidirLen = refBidir.length;
            check(stride(refBidir) == expected, "bidir ref range code length");
            check(refBidir.length == bidirLen, "bidir ref range length");
        }
    }

    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'a');
    // In UTF-32 every character is one code unit, so indices are character positions.
    test("hello\U00010143\u0100\U00010143", 'h', 0);
    test("hello\U00010143\u0100\U00010143", 'e', 1);
    test("hello\U00010143\u0100\U00010143", 'l', 2);
    test("hello\U00010143\u0100\U00010143", 'l', 3);
    test("hello\U00010143\u0100\U00010143", 'o', 4);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
    test("hello\U00010143\u0100\U00010143", '\u0100', 6);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 7);

    // stride on built-in dstrings must be inferred @safe and pure.
    foreach (S; AliasSeq!(dchar[], const dchar[], dstring))
    {
        enum str = to!S("hello world");
        static assert(isSafe!(() => stride(str, 0)));
        static assert(isSafe!(() => stride(str)   ));
        static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!(() => stride(str)   ) & FunctionAttribute.pure_) != 0);
    }
    });
}
631 
// Derives the UTF-8 sequence length from a non-ASCII lead byte `c`; `index`
// is used only for the error report. Throws UTFException unless the byte
// starts with exactly two, three or four set bits.
private uint strideImpl(char c, size_t index) @trusted pure
in { assert(c & 0x80); }
do
{
    import core.bitop : bsr;

    // bsr on the inverted byte gives the position of the highest clear bit;
    // subtracting from 7 yields the number of leading set bits.
    immutable leadingOnes = 7 - bsr((~uint(c)) & 0xFF);

    // 0xFF has no clear bit, so the bsr argument is 0 for it; it is rejected
    // explicitly before the computed value is consulted.
    immutable bool wellFormed = c != 0xFF && leadingOnes >= 2 && leadingOnes <= 4;
    if (!wellFormed)
        throw new UTFException("Invalid UTF-8 sequence", index);

    return leadingOnes;
}
642 
643 /++
644     Calculate the length of the UTF sequence ending one code unit before
645     `index` in `str`.
646 
647     Params:
648         str = bidirectional range of UTF code units. Must be random access if
649         `index` is passed
650         index = index one past end of UTF sequence (default: `str.length`)
651 
652     Returns:
653         The number of code units in the UTF sequence. For UTF-8, this is a
654         value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
655         For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.
656 
657     Throws:
658         May throw a `UTFException` if `str[index]` is not one past the
659         end of a valid UTF sequence.
660 
661     Note:
662         `strideBack` will only analyze the element at $(D str[index - 1]).
663         It will not fully verify the validity of the UTF sequence, nor
664         even verify the presence of the sequence: it will not actually
665         guarantee that $(D strideBack(str, index) <= index).
666   +/
uint strideBack(S)(auto ref S str, size_t index)
if (is(S : const char[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable char)))
{
    // A continuation byte has the bit pattern 10xx_xxxx.
    enum topTwoBits = 0b1100_0000;
    enum continuationTag = 0b1000_0000;

    static if (is(typeof(str.length) : ulong))
        assert(index <= str.length, "Past the end of the UTF-8 sequence");
    assert(index > 0, "Not the end of the UTF-8 sequence");

    // The unit just before `index` is not a continuation byte: length 1.
    if ((str[index - 1] & topTwoBits) != continuationTag)
        return 1;

    if (index < 4)
    {
        // Fewer than four units precede `index`, so each step back must be
        // bounds-checked and at most three units can be inspected.
        static foreach (back; 2 .. 4)
        {
            if (index >= back && (str[index - back] & topTwoBits) != continuationTag)
                return back;
        }
    }
    else
    {
        // Common case: at least four units precede `index`, so no per-step
        // bounds check is needed.
        static foreach (back; 2 .. 5)
        {
            if ((str[index - back] & topTwoBits) != continuationTag)
                return back;
        }
    }

    // Only continuation bytes were found within reach.
    throw new UTFException("Not the end of the UTF sequence", index);
}
696 
697 /// Ditto
uint strideBack(S)(auto ref S str)
if (is(S : const char[]) ||
    (isRandomAccessRange!S && hasLength!S && is(immutable ElementType!S == immutable char)))
{
    // Measure the sequence that ends at the very end of `str` by handing the
    // full length to the indexed overload.
    immutable onePastEnd = str.length;
    return strideBack(str, onePastEnd);
}
704 
705 /// Ditto
uint strideBack(S)(auto ref S str)
if (isBidirectionalRange!S && is(immutable ElementType!S == immutable char) && !isRandomAccessRange!S)
{
    assert(!str.empty, "Past the end of the UTF-8 sequence");

    // Walk a saved copy backwards so that `str` itself is not consumed.
    auto cursor = str.save;

    // Inspect at most four units from the back; the first one that is not a
    // continuation byte (10xx_xxxx) marks the start of the sequence.
    for (uint units = 1; units <= 4; ++units)
    {
        immutable bool isContinuation = (cursor.back & 0b1100_0000) == 0b1000_0000;
        if (!isContinuation)
            return units;

        cursor.popBack();
        if (cursor.empty)
            break;
    }

    throw new UTFException("The last code unit is not the end of the UTF-8 sequence");
}
721 
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;

    // Verifies that every UTF-8 `strideBack` overload reports, for the
    // sequence ending just before index `i` of `s`, the same number of code
    // units as `codeLength!char(c)`. `i == size_t.max` means "end of string"
    // and additionally exercises the index-less overloads. `line` defaults to
    // the caller's line so that a failure points at the individual test case.
    static void test(string s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
    {
        immutable expected = codeLength!char(c);
        immutable end = i == size_t.max ? s.length : i;

        // Throws an AssertError carrying the caller's line when `ok` is false.
        void check(bool ok, string what)
        {
            enforce(ok, new AssertError(format("Unit test failure %s: %s", what, s), __FILE__, line));
        }

        check(strideBack(s, end) == expected, "string");
        check(strideBack(RandomCU!char(s), end) == expected, "range");

        // The range's length must be unchanged after the call.
        auto refRandom = new RefRandomCU!char(s);
        immutable randLen = refRandom.length;
        check(strideBack(refRandom, end) == expected, "rand ref range");
        check(refRandom.length == randLen, "rand ref range length");

        if (i == size_t.max)
        {
            check(strideBack(s) == expected, "string code length");
            check(strideBack(BidirCU!char(s)) == expected, "range code length");

            auto refBidir = new RefBidirCU!char(s);
            immutable bidirLen = refBidir.length;
            check(strideBack(refBidir) == expected, "bidir ref range code length");
            check(refBidir.length == bidirLen, "bidir ref range length");
        }
    }

    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'f');
    // Indices are UTF-8 code unit offsets: U+10143 = 4, U+0100 = 2, "hello" = 5.
    test("\U00010143\u0100\U00010143hello", 'o', 15);
    test("\U00010143\u0100\U00010143hello", 'l', 14);
    test("\U00010143\u0100\U00010143hello", 'l', 13);
    test("\U00010143\u0100\U00010143hello", 'e', 12);
    test("\U00010143\u0100\U00010143hello", 'h', 11);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 10);
    test("\U00010143\u0100\U00010143hello", '\u0100', 6);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 4);

    // strideBack on built-in strings must be inferred @safe and pure.
    foreach (S; AliasSeq!(char[], const char[], string))
    {
        enum str = to!S("hello world");
        static assert(isSafe!({ strideBack(str, 0); }));
        static assert(isSafe!({ strideBack(str);    }));
        static assert((functionAttributes!({ strideBack(str, 0); }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({ strideBack(str);    }) & FunctionAttribute.pure_) != 0);
    }
    });
}
789 
790 //UTF-16 is self synchronizing: The length of strideBack can be found from
791 //the value of a single wchar
792 /// Ditto
uint strideBack(S)(auto ref S str, size_t index)
if (is(S : const wchar[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable wchar)))
{
    // The upper-bound check is only compiled in when `str` exposes a length
    // that converts to ulong.
    static if (is(typeof(str.length) : ulong))
        assert(index <= str.length, "Past the end of the UTF-16 sequence");
    assert(index > 0, "Not the end of a UTF-16 sequence");

    immutable last = str[index - 1];

    // A unit in 0xDC00 .. 0xDFFF ends a two-unit sequence.
    immutable bool endsPair = last >= 0xDC00 && last < 0xE000;
    return endsPair ? 2 : 1;
}
804 
805 /// Ditto
uint strideBack(S)(auto ref S str)
if (is(S : const wchar[]) ||
    (isBidirectionalRange!S && is(immutable ElementType!S == immutable wchar)))
{
    assert(!str.empty, "UTF-16 sequence is empty");

    // Arrays are indexed directly; other ranges are read through `back`.
    static if (is(S : const(wchar)[]))
        immutable c2 = str[$ - 1];
    else
        immutable c2 = str.back;

    // Low (trailing) surrogates occupy 0xDC00 .. 0xDFFF inclusive. 0xE000 is
    // an ordinary BMP code point, so the upper bound must be exclusive, as in
    // the indexed overload.
    return 1 + (0xDC00 <= c2 && c2 < 0xE000);
}
819 
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;

    // Verifies that every UTF-16 `strideBack` overload reports, for the
    // sequence ending just before index `i` of `s`, the same number of code
    // units as `codeLength!wchar(c)`. `i == size_t.max` means "end of string"
    // and additionally exercises the index-less overloads. `line` defaults to
    // the caller's line so that a failure points at the individual test case.
    static void test(wstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
    {
        immutable expected = codeLength!wchar(c);
        immutable end = i == size_t.max ? s.length : i;

        // Throws an AssertError carrying the caller's line when `ok` is false.
        void check(bool ok, string what)
        {
            enforce(ok, new AssertError(format("Unit test failure %s: %s", what, s), __FILE__, line));
        }

        check(strideBack(s, end) == expected, "string");
        check(strideBack(RandomCU!wchar(s), end) == expected, "range");

        // The range's length must be unchanged after the call.
        auto refRandom = new RefRandomCU!wchar(s);
        immutable randLen = refRandom.length;
        check(strideBack(refRandom, end) == expected, "rand ref range");
        check(refRandom.length == randLen, "rand ref range length");

        if (i == size_t.max)
        {
            check(strideBack(s) == expected, "string code length");
            check(strideBack(BidirCU!wchar(s)) == expected, "range code length");

            auto refBidir = new RefBidirCU!wchar(s);
            immutable bidirLen = refBidir.length;
            check(strideBack(refBidir) == expected, "bidir ref range code length");
            check(refBidir.length == bidirLen, "bidir ref range length");
        }
    }

    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'f');
    // Indices are UTF-16 code unit offsets: U+10143 = 2, U+0100 = 1, "hello" = 5.
    test("\U00010143\u0100\U00010143hello", 'o', 10);
    test("\U00010143\u0100\U00010143hello", 'l', 9);
    test("\U00010143\u0100\U00010143hello", 'l', 8);
    test("\U00010143\u0100\U00010143hello", 'e', 7);
    test("\U00010143\u0100\U00010143hello", 'h', 6);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 5);
    test("\U00010143\u0100\U00010143hello", '\u0100', 3);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 2);

    // strideBack on built-in wide strings must be inferred @safe and pure.
    foreach (S; AliasSeq!(wchar[], const wchar[], wstring))
    {
        enum str = to!S("hello world");
        static assert(isSafe!(() => strideBack(str, 0)));
        static assert(isSafe!(() => strideBack(str)   ));
        static assert((functionAttributes!(() => strideBack(str, 0)) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!(() => strideBack(str)   ) & FunctionAttribute.pure_) != 0);
    }
    });
}
887 
/// Ditto
uint strideBack(S)(auto ref S str, size_t index)
if (isRandomAccessRange!S && is(immutable ElementEncodingType!S == immutable dchar))
{
    // In UTF-32 every code point is exactly one code unit, so the answer is
    // constant; only the position is sanity-checked. The two conditions are
    // mutually exclusive, so their order does not affect which one fires.
    assert(index > 0, "Not the end of the UTF-32 sequence");
    static if (is(typeof(str.length) : ulong))
        assert(index <= str.length, "Past the end of the UTF-32 sequence");

    enum uint utf32Stride = 1;
    return utf32Stride;
}
897 
/// Ditto
uint strideBack(S)(auto ref S str)
if (isBidirectionalRange!S && is(immutable ElementEncodingType!S == immutable dchar))
{
    // A UTF-32 code point always spans a single code unit.
    enum uint utf32Stride = 1;

    assert(!str.empty, "Empty UTF-32 sequence");
    return utf32Stride;
}
905 
///
@safe unittest
{
    // One UTF-8 code unit for ASCII
    assert(strideBack("a") == 1);

    // Two code units for 'λ', alone or preceded by another character
    assert(strideBack("λ") == 2);
    assert(strideBack("aλ") == 2);

    // Stepping back from index 1 lands on the single-unit 'a'
    assert(strideBack("aλ", 1) == 1);

    // Four code units for a character outside the BMP
    assert(strideBack("𐐷") == 4);
}
915 
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;

    // Checks that stepping back from position `i` of `s` (or from the end of
    // `s` when `i` is left at its default) yields the number of UTF-32 code
    // units needed to encode `c`. `line` lets a failure point at the caller.
    static void test(dstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
    {
        // size_t.max is the "no explicit index" marker: start from the end.
        immutable idx = i == size_t.max ? s.length : i;
        immutable expected = codeLength!dchar(c);

        enforce(strideBack(s, idx) == expected,
                new AssertError(format("Unit test failure string: %s", s), __FILE__, line));

        enforce(strideBack(RandomCU!dchar(s), idx) == expected,
                new AssertError(format("Unit test failure range: %s", s), __FILE__, line));

        // Reference-type ranges must not be consumed by strideBack.
        auto refRandom = new RefRandomCU!dchar(s);
        immutable randLen = refRandom.length;
        enforce(strideBack(refRandom, idx) == expected,
                new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
        enforce(refRandom.length == randLen,
                new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));

        // The index-less overloads are only meaningful when testing from the end.
        if (i == size_t.max)
        {
            enforce(strideBack(s) == expected,
                    new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line));

            enforce(strideBack(BidirCU!dchar(s)) == expected,
                    new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line));

            auto refBidir = new RefBidirCU!dchar(s);
            immutable bidirLen = refBidir.length;
            enforce(strideBack(refBidir) == expected,
                    new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
            enforce(refBidir.length == bidirLen,
                    new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
        }
    }

    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'f');
    // Indices are in UTF-32 code units, i.e. one per code point.
    test("\U00010143\u0100\U00010143hello", 'o', 8);
    test("\U00010143\u0100\U00010143hello", 'l', 7);
    test("\U00010143\u0100\U00010143hello", 'l', 6);
    test("\U00010143\u0100\U00010143hello", 'e', 5);
    test("\U00010143\u0100\U00010143hello", 'h', 4);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 3);
    test("\U00010143\u0100\U00010143hello", '\u0100', 2);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 1);

    // Both overloads must be inferred @safe and pure for every array flavour.
    foreach (S; AliasSeq!(dchar[], const dchar[], dstring))
    {
        enum str = to!S("hello world");
        static assert(isSafe!(() => strideBack(str, 0)));
        static assert(isSafe!(() => strideBack(str)   ));
        static assert((functionAttributes!(() => strideBack(str, 0)) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!(() => strideBack(str)   ) & FunctionAttribute.pure_) != 0);
    }
    });
}
983 
984 
/++
    Converts a code unit index into a code point index.

    `index` must refer to the first code unit of a code point in `str`. The
    result is the number of code points that precede that position, i.e. how
    many code points into the string that code point is.

    Throws:
        $(LREF UTFException) if walking `str` code point by code point steps
        over `index` instead of landing on it.
  +/
size_t toUCSindex(C)(const(C)[] str, size_t index) @safe pure
if (isSomeChar!C)
{
    static if (is(immutable C == immutable dchar))
    {
        // UTF-32: code unit and code point indices coincide.
        return index;
    }
    else
    {
        size_t codePoints = 0;
        size_t pos = 0;

        // Hop from sequence start to sequence start until `index` is reached.
        while (pos < index)
        {
            pos += stride(str, pos);
            ++codePoints;
        }

        // Overshooting means `index` was not on a sequence boundary.
        if (pos > index)
        {
            static if (is(immutable C == immutable char))
                enum msg = "Invalid UTF-8 sequence";
            else
                enum msg = "Invalid UTF-16 sequence";

            throw new UTFException(msg, index);
        }

        return codePoints;
    }
}
1016 
///
@safe unittest
{
    // ASCII only: every encoding uses one code unit per code point
    assert(toUCSindex("hello world", 7) == 7);
    assert(toUCSindex("hello world"w, 7) == 7);
    assert(toUCSindex("hello world"d, 7) == 7);

    // 'é' takes two UTF-8 code units but one UTF-16/UTF-32 code unit
    assert(toUCSindex("Ma Chérie", 7) == 6);
    assert(toUCSindex("Ma Chérie"w, 7) == 7);
    assert(toUCSindex("Ma Chérie"d, 7) == 7);

    // Each of these characters takes three UTF-8 code units
    assert(toUCSindex("さいごの果実 / ミツバチと科学者", 9) == 3);
    assert(toUCSindex("さいごの果実 / ミツバチと科学者"w, 9) == 9);
    assert(toUCSindex("さいごの果実 / ミツバチと科学者"d, 9) == 9);
}
1032 
1033 
/++
    Converts a code point index into a code unit index.

    `n` is how many code points into `str` the wanted code point is; the
    return value is the array index of that code point's first code unit.
  +/
size_t toUTFindex(C)(const(C)[] str, size_t n) @safe pure
if (isSomeChar!C)
{
    static if (is(immutable C == immutable dchar))
    {
        // UTF-32: code point and code unit indices coincide.
        return n;
    }
    else
    {
        // Skip `n` whole code points, accumulating their widths.
        size_t offset = 0;
        foreach (_; 0 .. n)
            offset += stride(str, offset);
        return offset;
    }
}
1056 
///
@safe unittest
{
    // ASCII only: every encoding uses one code unit per code point
    assert(toUTFindex("hello world", 7) == 7);
    assert(toUTFindex("hello world"w, 7) == 7);
    assert(toUTFindex("hello world"d, 7) == 7);

    // 'é' takes two UTF-8 code units but one UTF-16/UTF-32 code unit
    assert(toUTFindex("Ma Chérie", 6) == 7);
    assert(toUTFindex("Ma Chérie"w, 7) == 7);
    assert(toUTFindex("Ma Chérie"d, 7) == 7);

    // Each of these characters takes three UTF-8 code units
    assert(toUTFindex("さいごの果実 / ミツバチと科学者", 3) == 9);
    assert(toUTFindex("さいごの果実 / ミツバチと科学者"w, 9) == 9);
    assert(toUTFindex("さいごの果実 / ミツバチと科学者"d, 9) == 9);
}
1072 
1073 
1074 /* =================== Decode ======================= */
1075 
/++
    Whether or not to replace invalid UTF with $(LREF replacementDchar).

    Passed as the first template argument of the decoding functions in this
    module. Their default is `No.useReplacementDchar`, in which case invalid
    input results in a $(LREF UTFException) instead.
  +/
alias UseReplacementDchar = Flag!"useReplacementDchar";
1078 
// Cuts down on the number of distinct decodeImpl instantiations: every
// dynamic array of a non-shared arithmetic element type is mapped to the
// const slice of the unqualified element type, so mutable, const and
// immutable arrays all share one instantiation. Any other type is passed
// through unchanged.
private template TypeForDecode(T)
{
    import std.traits : isDynamicArray;

    static if (isDynamicArray!T)
    {
        static if (is(T : Elem[], Elem) && __traits(isArithmetic, Elem) && !is(Elem == shared))
            alias TypeForDecode = const(Unqual!Elem)[];
        else
            alias TypeForDecode = T;
    }
    else
        alias TypeForDecode = T;
}
1088 
/++
    Decodes and returns the code point starting at `str[index]`. `index`
    is advanced to one past the decoded code point. If the code point is not
    well-formed, then a `UTFException` is thrown and `index` remains
    unchanged.

    decode will only work with strings and random access ranges of code units
    with length and slicing, whereas $(LREF decodeFront) will work with any
    input range of code units.

    Params:
        useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
        str = input string or indexable Range
        index = starting index into `str`; incremented by number of code units processed

    Returns:
        decoded character

    Throws:
        $(LREF UTFException) if `str[index]` is not the start of a valid UTF
        sequence and useReplacementDchar is `No.useReplacementDchar`
  +/
dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(auto ref S str, ref size_t index)
if (!isSomeString!S &&
    isRandomAccessRange!S && hasSlicing!S && hasLength!S && isSomeChar!(ElementType!S))
in
{
    assert(index < str.length, "Attempted to decode past the end of a string");
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    // Anything at or above the limit needs the full decoder.
    if (str[index] >= codeUnitLimit!S)
        return decodeImpl!(true, useReplacementDchar)(cast(TypeForDecode!S) str, index);

    // Fast path: the code unit is the code point.
    return str[index++];
}
1129 
/// ditto
dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
auto ref S str, ref size_t index) @trusted pure
if (isSomeString!S)
in
{
    assert(index < str.length, "Attempted to decode past the end of a string");
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    // Anything at or above the limit needs the full decoder.
    if (str[index] >= codeUnitLimit!S)
        return decodeImpl!(true, useReplacementDchar)(cast(TypeForDecode!S) str, index);

    // Fast path: the code unit is the code point.
    return str[index++];
}
1149 
///
@safe pure unittest
{
    size_t i;

    // ASCII: one UTF-8 code unit
    assert(decode("a", i) == 'a');
    assert(i == 1);

    // 'å' is two UTF-8 code units ...
    i = 0;
    assert(decode("å", i) == 'å');
    assert(i == 2);

    // ... also when decoding starts in the middle of the string ...
    i = 1;
    assert(decode("aå", i) == 'å');
    assert(i == 3);

    // ... but a single UTF-16 code unit
    i = 0;
    assert(decode("å"w, i) == 'å');
    assert(i == 1);

    // ë as a multi-code point grapheme
    i = 0;
    assert(decode("e\u0308", i) == 'e');
    assert(i == 1);

    // ë as a single code point grapheme
    i = 0;
    assert(decode("ë", i) == 'ë');
    assert(i == 2);

    i = 0;
    assert(decode("ë"w, i) == 'ë');
    assert(i == 1);
}
1172 
/++
    `decodeFront` is a variant of $(LREF decode) which specifically decodes
    the first code point. Unlike $(LREF decode), `decodeFront` accepts any
    $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
    of code units (rather than just a string or random access
    range). It also takes the range by `ref` and pops off the elements as it
    decodes them. If `numCodeUnits` is passed in, it gets set to the number
    of code units which were in the code point which was decoded.

    Params:
        useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
        str = input string or input range of code units
        numCodeUnits = set to number of code units processed

    Returns:
        decoded character

    Throws:
        $(LREF UTFException) if `str.front` is not the start of a valid UTF
        sequence. If an exception is thrown, then there is no guarantee as to
        the number of code units which were popped off, as it depends on the
        type of range being used and how many code units had to be popped off
        before the code point was determined to be invalid.
  +/
dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
ref S str, out size_t numCodeUnits)
if (!isSomeString!S && isInputRange!S && isSomeChar!(ElementType!S))
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    immutable first = str.front;

    if (first >= codeUnitLimit!S)
    {
        enum sliceable = isRandomAccessRange!S && hasSlicing!S && hasLength!S;

        // https://issues.dlang.org/show_bug.cgi?id=14447 forces canIndex to be
        // done outside of decodeImpl, which is undesirable, since not all
        // overloads of decodeImpl need it. So, it should be moved back into
        // decodeImpl once https://issues.dlang.org/show_bug.cgi?id=8521
        // has been fixed.
        enum canIndex = is(S : const char[]) || sliceable;
        immutable decoded = decodeImpl!(canIndex, useReplacementDchar)(cast(TypeForDecode!S) str, numCodeUnits);

        // Sliceable ranges still hold the decoded code units at this point;
        // the other range types were already popped by decodeImpl.
        static if (sliceable)
            str = str[numCodeUnits .. str.length];

        return decoded;
    }

    // Fast path: the front code unit is the code point.
    str.popFront();
    numCodeUnits = 1;
    return first;
}
1235 
/// ditto
dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
ref S str, out size_t numCodeUnits) @trusted pure
if (isSomeString!S)
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    if (str[0] >= codeUnitLimit!S)
    {
        // decodeImpl reports the sequence length through numCodeUnits; the
        // string itself is only shortened afterwards.
        immutable decoded = decodeImpl!(true, useReplacementDchar)(cast(TypeForDecode!S) str, numCodeUnits);
        str = str[numCodeUnits .. $];
        return decoded;
    }

    // Fast path: the first code unit is the code point.
    immutable first = str[0];
    numCodeUnits = 1;
    str = str[1 .. $];
    return first;
}
1264 
/++ Ditto +/
dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str)
if (isInputRange!S && isSomeChar!(ElementType!S))
{
    // The caller is not interested in the count; forward with a throwaway.
    size_t consumed;
    return decodeFront!useReplacementDchar(str, consumed);
}
1272 
///
@safe pure unittest
{
    import std.range.primitives;
    string str = "Hello, World!";

    // The decoded code point is removed from the front of the string
    assert(decodeFront(str) == 'H');
    assert(str == "ello, World!");

    str = "å";
    assert(decodeFront(str) == 'å');
    assert(str.empty);

    // The optional second argument receives the number of code units consumed
    str = "å";
    size_t i;
    assert(decodeFront(str, i) == 'å');
    assert(i == 2);
    assert(str.empty);
}
1286 
/++
    `decodeBack` is a variant of $(LREF decode) which specifically decodes
    the last code point. Unlike $(LREF decode), `decodeBack` accepts any
    bidirectional range of code units (rather than just a string or random access
    range). It also takes the range by `ref` and pops off the elements as it
    decodes them. If `numCodeUnits` is passed in, it gets set to the number
    of code units which were in the code point which was decoded.

    Params:
        useReplacementDchar = if invalid UTF, return `replacementDchar` rather than throwing
        str = input string or bidirectional Range
        numCodeUnits = gives the number of code units processed

    Returns:
        A decoded UTF character.

    Throws:
        $(LREF UTFException) if `str.back` is not the end of a valid UTF
        sequence. If an exception is thrown, the `str` itself remains unchanged,
        but there is no guarantee as to the value of `numCodeUnits` (when passed).
  +/
dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    ref S str, out size_t numCodeUnits)
if (isSomeString!S)
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    if (str[$ - 1] >= codeUnitLimit!S)
    {
        // Locate the start of the last sequence, decode forward from there,
        // and only then shorten the string.
        numCodeUnits = strideBack(str);
        immutable keep = str.length - numCodeUnits;
        size_t start = keep;
        immutable decoded = decodeImpl!(true, useReplacementDchar)(cast(TypeForDecode!S) str, start);
        str = str[0 .. keep];
        return decoded;
    }

    // Fast path: the last code unit is the code point.
    immutable last = str[$ - 1];
    numCodeUnits = 1;
    str = str[0 .. $ - 1];
    return last;
}
1338 
/++ Ditto +/
dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    ref S str, out size_t numCodeUnits)
if (!isSomeString!S && isSomeChar!(ElementType!S) && isBidirectionalRange!S
    && ((isRandomAccessRange!S && hasLength!S) || !isRandomAccessRange!S))
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    if (str.back >= codeUnitLimit!S)
    {
        numCodeUnits = strideBack(str);

        static if (isRandomAccessRange!S)
        {
            // Decode in place starting at the first unit of the last sequence.
            size_t start = str.length - numCodeUnits;
            immutable decoded = decodeImpl!(true, useReplacementDchar)(cast(TypeForDecode!S) str, start);
            str.popBackExactly(numCodeUnits);
            return decoded;
        }
        else
        {
            alias Char = Unqual!(ElementType!S);

            // No indexing available: copy the trailing code units into a
            // small buffer (filled back to front so they end up in order)
            // while consuming a saved copy of the range.
            Char[4] buffer;
            S rest = str.save;
            size_t slot = numCodeUnits;
            while (slot > 0)
            {
                --slot;
                buffer[slot] = rest.back;
                rest.popBack();
            }

            const Char[] codePoint = buffer[0 .. numCodeUnits];
            size_t start = 0;
            immutable decoded = decodeImpl!(true, useReplacementDchar)(
                cast(TypeForDecode!(typeof(codePoint))) codePoint, start);

            // Commit the shortened range only after decoding went through.
            str = rest;
            return decoded;
        }
    }

    // Fast path: the last code unit is the code point.
    immutable last = str.back;
    numCodeUnits = 1;
    str.popBack();
    return last;
}
1390 
/++ Ditto +/
dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str)
if (isSomeString!S
    || (isRandomAccessRange!S && hasLength!S && isSomeChar!(ElementType!S))
    || (!isRandomAccessRange!S && isBidirectionalRange!S && isSomeChar!(ElementType!S)))
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    // The caller is not interested in the count; forward with a throwaway.
    size_t consumed;
    return decodeBack!useReplacementDchar(str, consumed);
}
1409 
///
@system pure unittest
{
    import std.range.primitives;
    string str = "Hello, World!";

    // The decoded code point is removed from the back of the string
    assert(decodeBack(str) == '!');
    assert(str == "Hello, World");

    str = "å";
    assert(decodeBack(str) == 'å');
    assert(str.empty);

    // The optional second argument receives the number of code units consumed
    str = "å";
    size_t i;
    assert(decodeBack(str, i) == 'å');
    assert(i == 2);
    assert(str.empty);
}
1423 
// Threshold used by the decode functions for the code unit type of range
// type S: a code unit strictly below this value is returned by them directly
// as the code point, anything at or above it is handed to decodeImpl.
package template codeUnitLimit(S)
if (isSomeChar!(ElementEncodingType!S))
{
    private alias Unit = immutable ElementEncodingType!S;

    static if (is(Unit == immutable char))
    {
        enum char codeUnitLimit = 0x80;
    }
    else static if (is(Unit == immutable wchar))
    {
        enum wchar codeUnitLimit = 0xD800;
    }
    else
    {
        enum dchar codeUnitLimit = 0xD800;
    }
}
1435 
/*
 * Decodes a single multi-byte UTF-8 sequence. Callers handle plain ASCII
 * themselves and only come here for code units >= 0x80.
 *
 * For strings, this function does its own bounds checking to give a
 * more useful error message when attempting to decode past the end of a string.
 * Subsequently it uses a pointer instead of an array to avoid
 * redundant bounds checking.
 *
 * The three overloads of this operate on chars, wchars, and dchars.
 *
 * Params:
 *      canIndex = if S is indexable
 *      useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
 *      str = input string or Range
 *      index = starting index into str; incremented by number of code units processed.
 *              Left untouched when a UTFException is thrown.
 *
 * Returns:
 *      decoded character
 *
 * Throws:
 *      UTFException on an invalid or truncated sequence, unless
 *      useReplacementDchar is set.
 */
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    auto ref S str, ref size_t index)
if (
    is(S : const char[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable char)))
{
    /* The following encodings are valid, except for the 5 and 6 byte
     * combinations:
     *  0xxxxxxx
     *  110xxxxx 10xxxxxx
     *  1110xxxx 10xxxxxx 10xxxxxx
     *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     */

    /* Dchar bitmask for different numbers of UTF-8 code units.
     */
    alias bitMask = AliasSeq!((1 << 7) - 1, (1 << 11) - 1, (1 << 16) - 1, (1 << 21) - 1);

    static if (is(S : const char[]))
        auto pstr = str.ptr + index;    // this is what makes decodeImpl() @system code
    else static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S)
        auto pstr = str[index .. str.length];
    else
        alias pstr = str;

    // https://issues.dlang.org/show_bug.cgi?id=14447 forces this to be done
    // outside of decodeImpl
    //enum canIndex = is(S : const char[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S);

    static if (canIndex)
    {
        // Number of code units available from `index` onwards.
        immutable length = str.length - index;
        ubyte fst = pstr[0];
    }
    else
    {
        ubyte fst = pstr.front;
        pstr.popFront();
    }

    static if (!useReplacementDchar)
    {
        static if (canIndex)
        {
            // Builds an exception carrying the offending lead byte plus up to
            // three continuation bytes that follow it.
            static UTFException exception(S)(S str, string msg)
            {
                uint[4] sequence = void;
                size_t i;

                do
                {
                    sequence[i] = str[i];
                } while (++i < str.length && i < 4 && (str[i] & 0xC0) == 0x80);

                return new UTFException(msg, i).setSequence(sequence[0 .. i]);
            }
        }

        UTFException invalidUTF()
        {
            static if (canIndex)
               return exception(pstr[0 .. length], "Invalid UTF-8 sequence");
            else
            {
                //We can't include the invalid sequence with input strings without
                //saving each of the code units along the way, and we can't do it with
                //forward ranges without saving the entire range. Both would incur a
                //cost for the decoding of every character just to provide a better
                //error message for the (hopefully) rare case when an invalid UTF-8
                //sequence is encountered, so we don't bother trying to include the
                //invalid sequence here, unlike with strings and sliceable ranges.
               return new UTFException("Invalid UTF-8 sequence");
            }
        }

        UTFException outOfBounds()
        {
            static if (canIndex)
               return exception(pstr[0 .. length], "Attempted to decode past the end of a string");
            else
               return new UTFException("Attempted to decode past the end of a string");
        }
    }

    if ((fst & 0b1100_0000) != 0b1100_0000)
    {
        static if (useReplacementDchar)
        {
            ++index;            // always consume bad input to avoid infinite loops
            return replacementDchar;
        }
        else
            throw invalidUTF(); // starter must have at least 2 first bits set
    }
    ubyte tmp = void;
    dchar d = fst; // upper control bits are masked out later
    // Each left shift exposes the next length bit of the lead byte in bit 7.
    fst <<= 1;

    // i is the number of continuation bytes consumed so far plus one; the
    // loop is unrolled at compile time.
    foreach (i; AliasSeq!(1, 2, 3))
    {

        static if (canIndex)
        {
            if (i == length)
            {
                static if (useReplacementDchar)
                {
                    index += i;
                    return replacementDchar;
                }
                else
                    throw outOfBounds();
            }
        }
        else
        {
            if (pstr.empty)
            {
                static if (useReplacementDchar)
                {
                    index += i;
                    return replacementDchar;
                }
                else
                    throw outOfBounds();
            }
        }

        static if (canIndex)
            tmp = pstr[i];
        else
        {
            tmp = pstr.front;
            pstr.popFront();
        }

        // Continuation bytes must look like 10xxxxxx.
        if ((tmp & 0xC0) != 0x80)
        {
            static if (useReplacementDchar)
            {
                index += i + 1;
                return replacementDchar;
            }
            else
                throw invalidUTF();
        }

        d = (d << 6) | (tmp & 0x3F);
        fst <<= 1;

        if (!(fst & 0x80)) // no more bytes
        {
            d &= bitMask[i]; // mask out control bits

            // overlong, could have been encoded with i bytes
            if ((d & ~bitMask[i - 1]) == 0)
            {
                static if (useReplacementDchar)
                {
                    index += i + 1;
                    return replacementDchar;
                }
                else
                    throw invalidUTF();
            }

            // check for surrogates only needed for 3 bytes
            static if (i == 2)
            {
                if (!isValidDchar(d))
                {
                    static if (useReplacementDchar)
                    {
                        index += i + 1;
                        return replacementDchar;
                    }
                    else
                        throw invalidUTF();
                }
            }

            // Only 4-byte sequences can encode values above dchar.max.
            static if (i == 3)
            {
                if (d > dchar.max)
                {
                    static if (useReplacementDchar)
                        d = replacementDchar;
                    else
                        throw invalidUTF();
                }
            }

            // Advance only once the code point has passed every check, so
            // that `index` is left unchanged whenever an exception is thrown.
            index += i + 1;
            return d;
        }
    }

    // The lead byte announced more than four code units (5/6 byte forms).
    static if (useReplacementDchar)
    {
        index += 4;             // read 4 chars by now
        return replacementDchar;
    }
    else
        throw invalidUTF();
}
1658 
@safe pure @nogc nothrow
unittest
{
    // Tests for the useReplacementDchar == yes path, driven through a bare
    // input range so that the non-indexing branch of decodeImpl is used.

    static struct CharInput
    {
        string data;
        size_t pos;

      @safe pure @nogc nothrow:
        this(string data) { this.data = data; }
        @property bool empty() { return pos == data.length; }
        @property char front() { return data[pos]; }
        void popFront() { ++pos; }
    }

    foreach (bad; invalidUTFstrings!char())
    {
        auto input = CharInput(bad);
        size_t consumed;
        immutable dchar decoded = decodeImpl!(false, Yes.useReplacementDchar)(input, consumed);

        assert(decoded == replacementDchar);
        // At least one code unit is consumed, never more than exist.
        assert(consumed >= 1);
        assert(consumed <= bad.length);
    }
}
1684 
/*
 * UTF-16 overload of decodeImpl. Callers deal with code units below 0xD800
 * themselves, so this only sees surrogates and the code units above them.
 *
 * Params:
 *      canIndex = if S is indexable
 *      useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
 *      str = input string or Range
 *      index = starting index into str; incremented by number of code units processed
 *
 * Returns:
 *      decoded character
 */
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)
(auto ref S str, ref size_t index)
if (is(S : const wchar[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable wchar)))
{
    // `units` views the code units from `index` onwards: a raw pointer for
    // arrays, a slice for sliceable ranges, the range itself otherwise.
    static if (is(S : const wchar[]))
        auto units = str.ptr + index;
    else static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S)
        auto units = str[index .. str.length];
    else
        alias units = str;

    // https://issues.dlang.org/show_bug.cgi?id=14447 forces this to be done
    // outside of decodeImpl
    //enum canIndex = is(S : const wchar[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S);

    static if (canIndex)
    {
        immutable remaining = str.length - index;
        uint value = units[0];
    }
    else
    {
        uint value = units.front;
        units.popFront();
    }

    static if (!useReplacementDchar)
    {
        // Attaches the first code unit to the exception when it is still
        // reachable by index.
        UTFException failure(string msg)
        {
            static if (canIndex)
                return new UTFException(msg).setSequence(units[0]);
            else
                return new UTFException(msg);
        }
    }

    // The < case must be taken care of before decodeImpl is called.
    assert(value >= 0xD800);

    // Number of code units to advance `index` by on the regular exit path.
    size_t consumed = 1;

    if (value <= 0xDBFF)
    {
        // High surrogate: a low surrogate has to follow.
        static if (canIndex)
            immutable truncated = remaining == 1;
        else
            immutable truncated = units.empty;

        if (truncated)
        {
            static if (useReplacementDchar)
            {
                ++index;
                return replacementDchar;
            }
            else
                throw failure("surrogate UTF-16 high value past end of string");
        }

        static if (canIndex)
            immutable uint low = units[1];
        else
        {
            immutable uint low = units.front;
            units.popFront();
        }

        if (low >= 0xDC00 && low <= 0xDFFF)
            value = ((value - 0xD7C0) << 10) + (low - 0xDC00);
        else
        {
            static if (useReplacementDchar)
                value = replacementDchar;
            else
                throw failure("surrogate UTF-16 low value out of range");
        }

        // Both code units of the pair are accounted for.
        consumed = 2;
    }
    else if (value >= 0xDC00 && value <= 0xDFFF)
    {
        // Low surrogate without a preceding high surrogate.
        static if (useReplacementDchar)
            value = replacementDchar;
        else
            throw failure("unpaired surrogate UTF-16 value");
    }

    index += consumed;

    // Note: u+FFFE and u+FFFF are specifically permitted by the
    // Unicode standard for application internal use (see isValidDchar)

    return cast(dchar) value;
}
1776 
@safe pure @nogc nothrow
unittest
{
    // Exercises the Yes.useReplacementDchar path of the UTF-16 decodeImpl
    // through a minimal input range that is read with front/popFront.

    static struct Units
    {
      @safe pure @nogc nothrow:
        this(wstring data) { this.data = data; }
        @property bool empty() { return pos == data.length; }
        @property wchar front() { return data[pos]; }
        void popFront() { ++pos; }
        size_t pos;
        wstring data;
    }

    foreach (bad; invalidUTFstrings!wchar())
    {
        auto input = Units(bad);
        size_t consumed;
        immutable decoded = decodeImpl!(false, Yes.useReplacementDchar)(input, consumed);

        // Malformed input must yield the replacement character and report
        // between one code unit and the whole input as consumed.
        assert(decoded == replacementDchar);
        assert(consumed >= 1 && consumed <= bad.length);
    }
}
1802 
// Decodes a single code point from UTF-32 input.
//
// Arrays and random-access ranges are read at `str[index]` and left untouched;
// any other input range is read with `front` and popped once the value has
// been accepted. In both cases `index` is incremented by one on success.
//
// An invalid value (as judged by isValidDchar) is replaced by
// `replacementDchar` when `useReplacementDchar` is set; otherwise a
// `UTFException` is thrown before `index` or the range is modified.
//
// `canIndex` is not consulted by this overload.
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    auto ref S str, ref size_t index)
if (is(S : const dchar[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable dchar)))
{
    static if (is(S : const dchar[]))
        auto units = str.ptr;
    else
        alias units = str;

    static if (is(S : const dchar[]) || isRandomAccessRange!S)
        dchar cp = units[index];
    else
        dchar cp = units.front;

    if (!isValidDchar(cp))
    {
        static if (useReplacementDchar)
            cp = replacementDchar;
        else
            throw new UTFException("Invalid UTF-32 value").setSequence(cp);
    }

    ++index;

    // Only the front/popFront path consumes from the range.
    static if (!(is(S : const dchar[]) || isRandomAccessRange!S))
        units.popFront();

    return cp;
}
1840 
@safe pure @nogc nothrow
unittest
{
    // Exercises the Yes.useReplacementDchar path of the UTF-32 decodeImpl
    // through a minimal input range that is read with front/popFront.

    static struct Units
    {
      @safe pure @nogc nothrow:
        this(dstring data) { this.data = data; }
        @property bool empty() { return pos == data.length; }
        @property dchar front() { return data[pos]; }
        void popFront() { ++pos; }
        size_t pos;
        dstring data;
    }

    foreach (bad; invalidUTFstrings!dchar())
    {
        auto input = Units(bad);
        size_t consumed;
        immutable decoded = decodeImpl!(false, Yes.useReplacementDchar)(input, consumed);

        // Malformed input must yield the replacement character and report
        // between one code unit and the whole input as consumed.
        assert(decoded == replacementDchar);
        assert(consumed >= 1 && consumed <= bad.length);
    }
}
1866 
1867 
// Unit-test helper for `decode`.
//
// For random-access ranges that are not narrow strings, decodes at `index`
// and checks that the result is `expectedChar`, that `index` was advanced to
// `expectedIndex`, and (when the range has a length) that the length did not
// change. Other range types are accepted but not checked.
//
// Failures are reported as AssertError carrying the caller's `line`.
version (StdUnittest) private void testDecode(R)(R range,
                                             size_t index,
                                             dchar expectedChar,
                                             size_t expectedIndex,
                                             size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.exception : enforce;
    import std.string : format;
    import std.traits : isNarrowString;

    static if (hasLength!R)
        immutable lenBefore = range.length;

    static if (isRandomAccessRange!R && !isNarrowString!R)
    {
        immutable result = decode(range, index);
        enforce(result == expectedChar,
                new AssertError(format("decode: Wrong character: %s", result), __FILE__, line));
        enforce(index == expectedIndex,
                new AssertError(format("decode: Wrong index: %s", index), __FILE__, line));
        static if (hasLength!R)
        {
            enforce(range.length == lenBefore,
                    new AssertError(format("decode: length changed: %s", range.length), __FILE__, line));
        }
    }
}
1898 
// Unit-test helper for `decodeFront`.
//
// Decodes one code point from the front of `range` (which is modified through
// the `ref` parameter) and checks that the result is `expectedChar`, that the
// reported number of code units is `expectedNumCodeUnits`, and (when the range
// has a length) that the length shrank by exactly that many code units.
//
// Failures are reported as AssertError carrying the caller's `line`.
version (StdUnittest) private void testDecodeFront(R)(ref R range,
                                                  dchar expectedChar,
                                                  size_t expectedNumCodeUnits,
                                                  size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.exception : enforce;
    import std.string : format;

    static if (hasLength!R)
        immutable lenBefore = range.length;

    size_t numCodeUnits;
    immutable result = decodeFront(range, numCodeUnits);
    enforce(result == expectedChar,
            new AssertError(format("decodeFront: Wrong character: %s", result), __FILE__, line));
    enforce(numCodeUnits == expectedNumCodeUnits,
            new AssertError(format("decodeFront: Wrong numCodeUnits: %s", numCodeUnits), __FILE__, line));

    static if (hasLength!R)
    {
        enforce(range.length == lenBefore - numCodeUnits,
                new AssertError(format("decodeFront: wrong length: %s", range.length), __FILE__, line));
    }
}
1924 
// Unit-test helper for `decodeBack`.
//
// Decodes one code point from the back of `range` (which is modified through
// the `ref` parameter) and checks that the result is `expectedChar`, that the
// reported number of code units is `expectedNumCodeUnits`, and (when the range
// has a length) that the length shrank by exactly that many code units.
//
// Does nothing for ranges that are not bidirectional.
// Failures are reported as AssertError carrying the caller's `line`.
version (StdUnittest) private void testDecodeBack(R)(ref R range,
                                                 dchar expectedChar,
                                                 size_t expectedNumCodeUnits,
                                                 size_t line = __LINE__)
{
    // This condition is to allow unit testing all `decode` functions together
    static if (!isBidirectionalRange!R)
        return;
    else
    {
        import core.exception : AssertError;
        import std.exception : enforce;
        import std.string : format;

        static if (hasLength!R)
            immutable lenBefore = range.length;

        size_t numCodeUnits;
        immutable result = decodeBack(range, numCodeUnits);
        enforce(result == expectedChar,
                new AssertError(format("decodeBack: Wrong character: %s", result), __FILE__, line));
        enforce(numCodeUnits == expectedNumCodeUnits,
                new AssertError(format("decodeBack: Wrong numCodeUnits: %s", numCodeUnits), __FILE__, line));

        static if (hasLength!R)
        {
            enforce(range.length == lenBefore - numCodeUnits,
                    new AssertError(format("decodeBack: wrong length: %s", range.length), __FILE__, line));
        }
    }
}
1956 
// Unit-test helper that runs the decode, decodeBack and decodeFront checks on
// a range expected to hold a single code point: `expectedChar`, occupying
// `expectedIndex` code units.
version (StdUnittest) private void testAllDecode(R)(
    R range, dchar expectedChar, size_t expectedIndex, size_t line = __LINE__)
{
    // Indexed decoding starting at the first code unit.
    testDecode(range, 0, expectedChar, expectedIndex, line);

    // Decoding from the back works on a saved copy, taken before the
    // decodeFront check below modifies `range`.
    static if (isBidirectionalRange!R)
    {
        auto fromBack = range.save;
        testDecodeBack(fromBack, expectedChar, expectedIndex, line);
    }

    testDecodeFront(range, expectedChar, expectedIndex, line);
}
1970 
// Unit-test helper: verifies that decoding malformed input fails cleanly.
//
// For random-access ranges, `decode(range, index)` must throw a UTFException
// while leaving `index` and (when available) the range's length unchanged.
// When `index` is 0, `decodeFront` must throw a UTFException as well.
//
// Failures are reported as AssertError carrying the caller's `line`.
version (StdUnittest) private void testBadDecode(R)(R range, size_t index, size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.exception : assertThrown, enforce;
    import std.string : format;

    immutable initialIndex = index;

    static if (hasLength!R)
        immutable lenBefore = range.length;

    static if (isRandomAccessRange!R)
    {
        assertThrown!UTFException(decode(range, index), null, __FILE__, line);
        enforce(index == initialIndex,
                new AssertError(format("decode: Wrong index: %s", index), __FILE__, line));
        static if (hasLength!R)
        {
            // The format string needs a %s for the length argument; without
            // it, building the message fails before the AssertError exists.
            enforce(range.length == lenBefore,
                    new AssertError(format("decode: length changed: %s", range.length), __FILE__, line));
        }
    }

    if (initialIndex == 0)
        assertThrown!UTFException(decodeFront(range, index), null, __FILE__, line);
}
1997 
// Unit-test helper: verifies that decoding malformed input from the back
// fails cleanly.
//
// For random-access ranges, `decodeBack(range)` must throw a UTFException
// while leaving the range's length (when available) unchanged.
//
// Does nothing for ranges that are not bidirectional.
// Failures are reported as AssertError carrying the caller's `line`.
version (StdUnittest) private void testBadDecodeBack(R)(R range, size_t line = __LINE__)
{
    // This condition is to allow unit testing all `decode` functions together
    static if (!isBidirectionalRange!R)
        return;
    else
    {
        import core.exception : AssertError;
        import std.exception : assertThrown, enforce;
        import std.string : format;

        static if (hasLength!R)
            immutable lenBefore = range.length;

        static if (isRandomAccessRange!R)
        {
            assertThrown!UTFException(decodeBack(range), null, __FILE__, line);
            static if (hasLength!R)
            {
                // The format string needs a %s for the length argument;
                // without it, building the message fails before the
                // AssertError exists.
                enforce(range.length == lenBefore,
                        new AssertError(format("decodeBack: length changed: %s", range.length), __FILE__, line));
            }
        }
    }
}
2023 
// Decoding of UTF-8 input through strings and the test range wrappers, both
// at run time and in CTFE.
@system unittest
{
    import std.conv : to;
    import std.exception;

    assertCTFEable!(
    {
    foreach (S; AliasSeq!(to!string, InputCU!char, RandomCU!char,
                          (string s) => new RefBidirCU!char(s),
                          (string s) => new RefRandomCU!char(s)))
    {
        // ASCII: one code unit per code point.
        {
            auto range = S("abcd");
            testDecode(range, 0, 'a', 1);
            testDecode(range, 1, 'b', 2);
            testDecodeFront(range, 'a', 1);
            testDecodeFront(range, 'b', 1);
            assert(decodeFront(range) == 'c');
            assert(decodeFront(range) == 'd');
        }

        // Three code units per code point.
        {
            auto range = S("ウェブサイト");
            testDecode(range, 0, 'ウ', 3);
            testDecode(range, 3, 'ェ', 6);
            testDecodeFront(range, 'ウ', 3);
            testDecodeFront(range, 'ェ', 3);
            assert(decodeFront(range) == 'ブ');
            assert(decodeFront(range) == 'サ');
        }

        {
            auto range = S("abcd");
            testDecodeBack(range, 'd', 1);
            testDecodeBack(range, 'c', 1);
            testDecodeBack(range, 'b', 1);
            testDecodeBack(range, 'a', 1);
        }

        {
            auto range = S("ウェブサイト");
            testDecodeBack(range, 'ト', 3);
            testDecodeBack(range, 'イ', 3);
            testDecodeBack(range, 'サ', 3);
            testDecodeBack(range, 'ブ', 3);
        }

        testAllDecode(S("\xC2\xA9"), '\u00A9', 2);
        testAllDecode(S("\xE2\x89\xA0"), '\u2260', 3);

        // Truncated, overlong and over-length sequences must be rejected.
        foreach (str; ["\xE2\x89", // too short
                       "\xC0\x8A",
                       "\xE0\x80\x8A",
                       "\xF0\x80\x80\x8A",
                       "\xF8\x80\x80\x80\x8A",
                       "\xFC\x80\x80\x80\x80\x8A"])
        {
            testBadDecode(S(str), 0);
            testBadDecode(S(str), 1);
            testBadDecodeBack(S(str));
        }

        // U+FFFE and U+FFFF are accepted (see isValidDchar) and must decode
        // successfully.
        testAllDecode(S("\xEF\xBF\xBE"), cast(dchar) 0xFFFE, 3);
        testAllDecode(S("\xEF\xBF\xBF"), cast(dchar) 0xFFFF, 3);

        // UTF-8 encodings of surrogate code points (U+D800 .. U+DFFF) must be
        // rejected even though each byte has a valid lead/continuation form.
        foreach (str; ["\xED\xA0\x80",
                       "\xED\xAD\xBF",
                       "\xED\xAE\x80",
                       "\xED\xAF\xBF",
                       "\xED\xB0\x80",
                       "\xED\xBE\x80",
                       "\xED\xBF\xBF"])
        {
            testBadDecode(S(str), 0);
            testBadDecodeBack(S(str));
        }
    }
    });
}
2107 
// Decoding of UTF-16 input through wstrings and the test range wrappers, both
// at run time and in CTFE.
@system unittest
{
    import std.exception;
    assertCTFEable!(
    {
    foreach (Make; AliasSeq!((wstring s) => s, InputCU!wchar, RandomCU!wchar,
                             (wstring s) => new RefBidirCU!wchar(s),
                             (wstring s) => new RefRandomCU!wchar(s)))
    {
        // Single code units, the lowest and highest surrogate pairs, and the
        // permitted values U+FFFE / U+FFFF.
        testAllDecode(Make([cast(wchar) 0x1111]), cast(dchar) 0x1111, 1);
        testAllDecode(Make([cast(wchar) 0xD800, cast(wchar) 0xDC00]), cast(dchar) 0x10000, 2);
        testAllDecode(Make([cast(wchar) 0xDBFF, cast(wchar) 0xDFFF]), cast(dchar) 0x10FFFF, 2);
        testAllDecode(Make([cast(wchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
        testAllDecode(Make([cast(wchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);

        // A high surrogate at the end of input, or followed by a code unit
        // that is not a low surrogate, must be rejected.
        testBadDecode(Make([ cast(wchar) 0xD801 ]), 0);
        testBadDecode(Make([ cast(wchar) 0xD800, cast(wchar) 0x1200 ]), 0);

        testBadDecodeBack(Make([ cast(wchar) 0xD801 ]));
        testBadDecodeBack(Make([ cast(wchar) 0x0010, cast(wchar) 0xD800 ]));

        // Forward decoding: one code unit per code point.
        {
            auto r = Make("ウェブサイト");
            testDecode(r, 0, 'ウ', 1);
            testDecode(r, 1, 'ェ', 2);
            testDecodeFront(r, 'ウ', 1);
            testDecodeFront(r, 'ェ', 1);
            assert(decodeFront(r) == 'ブ');
            assert(decodeFront(r) == 'サ');
        }

        // Backward decoding of the last four code points.
        {
            auto r = Make("ウェブサイト");
            foreach (dchar expected; "トイサブ"d)
                testDecodeBack(r, expected, 1);
        }
    }

    foreach (Make; AliasSeq!((wchar[] s) => s.idup, RandomCU!wchar, (wstring s) => new RefRandomCU!wchar(s)))
    {
        // pair, single unit, pair
        auto units = Make([cast(wchar) 0xD800, cast(wchar) 0xDC00,
                           cast(wchar) 0x1400,
                           cast(wchar) 0xDAA7, cast(wchar) 0xDDDE]);

        testDecode(units, 0, cast(dchar) 0x10000, 2);
        testDecode(units, 2, cast(dchar) 0x1400, 3);
        testDecode(units, 3, cast(dchar) 0xB9DDE, 5);

        testDecodeBack(units, cast(dchar) 0xB9DDE, 2);
        testDecodeBack(units, cast(dchar) 0x1400, 1);
        testDecodeBack(units, cast(dchar) 0x10000, 2);
    }
    });
}
2162 
// Decoding of UTF-32 input through dstrings and the test range wrappers, both
// at run time and in CTFE.
@system unittest
{
    import std.exception;
    assertCTFEable!(
    {
    foreach (Make; AliasSeq!((dstring s) => s, RandomCU!dchar, InputCU!dchar,
                             (dstring s) => new RefBidirCU!dchar(s),
                             (dstring s) => new RefRandomCU!dchar(s)))
    {
        // Valid code points decode to themselves.
        testAllDecode(Make([cast(dchar) 0x1111]), cast(dchar) 0x1111, 1);
        testAllDecode(Make([cast(dchar) 0x10000]), cast(dchar) 0x10000, 1);
        testAllDecode(Make([cast(dchar) 0x10FFFF]), cast(dchar) 0x10FFFF, 1);
        testAllDecode(Make([cast(dchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
        testAllDecode(Make([cast(dchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);

        // Surrogates and values above U+10FFFF must be rejected.
        testBadDecode(Make([cast(dchar) 0xD800]), 0);
        testBadDecode(Make([cast(dchar) 0xDFFE]), 0);
        testBadDecode(Make([cast(dchar) 0x110000]), 0);

        testBadDecodeBack(Make([cast(dchar) 0xD800]));
        testBadDecodeBack(Make([cast(dchar) 0xDFFE]));
        testBadDecodeBack(Make([cast(dchar) 0x110000]));

        // Forward decoding: one code unit per code point.
        {
            auto r = Make("ウェブサイト");
            testDecode(r, 0, 'ウ', 1);
            testDecode(r, 1, 'ェ', 2);
            testDecodeFront(r, 'ウ', 1);
            testDecodeFront(r, 'ェ', 1);
            assert(decodeFront(r) == 'ブ');
            assert(decodeFront(r) == 'サ');
        }

        // Backward decoding of the last four code points.
        {
            auto r = Make("ウェブサイト");
            foreach (dchar expected; "トイサブ"d)
                testDecodeBack(r, expected, 1);
        }
    }

    foreach (Make; AliasSeq!((dchar[] s) => s.idup, RandomCU!dchar, (dstring s) => new RefRandomCU!dchar(s)))
    {
        auto units = Make([cast(dchar) 0x10000, cast(dchar) 0x1400, cast(dchar) 0xB9DDE]);

        testDecode(units, 0, cast(dchar) 0x10000, 1);
        testDecode(units, 1, cast(dchar) 0x1400, 2);
        testDecode(units, 2, cast(dchar) 0xB9DDE, 3);

        testDecodeBack(units, cast(dchar) 0xB9DDE, 1);
        testDecodeBack(units, cast(dchar) 0x1400, 1);
        testDecodeBack(units, cast(dchar) 0x10000, 1);
    }
    });
}
2217 
// Compile-time checks of the attributes inferred for the decode functions on
// every built-in string type: decode and decodeFront must be @safe, and
// decode, decodeFront and decodeBack must be pure.
@safe unittest
{
    import std.exception;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;
    assertCTFEable!(
    {
    foreach (Str; AliasSeq!( char[], const( char)[],  string,
                            wchar[], const(wchar)[], wstring,
                            dchar[], const(dchar)[], dstring))
    {
        // @safe
        static assert(isSafe!({ Str s; size_t pos = 0; decode(s, pos); }));
        static assert(isSafe!({ Str s; size_t pos = 0; decodeFront(s, pos); }));
        static assert(isSafe!({ Str s; decodeFront(s); }));

        // pure
        static assert((functionAttributes!({
            Str s; size_t pos = 0; decode(s, pos);
        }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({
            Str s; size_t pos = 0; decodeFront(s, pos);
        }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({
            Str s; decodeFront(s);
        }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({
            Str s; size_t pos = 0; decodeBack(s, pos);
        }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({
            Str s; decodeBack(s);
        }) & FunctionAttribute.pure_) != 0);
    }
    });
}
2243 
// A four-byte sequence whose payload bits are all set (0xF7 0xBF 0xBF 0xBF)
// encodes 0x1FFFFF, which is above U+10FFFF, so decode must throw.
@safe unittest
{
    import std.exception;

    char[4] val = [0b1111_0111, 0b1011_1111, 0b1011_1111, 0b1011_1111];
    size_t i = 0;
    assertThrown!UTFException((){ dchar decoded = decode(val[], i); }());
}
2255 /* =================== Encode ======================= */
2256 
// Shared error path of the encode overloads.
//
// With No.useReplacementDchar this throws a UTFException built from `msg`,
// with `c` attached as the offending sequence. With Yes.useReplacementDchar
// nothing is thrown and `replacementDchar` is returned for the caller to
// encode in place of `c`.
private dchar _utfException(UseReplacementDchar useReplacementDchar)(string msg, dchar c)
{
    static if (!useReplacementDchar)
        throw new UTFException(msg).setSequence(c);
    else
        return replacementDchar;
}
2264 
/++
    Encodes `c` into the static array `buf` and returns the number of code
    units that were written: between `1` and `4` for `char[4]` buffers and
    between `1` and `2` for `wchar[2]` buffers.

    Throws:
        `UTFException` if `c` is not a valid UTF code point.
  +/
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out char[4] buf, dchar c) @safe pure
{
    // One code unit: 0xxxxxxx
    if (c <= 0x7F)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char) c;
        return 1;
    }

    // Two code units: 110xxxxx 10xxxxxx
    if (c <= 0x7FF)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        return 2;
    }

    // Reject surrogates and values above U+10FFFF. _utfException either
    // throws or hands back the replacement character to encode instead.
    if (c > 0x10FFFF)
    {
        assert(!isValidDchar(c));
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-8", c);
    }
    else if (0xD800 <= c && c <= 0xDFFF)
        c = _utfException!useReplacementDchar("Encoding a surrogate code point in UTF-8", c);

    assert(isValidDchar(c));

    // Three code units: 1110xxxx 10xxxxxx 10xxxxxx
    if (c <= 0xFFFF)
    {
        buf[0] = cast(char)(0xE0 | (c >> 12));
        buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[2] = cast(char)(0x80 | (c & 0x3F));
        return 3;
    }

    // Four code units: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    buf[0] = cast(char)(0xF0 | (c >> 18));
    buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
    buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
    buf[3] = cast(char)(0x80 | (c & 0x3F));
    return 4;
}
2316 
///
@safe unittest
{
    import std.exception : assertThrown;
    import std.typecons : Yes;

    char[4] buf;

    // The return value is the number of code units written to `buf`.
    assert(encode(buf, '\u0000') == 1);
    assert(buf[0 .. 1] == "\u0000");

    assert(encode(buf, '\u007F') == 1);
    assert(buf[0 .. 1] == "\u007F");

    assert(encode(buf, '\u0080') == 2);
    assert(buf[0 .. 2] == "\u0080");

    assert(encode(buf, '\uE000') == 3);
    assert(buf[0 .. 3] == "\uE000");

    assert(encode(buf, 0xFFFE) == 3);
    assert(buf[0 .. 3] == "\xEF\xBF\xBE");

    // Values above U+10FFFF are rejected ...
    assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));

    // ... unless the replacement character is requested instead.
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    auto encoded = buf[];
    assert(encoded.decodeFront == replacementDchar);
}
2336 
///
@safe unittest
{
    import std.exception : assertThrown;
    import std.typecons : Yes;

    wchar[2] buf;

    // Code points up to U+FFFF take a single code unit.
    assert(encode(buf, '\u0000') == 1);
    assert(buf[0 .. 1] == "\u0000");

    assert(encode(buf, '\uD7FF') == 1);
    assert(buf[0 .. 1] == "\uD7FF");

    assert(encode(buf, '\uE000') == 1);
    assert(buf[0 .. 1] == "\uE000");

    // Code points above U+FFFF take a surrogate pair.
    assert(encode(buf, '\U00010000') == 2);
    assert(buf[0 .. 2] == "\U00010000");

    assert(encode(buf, '\U0010FFFF') == 2);
    assert(buf[0 .. 2] == "\U0010FFFF");

    // A surrogate on its own is rejected ...
    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));

    // ... and invalid input can be replaced instead of throwing.
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    auto encoded = buf[];
    assert(encoded.decodeFront == replacementDchar);
}
2356 
///
@safe unittest
{
    import std.exception : assertThrown;
    import std.typecons : Yes;

    dchar[1] buf;

    // Every valid code point is stored as a single code unit.
    assert(encode(buf, '\u0000') == 1);
    assert(buf[0] == '\u0000');

    assert(encode(buf, '\uD7FF') == 1);
    assert(buf[0] == '\uD7FF');

    assert(encode(buf, '\uE000') == 1);
    assert(buf[0] == '\uE000');

    assert(encode(buf, '\U0010FFFF') == 1);
    assert(buf[0] == '\U0010FFFF');

    // Surrogates are rejected ...
    assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));

    // ... and invalid input can be replaced instead of throwing.
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    assert(buf[0] == replacementDchar);
}
2374 
// Boundary values for encoding into a char[4] buffer, at run time and in CTFE.
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    char[4] buf;

    // First and last code point of each encoded length.
    assert(encode(buf, '\u0000') == 1);
    assert(buf[0 .. 1] == "\u0000");
    assert(encode(buf, '\u007F') == 1);
    assert(buf[0 .. 1] == "\u007F");

    assert(encode(buf, '\u0080') == 2);
    assert(buf[0 .. 2] == "\u0080");
    assert(encode(buf, '\u07FF') == 2);
    assert(buf[0 .. 2] == "\u07FF");

    assert(encode(buf, '\u0800') == 3);
    assert(buf[0 .. 3] == "\u0800");
    assert(encode(buf, '\uD7FF') == 3);
    assert(buf[0 .. 3] == "\uD7FF");
    assert(encode(buf, '\uE000') == 3);
    assert(buf[0 .. 3] == "\uE000");
    assert(encode(buf, 0xFFFE) == 3);
    assert(buf[0 .. 3] == "\xEF\xBF\xBE");
    assert(encode(buf, 0xFFFF) == 3);
    assert(buf[0 .. 3] == "\xEF\xBF\xBF");

    assert(encode(buf, '\U00010000') == 4);
    assert(buf[0 .. 4] == "\U00010000");
    assert(encode(buf, '\U0010FFFF') == 4);
    assert(buf[0 .. 4] == "\U0010FFFF");

    // Surrogates and values above U+10FFFF are rejected.
    foreach (dchar bad; [cast(dchar) 0xD800, cast(dchar) 0xDBFF,
                         cast(dchar) 0xDC00, cast(dchar) 0xDFFF,
                         cast(dchar) 0x110000])
        assertThrown!UTFException(encode(buf, bad));

    // With replacement enabled the buffer holds the encoded U+FFFD.
    assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride);
    enum encodedReplacement = "\uFFFD";
    assert(buf[0 .. encodedReplacement.length] == encodedReplacement);
    });
}
2405 
2406 
/// Ditto
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out wchar[2] buf, dchar c) @safe pure
{
    // Code points above U+FFFF are stored as a surrogate pair.
    if (0xFFFF < c && c <= 0x10FFFF)
    {
        assert(isValidDchar(c));
        immutable offset = c - 0x10000;
        buf[0] = cast(wchar)(((offset >> 10) & 0x3FF) + 0xD800);
        buf[1] = cast(wchar)((offset & 0x3FF) + 0xDC00);
        return 2;
    }

    // Reject values above U+10FFFF and lone surrogates. _utfException either
    // throws or hands back the replacement character to encode instead.
    if (c > 0x10FFFF)
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-16", c);
    else if (0xD800 <= c && c <= 0xDFFF)
        c = _utfException!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c);

    // Everything that reaches this point fits in a single code unit.
    assert(isValidDchar(c));
    buf[0] = cast(wchar) c;
    return 1;
}
2432 
// Boundary values for encoding into a wchar[2] buffer, at run time and in CTFE.
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    wchar[2] buf;

    // Single code unit.
    assert(encode(buf, '\u0000') == 1);
    assert(buf[0 .. 1] == "\u0000");
    assert(encode(buf, '\uD7FF') == 1);
    assert(buf[0 .. 1] == "\uD7FF");
    assert(encode(buf, '\uE000') == 1);
    assert(buf[0 .. 1] == "\uE000");
    assert(encode(buf, 0xFFFE) == 1);
    assert(buf[0] == 0xFFFE);
    assert(encode(buf, 0xFFFF) == 1);
    assert(buf[0] == 0xFFFF);

    // Surrogate pair.
    assert(encode(buf, '\U00010000') == 2);
    assert(buf[0 .. 2] == "\U00010000");
    assert(encode(buf, '\U0010FFFF') == 2);
    assert(buf[0 .. 2] == "\U0010FFFF");

    // Surrogates and values above U+10FFFF are rejected.
    foreach (dchar bad; [cast(dchar) 0xD800, cast(dchar) 0xDBFF,
                         cast(dchar) 0xDC00, cast(dchar) 0xDFFF,
                         cast(dchar) 0x110000])
        assertThrown!UTFException(encode(buf, bad));

    // With replacement enabled the buffer holds U+FFFD.
    assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride);
    assert(buf.front == replacementDchar);
    });
}
2458 
2459 
/// Ditto
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out dchar[1] buf, dchar c) @safe pure
{
    immutable bool isSurrogate = 0xD800 <= c && c <= 0xDFFF;

    // _utfException either throws or hands back the replacement character.
    if (isSurrogate || c > 0x10FFFF)
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c);
    else
        assert(isValidDchar(c));

    // UTF-32 always uses exactly one code unit.
    buf[0] = c;
    return 1;
}
2471 
// Boundary values for encoding into a dchar[1] buffer, at run time and in CTFE.
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    dchar[1] buf;

    // Valid code points are stored unchanged.
    encode(buf, '\u0000');
    assert(buf[0] == '\u0000');
    encode(buf, '\uD7FF');
    assert(buf[0] == '\uD7FF');
    encode(buf, '\uE000');
    assert(buf[0] == '\uE000');
    encode(buf, 0xFFFE );
    assert(buf[0] == 0xFFFE);
    encode(buf, 0xFFFF );
    assert(buf[0] == 0xFFFF);
    encode(buf, '\U0010FFFF');
    assert(buf[0] == '\U0010FFFF');

    // Surrogates and values above U+10FFFF are rejected.
    foreach (dchar bad; [cast(dchar) 0xD800, cast(dchar) 0xDBFF,
                         cast(dchar) 0xDC00, cast(dchar) 0xDFFF,
                         cast(dchar) 0x110000])
        assertThrown!UTFException(encode(buf, bad));

    // With replacement enabled the buffer holds U+FFFD.
    assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride);
    assert(buf.front == replacementDchar);
    });
}
2496 
2497 
/++
    Encodes `c` in the encoding of `str` and appends the resulting code
    units to `str`.

    Throws:
        `UTFException` if `c` is not a valid UTF code point.
  +/
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref char[] str, dchar c) @safe pure
{
    // One code unit: append it directly.
    if (c <= 0x7F)
    {
        assert(isValidDchar(c));
        str ~= cast(char) c;
        return;
    }

    char[4] units;
    size_t count;

    if (c <= 0x7FF)
    {
        // Two code units: 110xxxxx 10xxxxxx
        assert(isValidDchar(c));
        units[0] = cast(char)(0xC0 | (c >> 6));
        units[1] = cast(char)(0x80 | (c & 0x3F));
        count = 2;
    }
    else
    {
        // Reject surrogates and values above U+10FFFF. _utfException either
        // throws (leaving `str` untouched) or hands back the replacement
        // character to encode instead.
        if (c > 0x10FFFF)
        {
            assert(!isValidDchar(c));
            c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-8", c);
        }
        else if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding a surrogate code point in UTF-8", c);

        assert(isValidDchar(c));

        if (c <= 0xFFFF)
        {
            // Three code units: 1110xxxx 10xxxxxx 10xxxxxx
            units[0] = cast(char)(0xE0 | (c >> 12));
            units[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            units[2] = cast(char)(0x80 | (c & 0x3F));
            count = 3;
        }
        else
        {
            // Four code units: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            units[0] = cast(char)(0xF0 | (c >> 18));
            units[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
            units[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            units[3] = cast(char)(0x80 | (c & 0x3F));
            count = 4;
        }
    }

    str ~= units[0 .. count];
}
2557 
///
@safe unittest
{
    char[] text = "abcd".dup;
    dchar ascii = 'a';
    dchar nonAscii = 'ø';

    // An ASCII character adds one code unit.
    encode(text, ascii);
    assert(text.length == 5);
    assert(text == "abcda");

    // 'ø' adds two code units.
    encode(text, nonAscii);
    assert(text.length == 7);
    assert(text == "abcdaø");
}
2572 
// Appending one-, two- and three-code-unit characters to a char[], at run
// time and in CTFE.
@safe unittest
{
    import std.exception;

    assertCTFEable!(
    {
    char[] text = "abcd".dup;

    // one code unit
    encode(text, cast(dchar)'a');
    assert(text.length == 5);
    assert(text == "abcda");

    // two code units
    encode(text, cast(dchar)'\u00A9');
    assert(text.length == 7);
    assert(text == "abcda\xC2\xA9");
    //assert(text == "abcda\u00A9");   // BUG: fix compiler

    // three code units
    encode(text, cast(dchar)'\u2260');
    assert(text.length == 10);
    assert(text == "abcda\xC2\xA9\xE2\x89\xA0");
    });
}
2594 
// Boundary values for appending to a char[], at run time and in CTFE. Each
// slice starts at the offset where the character just appended must begin.
@safe unittest
{
    import std.exception;
    assertCTFEable!(
    {
    char[] buf;

    // One code unit each.
    encode(buf, '\u0000');
    assert(buf[0 .. $] == "\u0000");
    encode(buf, '\u007F');
    assert(buf[1 .. $] == "\u007F");

    // Two code units each.
    encode(buf, '\u0080');
    assert(buf[2 .. $] == "\u0080");
    encode(buf, '\u07FF');
    assert(buf[4 .. $] == "\u07FF");

    // Three code units each.
    encode(buf, '\u0800');
    assert(buf[6 .. $] == "\u0800");
    encode(buf, '\uD7FF');
    assert(buf[9 .. $] == "\uD7FF");
    encode(buf, '\uE000');
    assert(buf[12 .. $] == "\uE000");
    encode(buf, 0xFFFE);
    assert(buf[15 .. $] == "\xEF\xBF\xBE");
    encode(buf, 0xFFFF);
    assert(buf[18 .. $] == "\xEF\xBF\xBF");

    // Four code units each.
    encode(buf, '\U00010000');
    assert(buf[21 .. $] == "\U00010000");
    encode(buf, '\U0010FFFF');
    assert(buf[25 .. $] == "\U0010FFFF");

    // Surrogates and values above U+10FFFF are rejected.
    foreach (dchar bad; [cast(dchar) 0xD800, cast(dchar) 0xDBFF,
                         cast(dchar) 0xDC00, cast(dchar) 0xDFFF,
                         cast(dchar) 0x110000])
        assertThrown!UTFException(encode(buf, bad));

    // With replacement enabled, the encoded U+FFFD is appended instead.
    enum encodedReplacement = "\uFFFD";
    enum tailLength = encodedReplacement.length;
    assert(buf[$ - tailLength .. $] != encodedReplacement);
    encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
    assert(buf[$ - tailLength .. $] == encodedReplacement);
    });
}
2627 
/// ditto
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref wchar[] str, dchar c) @safe pure
{
    // Code points above the BMP but within Unicode are appended as a surrogate pair.
    if (0xFFFF < c && c <= 0x10FFFF)
    {
        assert(isValidDchar(c));

        immutable offset = c - 0x10000;
        wchar[2] pair;
        pair[0] = cast(wchar)(((offset >> 10) & 0x3FF) + 0xD800);
        pair[1] = cast(wchar)((offset & 0x3FF) + 0xDC00);
        str ~= pair;
        return;
    }

    if (c > 0x10FFFF)
    {
        // Beyond the Unicode range: _utfException either throws or yields
        // the value that is appended in place of `c`.
        assert(!isValidDchar(c));
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-16", c);
    }
    else
    {
        // BMP value: a lone surrogate is not a valid code point.
        if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c);

        assert(isValidDchar(c));
    }

    // Whatever is left fits in a single UTF-16 code unit.
    str ~= cast(wchar) c;
}
2661 
@safe unittest
{
    import std.exception;

    assertCTFEable!(
    {
        wchar[] utf16;

        // BMP code points occupy exactly one code unit each.
        encode(utf16, '\u0000');
        assert(utf16[0] == '\u0000');
        encode(utf16, '\uD7FF');
        assert(utf16[1] == '\uD7FF');
        encode(utf16, '\uE000');
        assert(utf16[2] == '\uE000');
        encode(utf16, 0xFFFE);
        assert(utf16[3] == 0xFFFE);
        encode(utf16, 0xFFFF);
        assert(utf16[4] == 0xFFFF);

        // Supplementary code points occupy two code units each.
        encode(utf16, '\U00010000');
        assert(utf16[5 .. $] == "\U00010000");
        encode(utf16, '\U0010FFFF');
        assert(utf16[7 .. $] == "\U0010FFFF");

        // Surrogates and values above U+10FFFF are rejected by default.
        assertThrown!UTFException(encode(utf16, cast(dchar) 0xD800));
        assertThrown!UTFException(encode(utf16, cast(dchar) 0xDBFF));
        assertThrown!UTFException(encode(utf16, cast(dchar) 0xDC00));
        assertThrown!UTFException(encode(utf16, cast(dchar) 0xDFFF));
        assertThrown!UTFException(encode(utf16, cast(dchar) 0x110000));

        // With Yes.useReplacementDchar the replacement character is appended instead.
        assert(utf16.back != replacementDchar);
        encode!(Yes.useReplacementDchar)(utf16, cast(dchar) 0x110000);
        assert(utf16.back == replacementDchar);
    });
}
2688 
/// ditto
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref dchar[] str, dchar c) @safe pure
{
    immutable bool isSurrogate = 0xD800 <= c && c <= 0xDFFF;
    immutable bool outOfRange = 0x10FFFF < c;

    if (!isSurrogate && !outOfRange)
        assert(isValidDchar(c));
    else
        // _utfException either throws or yields the value appended in place of `c`.
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c);

    str ~= c;
}
2699 
@safe unittest
{
    import std.exception;

    assertCTFEable!(
    {
        dchar[] utf32;

        // Every accepted code point is stored as one element.
        encode(utf32, '\u0000');
        assert(utf32[0] == '\u0000');
        encode(utf32, '\uD7FF');
        assert(utf32[1] == '\uD7FF');
        encode(utf32, '\uE000');
        assert(utf32[2] == '\uE000');
        encode(utf32, 0xFFFE );
        assert(utf32[3] == 0xFFFE);
        encode(utf32, 0xFFFF );
        assert(utf32[4] == 0xFFFF);
        encode(utf32, '\U0010FFFF');
        assert(utf32[5] == '\U0010FFFF');

        // Surrogates and values above U+10FFFF are rejected by default.
        assertThrown!UTFException(encode(utf32, cast(dchar) 0xD800));
        assertThrown!UTFException(encode(utf32, cast(dchar) 0xDBFF));
        assertThrown!UTFException(encode(utf32, cast(dchar) 0xDC00));
        assertThrown!UTFException(encode(utf32, cast(dchar) 0xDFFF));
        assertThrown!UTFException(encode(utf32, cast(dchar) 0x110000));

        // With Yes.useReplacementDchar the replacement character is appended instead.
        assert(utf32.back != replacementDchar);
        encode!(Yes.useReplacementDchar)(utf32, cast(dchar) 0x110000);
        assert(utf32.back == replacementDchar);
    });
}
2725 
2726 
/++
    Computes how many code units of character type `C` are needed to encode
    the single code point `c`.

    Params:
        C = the character type of the target encoding
        c = the code point to measure

    Returns:
        The number of `C` code units occupied by `c`. When `C` is one byte
        wide, `c` must not exceed `'\U0010FFFF'`; a larger value hits
        `assert(false)`.
  +/
ubyte codeLength(C)(dchar c) @safe pure nothrow @nogc
if (isSomeChar!C)
{
    static if (C.sizeof == 4)
    {
        return 1;
    }
    else static if (C.sizeof == 2)
    {
        // Anything above the BMP takes two code units.
        return c > 0xFFFF ? 2 : 1;
    }
    else
    {
        static assert(C.sizeof == 1);

        if (c > 0x10FFFF) assert(false);
        if (c > 0xFFFF) return 4;
        if (c > 0x7FF) return 3;
        return c > 0x7F ? 2 : 1;
    }
}
2752 
///
@safe pure nothrow @nogc unittest
{
    // An ASCII code point is a single code unit in every encoding.
    enum dchar ascii = 'a';
    assert(codeLength!char(ascii) == 1);
    assert(codeLength!wchar(ascii) == 1);
    assert(codeLength!dchar(ascii) == 1);

    // The highest code point takes four UTF-8 or two UTF-16 code units.
    enum dchar highest = '\U0010FFFF';
    assert(codeLength!char(highest) == 4);
    assert(codeLength!wchar(highest) == 2);
    assert(codeLength!dchar(highest) == 1);
}
2764 
2765 
/++
    Computes the number of code units that `input` occupies once it is
    encoded with character type `C`. This is particularly useful when one
    string has to be sliced with the length of another and the two strings
    use different character types.

    Params:
        C = the character type to get the encoding length for
        input = the $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
        to calculate the encoding length from
    Returns:
        The number of code units in `input` when encoded to `C`
  +/
size_t codeLength(C, InputRange)(InputRange input)
if (isInputRange!InputRange && !isInfinite!InputRange && isSomeChar!(ElementType!InputRange))
{
    alias SourceChar = Unqual!(ElementEncodingType!InputRange);
    enum alreadyEncoded = isSomeString!InputRange && is(SourceChar == C) && is(typeof(input.length));

    static if (alreadyEncoded)
    {
        // The string already uses `C`, so its length is the answer.
        return input.length;
    }
    else
    {
        // Sum the per-code-point lengths.
        size_t units = 0;

        foreach (codePoint; input.byDchar)
            units += codeLength!C(codePoint);

        return units;
    }
}
2795 
///
@safe unittest
{
    // ASCII text
    assert(codeLength!char("hello world") == "hello world".length);
    assert(codeLength!wchar("hello world") == "hello world"w.length);
    assert(codeLength!dchar("hello world") == "hello world"d.length);

    // Non-ASCII text
    assert(codeLength!char(`プログラミング`) == `プログラミング`.length);
    assert(codeLength!wchar(`プログラミング`) == `プログラミング`w.length);
    assert(codeLength!dchar(`プログラミング`) == `プログラミング`d.length);

    // Slicing a UTF-8 string by the length of a UTF-16 one
    wstring needle = `Être sans la verité`;
    string haystack = `Être sans la verité, ça, ce ne serait pas bien.`;
    immutable skip = codeLength!char(needle);
    assert(haystack[skip .. $] == `, ça, ce ne serait pas bien.`);
}
2818 
@safe unittest
{
    import std.algorithm.iteration : filter;
    import std.conv : to;
    import std.exception;

    // Every source string type against every target character type.
    assertCTFEable!(
    {
        foreach (S; AliasSeq!( char[], const  char[],  string,
                              wchar[], const wchar[], wstring,
                              dchar[], const dchar[], dstring))
        {
            foreach (C; AliasSeq!(char, wchar, dchar))
            {
                foreach (sample; ["Walter Bright", `言語`, `ウェブサイト@La_Verité.com`])
                    assert(codeLength!C(to!S(sample)) == to!(C[])(sample).length);

                // Same comparison with the source wrapped in a `filter` range.
                enum site = `ウェブサイト@La_Verité.com`;
                assert(codeLength!C(to!S(site).filter!(x => true)()) ==
                       to!(C[])(site).length);
            }
        }
    });
}
2843 
/+
Internal helper function:

Tells whether the code point `c` can be searched for directly among code
units of type `C`, i.e. without decoding.

This is a runtime check that is used as an optimization in various functions,
particularly in `std.string`.
  +/
package bool canSearchInCodeUnits(C)(dchar c)
if (isSomeChar!C)
{
    static if (C.sizeof == 4)
    {
        return true;
    }
    else static if (C.sizeof == 2)
    {
        // Anything that fits in 16 bits except the surrogate block.
        if (c > 0xFFFF)
            return false;
        return c < 0xD800 || 0xDFFF < c;
    }
    else static if (C.sizeof == 1)
    {
        return c < 0x80;
    }
    else
        static assert(0);
}
@safe unittest
{
    // ASCII
    assert( canSearchInCodeUnits! char('a'));
    assert( canSearchInCodeUnits!wchar('a'));
    assert( canSearchInCodeUnits!dchar('a'));

    // Important tests: ö <= 0xFF, yet it must not be searchable as a `char`
    assert(!canSearchInCodeUnits! char('ö'));
    assert(!canSearchInCodeUnits! char(cast(char)'ö'));
    assert( canSearchInCodeUnits!wchar('ö'));
    assert( canSearchInCodeUnits!dchar('ö'));

    // A character searchable as wchar and dchar only
    assert(!canSearchInCodeUnits! char('日'));
    assert( canSearchInCodeUnits!wchar('日'));
    assert( canSearchInCodeUnits!dchar('日'));

    // A value from the surrogate block
    assert(!canSearchInCodeUnits!wchar(cast(wchar) 0xDA00));
    assert( canSearchInCodeUnits!dchar(cast(dchar) 0xDA00));

    // A code point above 0xFFFF
    assert(!canSearchInCodeUnits! char('\U00010001'));
    assert(!canSearchInCodeUnits!wchar('\U00010001'));
    assert( canSearchInCodeUnits!dchar('\U00010001'));
}
2883 
2884 /* =================== Validation ======================= */
2885 
/++
    Verifies that `str` is well-formed unicode by decoding it from start
    to end.

    Params:
        str = the string to check

    Throws:
        `UTFException` if `str` is not well-formed.
  +/
void validate(S)(in S str) @safe pure
if (isSomeString!S)
{
    size_t pos = 0;

    // `decode` receives the index and moves it forward, so the loop
    // needs no increment of its own.
    while (pos < str.length)
        decode(str, pos);
}
2901 
///
@safe unittest
{
    import std.exception : assertThrown;

    // Not a well-formed UTF-8 sequence
    char[] malformed = [167, 133, 175];
    assertThrown!UTFException(validate(malformed));
}
2909 
// https://issues.dlang.org/show_bug.cgi?id=12923
@safe unittest
{
    import std.exception;

    // Validate a slice of a static array from inside a lambda.
    assertThrown(
        ()
        {
            char[3] malformed = [167, 133, 175];
            validate(malformed[]);
        }());
}
2919 
/**
 * Converts the elements of `s` to UTF-8, returning them as a newly
 * allocated `string`.
 *
 * Params:
 *     s = the range (or string) to encode
 * Returns:
 *     A UTF-8 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
 */
string toUTF8(S)(S s)
if (isInputRange!S && !isInfinite!S && isSomeChar!(ElementEncodingType!S))
{
    string encoded = toUTFImpl!string(s);
    return encoded;
}
2936 
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // The ø is represented by two UTF-8 code units
    assert("Hellø"w.toUTF8.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8]));

    // 𐐷 takes four code units in UTF-8
    assert("𐐷"d.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7]));
}
2948 
@system pure unittest
{
    import std.algorithm.comparison : equal;
    import std.internal.test.dummyrange : ReferenceInputRange;

    // Same conversions as the documented example, fed from class-based input ranges.
    alias Source = ReferenceInputRange!(ElementType!(string));

    auto twoUnits = new Source("Hellø");
    assert(twoUnits.toUTF8.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8]));

    auto fourUnits = new Source("𐐷");
    assert(fourUnits.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7]));
}
2961 
/**
 * Converts the elements of `s` to UTF-16, returning them as a newly GC
 * allocated `wstring`.
 *
 * Params:
 *     s = the range to encode
 * Returns:
 *     A UTF-16 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
 */
wstring toUTF16(S)(S s)
if (isInputRange!S && !isInfinite!S && isSomeChar!(ElementEncodingType!S))
{
    wstring encoded = toUTFImpl!wstring(s);
    return encoded;
}
2978 
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // each of these is one code unit in UTF-32 and two in UTF-16
    assert("𤭢"d.length == 1);
    assert("𤭢"d.toUTF16.equal([0xD852, 0xDF62]));

    assert("𐐷"d.length == 1);
    assert("𐐷"d.toUTF16.equal([0xD801, 0xDC37]));
}
2991 
@system pure unittest
{
    import std.algorithm.comparison : equal;
    import std.internal.test.dummyrange : ReferenceInputRange;

    // Same conversions as the documented example, fed from class-based input ranges.
    alias Source = ReferenceInputRange!(ElementType!(string));

    auto first = new Source("𤭢");
    assert(first.toUTF16.equal([0xD852, 0xDF62]));

    auto second = new Source("𐐷");
    assert(second.toUTF16.equal([0xD801, 0xDC37]));
}
3004 
3005 
/**
 * Converts the elements of `s` to UTF-32, returning them as a newly GC
 * allocated `dstring`.
 *
 * Params:
 *     s = the range to encode
 * Returns:
 *     A UTF-32 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
 */
dstring toUTF32(S)(S s)
if (isInputRange!S && !isInfinite!S && isSomeChar!(ElementEncodingType!S))
{
    dstring encoded = toUTFImpl!dstring(s);
    return encoded;
}
3022 
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // each of these is two code units in UTF-16 and one in UTF-32
    assert("𤭢"w.length == 2);
    assert("𤭢"w.toUTF32.equal([0x00024B62]));

    assert("𐐷"w.length == 2);
    assert("𐐷"w.toUTF32.equal([0x00010437]));
}
3035 
// Shared implementation of toUTF8/toUTF16/toUTF32: produces a `T` holding
// the contents of `s` re-encoded to the character type of `T`.
private T toUTFImpl(T, S)(S s)
{
    static if (is(S : T))
    {
        // `s` already converts to the target string type: hand back a copy.
        return s.idup;
    }
    else
    {
        import std.array : appender;

        alias TargetChar = Unqual!(ElementEncodingType!T);
        auto sink = appender!T();

        // Reserve room for the source length when it is known up front.
        static if (hasLength!S || isSomeString!S)
            sink.reserve(s.length);

        foreach (unit; s.byUTF!TargetChar)
            sink.put(unit);

        return sink.data;
    }
}
3056 
3057 /* =================== toUTFz ======================= */
3058 
/++
    Converts `str` into an equivalent C-style zero-terminated string.

    `str` must not contain embedded `'\0'`'s, because any C function treats
    the first `'\0'` it encounters as the end of the string. If `str.empty`
    is `true`, a string consisting of only `'\0'` is returned.

    `toUTFz` accepts any type of string and is templated on the type of
    character pointer to convert to. It avoids allocating a new string when
    it can, but there is a decent chance that it ends up allocating one -
    particularly for character types other than `char`.

    $(RED Warning 1:) If the result of `toUTFz` equals `str.ptr`, then the
    terminating `'\0'` is the character one past the end of `str`, and the
    string stops being zero-terminated as soon as anything alters that
    character. The most likely ways for that to happen are appending to
    `str` without a reallocation taking place, or `str` being a slice of a
    larger array whose element one past the end of `str` gets modified.
    It could also occur if a mutable character array sits immediately after
    `str` in memory (for example, two member variables of a user-defined
    type declared one right after the other) and that array happened to
    start with `'\0'`. None of these scenarios occur if the zero-terminated
    string is used immediately after calling `toUTFz` and the C function
    using it does not keep a reference to it. They are also unlikely to
    occur when the zero-terminated string is saved (the cases above are
    among the few examples of where it could happen). However, to be
    absolutely certain that a saved zero-terminated string stays
    zero-terminated, append a `'\0'` to the string and use its `ptr`
    property rather than calling `toUTFz`.

    $(RED Warning 2:) When a character pointer is passed to a C function
    and the C function keeps it around for any reason, make sure that a
    reference to it is kept in the D code as well. Otherwise it may go away
    during a garbage collection cycle and cause a nasty bug when the C code
    tries to use it.
  +/
template toUTFz(P)
{
    P toUTFz(S)(S str) @safe pure
    {
        // Dispatch to the overload matching this pointer/string combination.
        return toUTFzImpl!(P, S)(str);
    }
}
3103 
///
@safe pure unittest
{
    // Same character type, every mutability of the result pointer
    auto mutablePtr = toUTFz!(char*)("hello world");
    auto constPtr = toUTFz!(const(char)*)("hello world");
    auto immutablePtr = toUTFz!(immutable(char)*)("hello world");

    // Source and result use different character types
    auto fromDstring = toUTFz!(char*)("hello world"d);
    auto toWide = toUTFz!(const(wchar)*)("hello world");
    auto wideToDchar = toUTFz!(immutable(dchar)*)("hello world"w);
}
3114 
private P toUTFzImpl(P, S)(S str) @safe pure
if (isSomeString!S && isPointer!P && isSomeChar!(typeof(*P.init)) &&
    is(immutable typeof(*P.init) == immutable ElementEncodingType!S) &&
    is(immutable ElementEncodingType!S == ElementEncodingType!S))
//immutable(C)[] -> C*, const(C)*, or immutable(C)*
{
    alias OutChar = typeof(*P.init);

    // An empty string becomes a fresh array holding only the terminator.
    if (str.empty)
    {
        OutChar[] terminator = ['\0'];
        return (() @trusted => terminator.ptr)();
    }

    alias C = Unqual!(ElementEncodingType!S);

    // A mutable P always requires a copy. Otherwise `str` itself can be
    // returned when it is already followed by a zero.
    static if (!is(Unqual!OutChar == OutChar))
    {
        if (!__ctfe)
        {
            immutable pastEnd = (() @trusted => str.ptr + str.length)();

            // Look at the element just past the end of str; if it is 0, no
            // conversion is needed. The compiler puts a 0 after static
            // strings, and the storage allocator puts a 0 after newly
            // allocated char[]'s.
            // Whether that element may be read is decided by a simple test:
            // an address that is a multiple of 4 is conservatively assumed
            // to possibly start a new, unreadable block of memory; any
            // other address is taken to point to valid memory.
            if ((cast(size_t) pastEnd & 3) && *pastEnd == '\0')
                return &str[0];
        }
    }

    // Hand over to the const(C)[] overload.
    return toUTFzImpl!(P, const(C)[])(cast(const(C)[])str);
}
3159 
private P toUTFzImpl(P, S)(S str) @safe pure
if (isSomeString!S && isPointer!P && isSomeChar!(typeof(*P.init)) &&
    is(immutable typeof(*P.init) == immutable ElementEncodingType!S) &&
    !is(immutable ElementEncodingType!S == ElementEncodingType!S))
//C[] or const(C)[] -> C*, const(C)*, or immutable(C)*
{
    alias InChar  = ElementEncodingType!S;
    alias OutChar = typeof(*P.init);

    enum inIsConst      = is(const(Unqual!InChar) == InChar);
    enum outIsConst     = is(const(Unqual!OutChar) == OutChar);
    enum outIsImmutable = is(immutable(Unqual!OutChar) == OutChar);

    //const(C)[] -> const(C)* or
    //C[] -> C* or const(C)*
    static if (inIsConst ? outIsConst : !outIsImmutable)
    {
        if (!__ctfe)
        {
            auto pastEnd = (() @trusted => str.ptr + str.length)();

            // Reuse str when the element just past its end is already a
            // zero; addresses that are a multiple of 4 are never read.
            if ((cast(size_t) pastEnd & 3) && *pastEnd == '\0')
                return &str[0];
        }

        // Otherwise append the terminator to the local slice.
        str ~= '\0';
        return &str[0];
    }
    //const(C)[] -> C* or immutable(C)* or
    //C[] -> immutable(C)*
    else
    {
        import std.array : uninitializedArray;

        // Build a separate zero-terminated copy and cast its pointer to P.
        auto duplicate = uninitializedArray!(Unqual!OutChar[])(str.length + 1);
        duplicate[0 .. $ - 1] = str[];
        duplicate[$ - 1] = '\0';

        return (() @trusted => cast(P) duplicate.ptr)();
    }
}
3199 
private P toUTFzImpl(P, S)(S str) @safe pure
if (isSomeString!S && isPointer!P && isSomeChar!(typeof(*P.init)) &&
    !is(immutable typeof(*P.init) == immutable ElementEncodingType!S))
//C1[], const(C1)[], or immutable(C1)[] -> C2*, const(C2)*, or immutable(C2)*
{
    import std.array : appender;

    // The character types differ: collect every code point of str in an
    // appender of the target character type, then add the terminator.
    auto sink = appender!(typeof(*P.init)[])();

    foreach (dchar codePoint; str)
        sink.put(codePoint);
    sink.put('\0');

    auto trustedResult() @trusted { return cast(P) sink.data.ptr; }
    return trustedResult();
}
3214 
@safe pure unittest
{
    import core.exception : AssertError;
    import std.algorithm;
    import std.conv : to;
    import std.exception;
    import std.string : format;

    // Same character type for source and result: s1 comes straight from
    // `to!S`, s2 is an equal string whose following element was set to '\n'.
    assertCTFEable!(
    {
    foreach (S; AliasSeq!(string, wstring, dstring))
    {
        alias C = Unqual!(ElementEncodingType!S);

        auto s1 = to!S("hello\U00010143\u0100\U00010143");
        auto temp = new C[](s1.length + 1);
        temp[0 .. $ - 1] = s1[0 .. $];
        temp[$ - 1] = '\n';
        --temp.length;
        auto trustedAssumeUnique(T)(T t) @trusted { return assumeUnique(t); }
        auto s2 = trustedAssumeUnique(temp);
        assert(s1 == s2);

        // The result must hold the same code units followed by a zero.
        void trustedCStringAssert(P, S)(S s) @trusted
        {
            auto p = toUTFz!P(s);
            assert(p[0 .. s.length] == s);
            assert(p[s.length] == '\0');
        }

        foreach (P; AliasSeq!(C*, const(C)*, immutable(C)*))
        {
            trustedCStringAssert!P(s1);
            trustedCStringAssert!P(s2);
        }
    }
    });

    // Converts `s` to `P` and compares the zero-terminated result with `s`,
    // reporting a failure at the caller's line.
    static void test(P, S)(S s, size_t line = __LINE__) @trusted
    {
        // Number of elements before the first '\0'.
        static size_t zeroLen(C)(const(C)* ptr) @trusted
        {
            size_t len = 0;
            while (*ptr != '\0') { ++ptr; ++len; }
            return len;
        }

        auto p = toUTFz!P(s);
        immutable len = zeroLen(p);
        enforce(cmp(s, p[0 .. len]) == 0,
                new AssertError(format("Unit test failed: %s %s", P.stringof, S.stringof),
                                __FILE__, line));
    }

    // Source and result use different character types, plus every
    // mutable/const source array against every pointer type.
    assertCTFEable!(
    {
    foreach (P; AliasSeq!(wchar*, const(wchar)*, immutable(wchar)*,
                          dchar*, const(dchar)*, immutable(dchar)*))
    {
        test!P("hello\U00010143\u0100\U00010143");
    }
    foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
                          dchar*, const(dchar)*, immutable(dchar)*))
    {
        test!P("hello\U00010143\u0100\U00010143"w);
    }
    foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
                          wchar*, const(wchar)*, immutable(wchar)*))
    {
        test!P("hello\U00010143\u0100\U00010143"d);
    }
    foreach (S; AliasSeq!( char[], const( char)[],
                          wchar[], const(wchar)[],
                          dchar[], const(dchar)[]))
    {
        auto s = to!S("hello\U00010143\u0100\U00010143");

        foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
                              wchar*, const(wchar)*, immutable(wchar)*,
                              dchar*, const(dchar)*, immutable(dchar)*))
        {
            test!P(s);
        }
    }
    });
}
3301 
3302 
/++
    Convenience wrapper around `toUTFz!(const(wchar)*)`.

    Encodes `str` into UTF-16 and returns a pointer to the encoded,
    zero-terminated string. `toUTF16z` is suitable for calling the 'W'
    functions in the Win32 API that take an `LPCWSTR` argument.

    Params:
        str = the string to convert
    Returns:
        The result of `toUTFz!(const(wchar)*)(str)`
  +/
const(wchar)* toUTF16z(C)(const(C)[] str) @safe pure
if (isSomeChar!C)
{
    alias WidePtr = const(wchar)*;
    return toUTFz!WidePtr(str);
}
3315 
///
@system unittest
{
    string greeting = "Hello, World!";
    const(wchar)* wide = greeting.toUTF16z;

    // The terminator follows the converted text
    assert(wide[greeting.length] == '\0');
}
3323 
@safe pure unittest
{
    import std.conv : to;

    // toUTFz has thorough tests of its own; this only checks that toUTF16z
    // compiles and yields a pointer for each string type.
    foreach (S; AliasSeq!(string, wstring, dstring))
    {
        auto source = to!S("hello world");
        assert(toUTF16z(source) !is null);
    }
}
3332 
3333 
3334 /* ================================ tests ================================== */
3335 
@safe pure unittest
{
    import std.exception;

    // Each block converts the same text between all three encodings.
    assertCTFEable!(
    {
        // one code unit in UTF-8
        assert(toUTF8 ("hello"w) == "hello");
        assert(toUTF8 ("hello"d) == "hello");
        assert(toUTF16("hello"c) == "hello");
        assert(toUTF16("hello"d) == "hello");
        assert(toUTF32("hello"c) == "hello");
        assert(toUTF32("hello"w) == "hello");

        // contains \u1234
        assert(toUTF8 ("hel\u1234o"w) == "hel\u1234o");
        assert(toUTF8 ("hel\u1234o"d) == "hel\u1234o");
        assert(toUTF16("hel\u1234o"c) == "hel\u1234o");
        assert(toUTF16("hel\u1234o"d) == "hel\u1234o");
        assert(toUTF32("hel\u1234o"c) == "hel\u1234o");
        assert(toUTF32("hel\u1234o"w) == "hel\u1234o");

        // contains \U0010AAAA
        assert(toUTF8 ("he\U0010AAAAllo"w) == "he\U0010AAAAllo");
        assert(toUTF8 ("he\U0010AAAAllo"d) == "he\U0010AAAAllo");
        assert(toUTF16("he\U0010AAAAllo"c) == "he\U0010AAAAllo");
        assert(toUTF16("he\U0010AAAAllo"d) == "he\U0010AAAAllo");
        assert(toUTF32("he\U0010AAAAllo"c) == "he\U0010AAAAllo");
        assert(toUTF32("he\U0010AAAAllo"w) == "he\U0010AAAAllo");
    });
}
3364 
3365 
/++
    Returns the total number of code points encoded in `str`.

    Supersedes: This function supersedes $(LREF toUCSindex).

    Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

    Params:
        str = the string whose code points are counted

    Returns:
        The number of elements obtained by iterating `str` with
        $(LREF byDchar).

    Note:
        This function is `nothrow`, so it never throws a `UTFException`,
        not even when `str` is not well-formed. Call $(LREF validate) first
        if ill-formed input has to be rejected.
  +/
size_t count(C)(const(C)[] str) @safe pure nothrow @nogc
if (isSomeChar!C)
{
    return walkLength(str.byDchar);
}
3381 
///
@safe pure nothrow @nogc unittest
{
    // code points are counted, not code units
    assert(count("\u20AC100") == 4);

    assert(count("abc") == 3);
    assert(count("a") == 1);
    assert(count("") == 0);
}
3390 
@safe pure nothrow @nogc unittest
{
    import std.exception;

    // Same checks as the documented example, run through assertCTFEable.
    assertCTFEable!(
    {
        assert(count("\u20AC100") == 4);
        assert(count("abc") == 3);
        assert(count("a") == 1);
        assert(count("") == 0);
    });
}
3402 
3403 
// Code-unit ranges used by the unit tests. Each type stores a mutable copy
// of the string it is constructed from and exposes it one code unit at a time.
version (StdUnittest)
{
private:
    // Struct with only empty/front/popFront.
    struct InputCU(C)
    {
        import std.conv : to;

        @property bool empty() { return _str.length == 0; }
        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }

        this(inout(C)[] str)
        {
            _str = to!(C[])(str);
        }

        C[] _str;
    }

    // Struct adding back/popBack, save and length.
    struct BidirCU(C)
    {
        import std.conv : to;

        @property bool empty() { return _str.length == 0; }
        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }

        @property C back() { return _str[$ - 1]; }
        void popBack() { _str = _str[0 .. $ - 1]; }

        @property auto save() { return BidirCU(_str); }
        @property size_t length() { return _str.length; }

        this(inout(C)[] str)
        {
            _str = to!(C[])(str);
        }

        C[] _str;
    }

    // Struct adding indexing and slicing on top of the BidirCU primitives.
    struct RandomCU(C)
    {
        import std.conv : to;

        @property bool empty() { return _str.length == 0; }
        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }

        @property C back() { return _str[$ - 1]; }
        void popBack() { _str = _str[0 .. $ - 1]; }

        @property auto save() { return RandomCU(_str); }
        @property size_t length() { return _str.length; }

        C opIndex(size_t i) { return _str[i]; }
        auto opSlice(size_t i, size_t j) { return RandomCU(_str[i .. j]); }

        this(inout(C)[] str)
        {
            _str = to!(C[])(str);
        }

        C[] _str;
    }

    // Class counterpart of BidirCU; save() creates a new object.
    class RefBidirCU(C)
    {
        import std.conv : to;

        @property bool empty() { return _str.length == 0; }
        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }

        @property C back() { return _str[$ - 1]; }
        void popBack() { _str = _str[0 .. $ - 1]; }

        @property auto save() { return new RefBidirCU(_str); }
        @property size_t length() { return _str.length; }

        this(inout(C)[] str)
        {
            _str = to!(C[])(str);
        }

        C[] _str;
    }

    // Class counterpart of RandomCU; save() and opSlice create new objects.
    class RefRandomCU(C)
    {
        import std.conv : to;

        @property bool empty() { return _str.length == 0; }
        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }

        @property C back() { return _str[$ - 1]; }
        void popBack() { _str = _str[0 .. $ - 1]; }

        @property auto save() { return new RefRandomCU(_str); }
        @property size_t length() { return _str.length; }

        C opIndex(size_t i) { return _str[i]; }
        auto opSlice(size_t i, size_t j) { return new RefRandomCU(_str[i .. j]); }

        this(inout(C)[] str)
        {
            _str = to!(C[])(str);
        }

        C[] _str;
    }
}
3503 
3504 
3505 /**
3506  * The Unicode replacement character (U+FFFD), inserted in place of invalid UTF sequences.
3507  *
3508  * References:
3509  *      $(LINK http://en.wikipedia.org/wiki/Replacement_character#Replacement_character)
3510  */
3511 enum dchar replacementDchar = '\uFFFD';
3512 
3513 /********************************************
3514  * Iterate a range of char, wchar, or dchars by code unit.
3515  *
3516  * The purpose is to bypass the special case decoding that
3517  * $(REF front, std,range,primitives) does to character arrays. As a result,
3518  * using ranges with `byCodeUnit` can be `nothrow` while
3519  * $(REF front, std,range,primitives) throws when it encounters invalid Unicode
3520  * sequences.
3521  *
3522  * A code unit is a building block of the UTF encodings. Generally, an
3523  * individual code unit does not represent what's perceived as a full
3524  * character (a.k.a. a grapheme cluster in Unicode terminology). Many characters
3525  * are encoded with multiple code units. For example, the UTF-8 code units for
3526  * `ø` are `0xC3 0xB8`. That means, an individual element of `byCodeUnit`
3527  * often does not form a character on its own. Attempting to treat it as
3528  * one while iterating over the resulting range will give nonsensical results.
3529  *
3530  * Params:
3531  *      r = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
3532  *      of characters (including strings) or a type that implicitly converts to a string type.
3533  * Returns:
3534  *      If `r` is not an auto-decodable string (i.e. a narrow string or a
3535  *      user-defined type that implicitly converts to a string type), then `r`
3536  *      is returned.
3537  *
3538  *      Otherwise, `r` is converted to its corresponding string type (if it's
3539  *      not already a string) and wrapped in a random-access range where the
3540  *      element encoding type of the string (its code unit) is the element type
3541  *      of the range, and that range returned. The range has slicing.
3542  *
3543  *      If `r` is quirky enough to be a struct or class which is an input range
3544  *      of characters on its own (i.e. it has the input range API as member
3545  *      functions), $(I and) it's implicitly convertible to a string type, then
3546  *      `r` is returned, and no implicit conversion takes place.
3547  *
3548  *      If `r` is wrapped in a new range, then that range has a `source`
3549  *      property for returning the string that's currently contained within that
3550  *      range.
3551  *
3552  * See_Also:
3553  *      Refer to the $(MREF std, uni) docs for a reference on Unicode
3554  *      terminology.
3555  *
3556  *      For a range that iterates by grapheme cluster (written character) see
3557  *      $(REF byGrapheme, std,uni).
3558  */
auto byCodeUnit(R)(R r)
if ((isConvertibleToString!R && !isStaticArray!R) ||
    (isInputRange!R && isSomeChar!(ElementEncodingType!R)))
{
    import std.traits : StringTypeOf;

    // True when R itself declares any input range primitive as a member, i.e.
    // it is a range in its own right rather than only through an implicit
    // conversion to a string. (This would be cleaner if there were a way to
    // check whether a type is a range without any implicit conversions.)
    enum bool declaresRangeApi = __traits(hasMember, R, "empty") ||
                                 __traits(hasMember, R, "front") ||
                                 __traits(hasMember, R, "popFront");

    static if (isAutodecodableString!R && !declaresRangeApi)
    {
        // Auto-decodable string: wrap it so that the range primitives work on
        // code units instead of decoded code points.
        static struct ByCodeUnitImpl
        {
        @safe pure nothrow @nogc:

            /// The part of the string that has not been consumed yet.
            StringTypeOf!R source;

            @property size_t length() const { return source.length; }
            alias opDollar = length;

            @property bool empty() const
            {
                return source.length == 0;
            }

            @property auto ref front() inout
            {
                return source[0];
            }

            @property auto ref back() inout
            {
                return source[$ - 1];
            }

            void popFront()
            {
                source = source[1 .. $];
            }

            void popBack()
            {
                source = source[0 .. $-1];
            }

            @property auto save()
            {
                return ByCodeUnitImpl(source.save);
            }

            auto ref opIndex(size_t index) inout
            {
                return source[index];
            }

            auto opSlice(size_t lower, size_t upper)
            {
                return ByCodeUnitImpl(source[lower .. upper]);
            }
        }

        static assert(isRandomAccessRange!ByCodeUnitImpl);

        return ByCodeUnitImpl(r);
    }
    else static if (isInputRange!R && !(is(R : const dchar[]) && !declaresRangeApi))
    {
        // byCodeUnit for ranges and dchar[] is a no-op
        return r;
    }
    else
    {
        // Not a range by itself, or a type that merely converts to a dchar
        // array: hand back the underlying string type.
        return cast(StringTypeOf!R) r;
    }
}
3607 
///
@safe unittest
{
    import std.range.primitives;
    import std.traits : isAutodecodableString;

    // The wrapper returned for a narrow string is a random-access range of
    // code units with length and slicing.
    auto r = "Hello, World!".byCodeUnit();
    static assert(hasLength!(typeof(r)));
    static assert(hasSlicing!(typeof(r)));
    static assert(isRandomAccessRange!(typeof(r)));
    static assert(is(ElementType!(typeof(r)) == immutable char));

    // contrast with the range capabilities of standard strings (with or
    // without autodecoding enabled).
    auto s = "Hello, World!";
    // The plain string (not the wrapper) is what is being contrasted here; it
    // is a bidirectional range in both configurations.
    static assert(isBidirectionalRange!(typeof(s)));
    static if (isAutodecodableString!(typeof(s)))
    {
        // with autodecoding enabled, strings are non-random-access ranges of
        // dchar.
        static assert(is(ElementType!(typeof(s)) == dchar));
        static assert(!isRandomAccessRange!(typeof(s)));
        static assert(!hasSlicing!(typeof(s)));
        static assert(!hasLength!(typeof(s)));
    }
    else
    {
        // without autodecoding, strings are normal arrays.
        static assert(is(ElementType!(typeof(s)) == immutable char));
        static assert(isRandomAccessRange!(typeof(s)));
        static assert(hasSlicing!(typeof(s)));
        static assert(hasLength!(typeof(s)));
    }
}
3642 
/// `byCodeUnit` does no Unicode decoding
@safe unittest
{
    // noël using e + combining diaeresis
    string decomposed = "noe\u0308l";
    auto decomposedUnits = decomposed.byCodeUnit;
    assert(decomposedUnits[2] != 'ë');
    assert(decomposedUnits[2] == 'e');

    // noël using a precomposed ë character
    string precomposed = "no\u00EBl";
    auto precomposedUnits = precomposed.byCodeUnit;
    // Because string is UTF-8, the code unit at index 2 is just
    // the first of a sequence that encodes 'ë'
    assert(precomposedUnits[2] != 'ë');
}
3655 
/// `byCodeUnit` exposes a `source` property when wrapping narrow strings.
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range : popFrontN;
    import std.traits : isAutodecodableString;

    // Narrow string: the result tracks what is left of the string.
    {
        auto units = "hello world".byCodeUnit();
        popFrontN(units, 3);
        assert(units.save.equal("lo world"));
        static if (isAutodecodableString!string) // only enabled with autodecoding
        {
            string remaining = units.source;
            assert(remaining == "lo world");
        }
    }

    // source only exists if the range was wrapped
    {
        auto units = "hello world"d.byCodeUnit();
        static assert(!__traits(compiles, units.source));
    }
}
3678 
// Exercises byCodeUnit over strings, string-like structs, user-defined ranges
// and enums, checking both the resulting type and the first code unit.
@safe pure nothrow @nogc unittest
{
    import std.range;

    // Applying byCodeUnit twice yields every UTF-8 code unit unchanged.
    {
        enum text = "𐁄𐂌𐃯 hello ディラン";
        char[text.length] copied;
        int n;
        foreach (unit; text.byCodeUnit().byCodeUnit())
        {
            copied[n] = unit;
            ++n;
        }
        assert(copied == text);
    }
    // Same for UTF-16.
    {
        enum text = "𐁄𐂌𐃯 hello ディラン"w;
        wchar[text.length] copied;
        int n;
        foreach (unit; text.byCodeUnit().byCodeUnit())
        {
            copied[n] = unit;
            ++n;
        }
        assert(copied == text);
    }
    // Same for UTF-32.
    {
        enum text = "𐁄𐂌𐃯 hello ディラン"d;
        dchar[text.length] copied;
        int n;
        foreach (unit; text.byCodeUnit().byCodeUnit())
        {
            copied[n] = unit;
            ++n;
        }
        assert(copied == text);
    }
    // length, indexing and slicing
    {
        auto units = "hello".byCodeUnit();
        assert(units.length == 5);
        assert(units[3] == 'l');
        assert(units[2 .. 4][1] == 'l');
    }
    // front and opIndex are assignable for a mutable source
    {
        char[5] storage = "hello";
        auto units = storage[].byCodeUnit();
        units.front = 'H';
        assert(units.front == 'H');
        units[1] = 'E';
        assert(units[1] == 'E');
    }
    // save returns an independent copy
    {
        auto units = "hello".byCodeUnit().byCodeUnit();
        static assert(isForwardRange!(typeof(units)));
        static assert(is(typeof(units) == struct) == isAutodecodableString!string);
        auto saved = units.save;
        units.popFront();
        assert(saved.front == 'h');
    }
    // bidirectional iteration over a string
    {
        auto units = "hello".byCodeUnit();
        static assert(hasSlicing!(typeof(units)));
        static assert(isBidirectionalRange!(typeof(units)));
        static assert(is(typeof(units) == struct) == isAutodecodableString!string);
        static assert(is(typeof(units) == typeof(units.byCodeUnit())));
        auto reversed = units.retro;
        assert(reversed.front == 'o');
        reversed.popFront();
        assert(reversed.front == 'l');
    }
    // bidirectional iteration over a wstring
    {
        auto units = "κόσμε"w.byCodeUnit();
        static assert(hasSlicing!(typeof(units)));
        static assert(isBidirectionalRange!(typeof(units)));
        static assert(is(typeof(units) == struct) == isAutodecodableString!wstring);
        static assert(is(typeof(units) == typeof(units.byCodeUnit())));
        auto reversed = units.retro;
        assert(reversed.front == 'ε');
        reversed.popFront();
        assert(reversed.front == 'μ');
    }
    // struct that converts to string through a field alias this
    {
        static struct Stringish
        {
            string s;
            alias s this;
        }

        auto input = Stringish("\U0010fff8 𐁊 foo 𐂓");
        auto units = input.byCodeUnit();
        static assert(is(typeof(units) == struct));
        static assert(!is(typeof(units) == Stringish) == isAutodecodableString!Stringish);
        static assert(is(typeof(units) == typeof(units.byCodeUnit())));
        static assert(is(ElementType!(typeof(units)) == immutable char));
        assert(units.front == cast(char) 244);
    }
    // struct that converts to wstring through a field alias this
    {
        static struct WStringish
        {
            wstring s;
            alias s this;
        }

        auto input = WStringish("\U0010fff8 𐁊 foo 𐂓"w);
        auto units = input.byCodeUnit();
        static assert(is(typeof(units) == struct));
        static assert(!is(typeof(units) == WStringish) == isAutodecodableString!WStringish);
        static assert(is(typeof(units) == typeof(units.byCodeUnit())));
        static assert(is(ElementType!(typeof(units)) == immutable wchar));
        assert(units.front == cast(wchar) 56319);
    }
    // struct that converts to dstring: the result is the plain dstring
    {
        static struct DStringish
        {
            dstring s;
            alias s this;
        }

        auto input = DStringish("\U0010fff8 𐁊 foo 𐂓"d);
        auto units = input.byCodeUnit();
        static assert(is(typeof(units) == dstring));
        static assert(is(typeof(units) == typeof(units.byCodeUnit())));
        static assert(is(ElementType!(typeof(units)) == immutable dchar));
        assert(units.front == cast(dchar) 1114104);
    }
    // struct that converts to string through a member function alias this
    {
        static struct FuncStringish
        {
            string str;
            string s() pure nothrow @nogc { return str; }
            alias s this;
        }

        auto input = FuncStringish("\U0010fff8 𐁊 foo 𐂓");
        auto units = input.byCodeUnit();
        static if (isAutodecodableString!FuncStringish)
            static assert(is(typeof(units) == struct));
        else
            static assert(is(typeof(units) == string));
        static assert(!is(typeof(units) == FuncStringish));
        static assert(is(typeof(units) == typeof(units.byCodeUnit())));
        static assert(is(ElementType!(typeof(units)) == immutable char));
        assert(units.front == cast(char) 244);
    }
    // user-defined input range of char is returned as is
    {
        static struct Range
        {
            string data;
            bool empty() pure nothrow @nogc { return data.empty; }
            char front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }
        }

        auto input = Range("\U0010fff8 𐁊 foo 𐂓");
        auto units = input.byCodeUnit();
        static assert(is(typeof(units) == Range));
        static assert(is(typeof(units) == typeof(units.byCodeUnit())));
        static assert(is(ElementType!(typeof(units)) == char));
        assert(units.front == cast(char) 244);
    }
    // user-defined input range of wchar is returned as is
    {
        static struct WRange
        {
            wstring data;
            bool empty() pure nothrow @nogc { return data.empty; }
            wchar front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }
        }

        auto input = WRange("\U0010fff8 𐁊 foo 𐂓"w);
        auto units = input.byCodeUnit();
        static assert(is(typeof(units) == WRange));
        static assert(is(typeof(units) == typeof(units.byCodeUnit())));
        static assert(is(ElementType!(typeof(units)) == wchar));
        assert(units.front == 56319);
    }
    // user-defined input range of dchar is returned as is
    {
        static struct DRange
        {
            dstring data;
            bool empty() pure nothrow @nogc { return data.empty; }
            dchar front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }
        }

        auto input = DRange("\U0010fff8 𐁊 foo 𐂓"d);
        auto units = input.byCodeUnit();
        static assert(is(typeof(units) == DRange));
        static assert(is(typeof(units) == typeof(units.byCodeUnit())));
        static assert(is(ElementType!(typeof(units)) == dchar));
        assert(units.front == 1114104);
    }
    // range members win over the alias this conversion to string
    {
        static struct RangeAndStringish
        {
            bool empty() pure nothrow @nogc { return data.empty; }
            char front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }

            string data;
            string s;
            alias s this;
        }

        auto input = RangeAndStringish("test.d", "other");
        auto units = input.byCodeUnit();
        static assert(is(typeof(units) == RangeAndStringish));
        static assert(is(typeof(units) == typeof(units.byCodeUnit())));
        static assert(is(ElementType!(typeof(units)) == char));
        assert(units.front == 't');
    }
    // range members win over the alias this conversion to wstring
    {
        static struct WRangeAndStringish
        {
            bool empty() pure nothrow @nogc { return data.empty; }
            wchar front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }

            wstring data;
            wstring s;
            alias s this;
        }

        auto input = WRangeAndStringish("test.d"w, "other"w);
        auto units = input.byCodeUnit();
        static assert(is(typeof(units) == WRangeAndStringish));
        static assert(is(typeof(units) == typeof(units.byCodeUnit())));
        static assert(is(ElementType!(typeof(units)) == wchar));
        assert(units.front == 't');
    }
    // range members win over the alias this conversion to dstring
    {
        static struct DRangeAndStringish
        {
            bool empty() pure nothrow @nogc { return data.empty; }
            dchar front() pure nothrow @nogc { return data[0]; }
            void popFront() pure nothrow @nogc { data = data[1 .. $]; }

            dstring data;
            dstring s;
            alias s this;
        }

        auto input = DRangeAndStringish("test.d"d, "other"d);
        auto units = input.byCodeUnit();
        static assert(is(typeof(units) == DRangeAndStringish));
        static assert(is(typeof(units) == typeof(units.byCodeUnit())));
        static assert(is(ElementType!(typeof(units)) == dchar));
        assert(units.front == 't');
    }
    // enum with a string base type
    {
        enum Enum : string { a = "test.d" }

        auto input = Enum.a;
        auto units = input.byCodeUnit();
        static assert(!is(typeof(units) == Enum));
        static if (isAutodecodableString!Enum)
            static assert(is(typeof(units) == struct));
        else
            static assert(is(typeof(units) == string));
        static assert(is(ElementType!(typeof(units)) == immutable char));
        assert(units.front == 't');
    }
    // enum with a wstring base type
    {
        enum WEnum : wstring { a = "test.d"w }

        auto input = WEnum.a;
        auto units = input.byCodeUnit();
        static assert(!is(typeof(units) == WEnum));
        static if (isAutodecodableString!WEnum)
            static assert(is(typeof(units) == struct));
        else
            static assert(is(typeof(units) == wstring));
        static assert(is(ElementType!(typeof(units)) == immutable wchar));
        assert(units.front == 't');
    }
    // enum with a dstring base type
    {
        enum DEnum : dstring { a = "test.d"d }

        auto input = DEnum.a;
        auto units = input.byCodeUnit();
        static assert(is(typeof(units) == dstring));
        static assert(is(ElementType!(typeof(units)) == immutable dchar));
        assert(units.front == 't');
    }

    // Narrow strings are wrapped only when autodecoding is enabled; dstrings
    // are never wrapped.
    static if (autodecodeStrings)
    {
        static assert(!is(typeof(byCodeUnit("hello")) == string));
        static assert(!is(typeof(byCodeUnit("hello"w)) == wstring));
    }
    else
    {
        static assert(is(typeof(byCodeUnit("hello")) == string));
        static assert(is(typeof(byCodeUnit("hello"w)) == wstring));
    }
    static assert(is(typeof(byCodeUnit("hello"d)) == dstring));

    // Static arrays are rejected ...
    static assert(!__traits(compiles, byCodeUnit((char[5]).init)));
    static assert(!__traits(compiles, byCodeUnit((wchar[5]).init)));
    static assert(!__traits(compiles, byCodeUnit((dchar[5]).init)));

    enum SEnum : char[5] { a = "hello" }
    enum WSEnum : wchar[5] { a = "hello"w }
    enum DSEnum : dchar[5] { a = "hello"d }

    // ... and so are enums whose base type is a static array.
    static assert(!__traits(compiles, byCodeUnit(SEnum.a)));
    static assert(!__traits(compiles, byCodeUnit(WSEnum.a)));
    static assert(!__traits(compiles, byCodeUnit(DSEnum.a)));
}
3984 
3985 /****************************
3986  * Iterate an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
3987  * of characters by char, wchar, or dchar.
3988  * These aliases simply forward to $(LREF byUTF) with the
3989  * corresponding C argument. No `useReplacementDchar` argument is passed, so `byUTF`'s default (`Yes.useReplacementDchar`) applies.
3990  *
3991  * Params:
3992  *      r = input range of characters, or array of characters
3993  */
3994 alias byChar = byUTF!char;
3995 
3996 /// Ditto
3997 alias byWchar = byUTF!wchar;
3998 
3999 /// Ditto
4000 alias byDchar = byUTF!dchar;
4001 
// byChar: re-encoding to UTF-8 from UTF-8, UTF-16 and UTF-32 sources.
@safe pure nothrow @nogc unittest
{
    // byChar applied twice to a string yields the same code units
    {
        char[5] collected;
        int n;
        foreach (unit; "hello".byChar.byChar())
        {
            collected[n] = unit;
            ++n;
        }
        assert(collected == "hello");
    }
    // dchar input with 1-4 byte encodings; the two invalid code points at the
    // end each come out as U+FFFD
    {
        char[5+2+3+4+3+3] collected;
        int n;
        dchar[10] input;
        input[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d;
        input[8] = 0xD800;   // invalid
        input[9] = cast(dchar) 0x110000; // invalid
        foreach (unit; input[].byChar())
        {
            collected[n] = unit;
            ++n;
        }
        assert(collected == "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD");
    }
    // wstring source
    {
        auto units = "hello"w.byChar();
        units.popFront();
        units.popFront();
        assert(units.front == 'l');
    }
    // dstring source
    {
        auto units = "hello"d.byChar();
        units.popFront();
        units.popFront();
        assert(units.front == 'l');
    }
    // save yields an independent copy
    {
        auto units = "hello"d.byChar();
        assert(isForwardRange!(typeof(units)));
        auto saved = units.save;
        units.popFront();
        assert(saved.front == 'h');
    }
}
4048 
// byWchar: re-encoding to UTF-16 from UTF-32 and UTF-8 sources.
@safe pure nothrow @nogc unittest
{
  {
    // 5 ASCII + 2 BMP code points (1 unit each) + U+10FFFF (surrogate pair)
    // + 2 replacement characters = 11 UTF-16 code units
    wchar[11] s;
    int i;
    dchar[10] a;
    a[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d;
    a[8] = 0xD800;   // invalid
    a[9] = cast(dchar) 0x110000; // invalid
    foreach (c; a[].byWchar())
    {
        s[i++] = c;
    }
    // The leftover debug loop that only iterated the expected string (its body
    // was a commented-out writefln) has been removed; it checked nothing.
    assert(s == "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD"w);
  }

  // string source
  {
    auto r = "hello".byWchar();
    r.popFront();
    r.popFront();
    assert(r.front == 'l');
  }
  // dstring source
  {
    auto r = "hello"d.byWchar();
    r.popFront();
    r.popFront();
    assert(r.front == 'l');
  }
  // save yields an independent copy
  {
    auto r = "hello"d.byWchar();
    assert(isForwardRange!(typeof(r)));
    auto s = r.save;
    r.popFront();
    assert(s.front == 'h');
  }
}
4090 
// byDchar: decoding to UTF-32 from UTF-8, UTF-16 and UTF-32 sources.
@safe pure nothrow @nogc unittest
{
    // ---- UTF-8 source ----
    {
        dchar[9] decoded;
        int n;
        string input = "hello\u07FF\uD7FF\U00010000\U0010FFFF"; // 1,2,3,4 byte sequences
        foreach (cp; input.byDchar())
        {
            decoded[n] = cp;
            ++n;
        }
        assert(decoded == "hello\u07FF\uD7FF\U00010000\U0010FFFF"d);
    }
    // invalid UTF-8 decodes to the replacement character, repeatably
    {
        foreach (bad; invalidUTFstrings!char())
        {
            auto points = bad.byDchar();
            assert(!points.empty);
            assert(points.front == points.front);
            dchar first = points.front;
            assert(first == replacementDchar);
        }
    }
    {
        auto points = "hello".byDchar();
        points.popFront();
        points.popFront();
        assert(points.front == 'l');
    }

    // ---- UTF-16 source ----
    {
        dchar[8] decoded;
        int n;
        wstring input = "hello\u07FF\uD7FF\U0010FFFF"w;
        foreach (cp; input.byDchar())
        {
            decoded[n] = cp;
            ++n;
        }
        assert(decoded == "hello\u07FF\uD7FF\U0010FFFF"d);
    }
    // invalid UTF-16 decodes to the replacement character, repeatably
    {
        foreach (bad; invalidUTFstrings!wchar())
        {
            auto points = bad.byDchar();
            assert(!points.empty);
            assert(points.front == points.front);
            dchar first = points.front;
            assert(first == replacementDchar);
        }
    }
    {
        wchar[2] pair;
        pair[0] = 0xD800;
        pair[1] = 0xDD00;             // correct surrogate pair
        auto points = pair[].byDchar();
        assert(!points.empty);
        assert(points.front == points.front);
        dchar first = points.front;
        assert(first == '\U00010100');
    }
    {
        auto points = "hello"w.byDchar();
        points.popFront();
        points.popFront();
        assert(points.front == 'l');
    }

    // ---- UTF-32 source ----
    {
        dchar[5] decoded;
        int n;
        dstring input = "hello"d;
        foreach (cp; input.byDchar.byDchar())
        {
            decoded[n] = cp;
            ++n;
        }
        assert(decoded == "hello"d);
    }

    // ---- save yields an independent copy ----
    {
        auto points = "hello".byDchar();
        assert(isForwardRange!(typeof(points)));
        auto saved = points.save;
        points.popFront();
        assert(saved.front == 'h');
    }
    {
        auto points = "hello"w.byDchar();
        assert(isForwardRange!(typeof(points)));
        auto saved = points.save;
        points.popFront();
        assert(saved.front == 'h');
    }
}
4184 
// byChar/byWchar/byDchar must work both for ranges that are pure, @safe,
// nothrow and @nogc and for ranges that are not. This test covers the former:
// it only compiles if all three keep those attributes for a dchar[] source.

pure @safe nothrow @nogc unittest
{
    dchar[5] hello = "hello"d;
    auto source = hello[];

    foreach (unit; source.byChar())
    {
    }
    foreach (unit; source.byWchar())
    {
    }
    foreach (unit; source.byDchar())
    {
    }
}
4195 
// Mutable module-level state, written by the test range below so that its
// popFront cannot be pure.
version (StdUnittest)
private int impureVariable;

// The counterpart of the attribute test: byChar/byWchar/byDchar must also
// accept a range whose popFront is impure and throws.
@system unittest
{
    static struct ImpureThrowingSystemRange(Char)
    {
        // Always empty, so the loops below never execute popFront.
        @property bool empty() const
        {
            return true;
        }

        @property Char front() const
        {
            return Char.init;
        }

        void popFront()
        {
            ++impureVariable;
            throw new Exception("only for testing nothrow");
        }
    }

    foreach (Char; AliasSeq!(char, wchar, dchar))
    {
        ImpureThrowingSystemRange!Char source;

        foreach (unit; source.byChar())
        {
        }
        foreach (unit; source.byWchar())
        {
        }
        foreach (unit; source.byDchar())
        {
        }
    }
}
4220 
4221 /****************************
4222  * Iterate an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
4223  * of characters by char type `C` by encoding the elements of the range.
4224  *
4225  * UTF sequences that cannot be converted to the specified encoding are either
4226  * replaced by U+FFFD per "5.22 Best Practice for U+FFFD Substitution"
4227  * of the Unicode Standard 6.2 or result in a thrown UTFException.
4228  *  Hence byUTF is not symmetric.
4229  * This algorithm is lazy, and does not allocate memory.
4230  * `@nogc`, `pure`-ity, `nothrow`, and `@safe`-ty are inferred from the
4231  * `r` parameter.
4232  *
4233  * Params:
4234  *      C = `char`, `wchar`, or `dchar`
4235  *      useReplacementDchar = UseReplacementDchar.yes means replace invalid UTF with `replacementDchar`,
4236  *                            UseReplacementDchar.no means throw `UTFException` for invalid UTF
4237  *
4238  * Throws:
4239  *      `UTFException` if invalid UTF sequence and `useReplacementDchar` is set to `UseReplacementDchar.no`
4240  *
4241  * GC:
4242  *      Does not use GC if `useReplacementDchar` is set to `UseReplacementDchar.yes`
4243  *
4244  * Returns:
4245  *      A forward range if `R` is a range and not auto-decodable, as defined by
4246  *      $(REF isAutodecodableString, std, traits), and if the base range is
4247  *      also a forward range.
4248  *
4249  *      Or, if `R` is a range and it is auto-decodable and
4250  *      `is(ElementEncodingType!typeof(r) == C)`, then the range is passed
4251  *      to $(LREF byCodeUnit).
4252  *
4253  *      Otherwise, an input range of characters.
4254  */
template byUTF(C, UseReplacementDchar useReplacementDchar = Yes.useReplacementDchar)
if (isSomeChar!C)
{
    // A qualified character type (e.g. `const char`) shares the implementation
    // of its unqualified type. `useReplacementDchar` has to be forwarded
    // explicitly: without it the redirect falls back to the default
    // (`Yes.useReplacementDchar`), so a caller asking for
    // `No.useReplacementDchar` would silently get replacement instead of a
    // thrown `UTFException`.
    static if (!is(Unqual!C == C))
        alias byUTF = byUTF!(Unqual!C, useReplacementDchar);
    else:

    // Auto-decodable strings are first wrapped with byCodeUnit so that the
    // overload below sees a range of code units.
    auto ref byUTF(R)(R r)
        if (isAutodecodableString!R && isInputRange!R && isSomeChar!(ElementEncodingType!R))
    {
        return byUTF(r.byCodeUnit());
    }

    auto ref byUTF(R)(R r)
        if (!isAutodecodableString!R && isInputRange!R && isSomeChar!(ElementEncodingType!R))
    {
        // Code unit type of the source, without qualifiers.
        alias RC = Unqual!(ElementEncodingType!R);

        static if (is(RC == C))
        {
            // Source and target encodings agree: nothing to transcode.
            return r.byCodeUnit();
        }
        else static if (is(C == dchar))
        {
            // Target is UTF-32: decode one code point at a time.
            static struct Result
            {
                enum Empty = uint.max;  // range is empty or just constructed

                this(return R r)
                {
                    this.r = r;
                }

                this(return R r, uint buff)
                {
                    this.r = r;
                    this.buff = buff;
                }


                @property bool empty()
                {
                    return buff == Empty && r.empty;
                }

                @property dchar front() scope // 'scope' required by call to decodeFront() below
                {
                    if (buff == Empty)
                    {
                        auto c = r.front;

                        // Code units below `firstMulti` are complete code
                        // points on their own and need no decoding.
                        static if (is(RC == wchar))
                            enum firstMulti = 0xD800; // First high surrogate.
                        else
                            enum firstMulti = 0x80; // First non-ASCII.
                        if (c < firstMulti)
                        {
                            r.popFront;
                            buff = cast(dchar) c;
                        }
                        else
                        {
                            // decodeFront consumes the whole sequence from `r`.
                            buff = () @trusted { return decodeFront!(useReplacementDchar)(r); }();
                        }
                    }
                    return cast(dchar) buff;
                }

                void popFront()
                {
                    // Nothing buffered yet: decode first so that the
                    // underlying range is advanced past the code point.
                    if (buff == Empty)
                        front();
                    buff = Empty;
                }

                static if (isForwardRange!R)
                {
                    @property auto save()
                    {
                        return Result(r.save, buff);
                    }
                }

            private:

                R r;
                uint buff = Empty;      // one character lookahead buffer
            }

            return Result(r);
        }
        else
        {
            // Target is UTF-8 or UTF-16: each source code point is encoded
            // into `buf`, which is then handed out one code unit at a time.
            static struct Result
            {
                this(return R r)
                {
                    this.r = r;
                }

                this(return R r, ushort pos, ushort fill, C[4 / C.sizeof] buf)
                {
                    this.r = r;
                    this.pos = pos;
                    this.fill = fill;
                    this.buf = buf;
                }

                @property bool empty()
                {
                    return pos == fill && r.empty;
                }

                @property auto front() scope // 'scope' required by call to decodeFront() below
                {
                    // `pos == fill` means every buffered code unit has been
                    // consumed, so the buffer has to be refilled.
                    if (pos == fill)
                    {
                        pos = 0;
                        auto c = r.front;

                        // Code units below `firstMulti` can be copied to the
                        // target encoding unchanged.
                        static if (C.sizeof >= 2 && RC.sizeof >= 2)
                            enum firstMulti = 0xD800; // First high surrogate.
                        else
                            enum firstMulti = 0x80; // First non-ASCII.
                        if (c < firstMulti)
                        {
                            fill = 1;
                            r.popFront;
                            buf[pos] = cast(C) c;
                        }
                        else
                        {
                            static if (is(RC == dchar))
                            {
                                // A dchar source needs no decoding step.
                                r.popFront;
                                dchar dc = c;
                            }
                            else
                                dchar dc = () @trusted { return decodeFront!(useReplacementDchar)(r); }();
                            fill = cast(ushort) encode!(useReplacementDchar)(buf, dc);
                        }
                    }
                    return buf[pos];
                }

                void popFront()
                {
                    // Refill first if the buffer is exhausted, then step past
                    // the current code unit.
                    if (pos == fill)
                        front;
                    ++pos;
                }

                static if (isForwardRange!R)
                {
                    @property auto save()
                    {
                        return Result(r.save, pos, fill, buf);
                    }
                }

            private:

                R r;
                ushort pos, fill;       // next unit to hand out / number of valid units in buf
                C[4 / C.sizeof] buf = void;
            }

            return Result(r);
        }
    }
}
4426 
4427 ///
@safe pure nothrow unittest
{
    import std.algorithm.comparison : equal;

    // "hellö" ends in U+00F6
    auto hello = "hell\u00F6";

    // iterated as `char` (UTF-8), the ö is spread over two code units
    assert(equal(hello.byUTF!char, ['h', 'e', 'l', 'l', 0xC3, 0xB6]));

    // iterated as `wchar` (UTF-16), the ö fits in a single code unit
    assert(equal(hello.byUTF!wchar, ['h', 'e', 'l', 'l', 'ö']));

    // 𐐷 (U+10437) takes four code units in UTF-8, two in UTF-16 and one in UTF-32
    auto deseret = "𐐷";
    assert(equal(deseret.byUTF!char, [0xF0, 0x90, 0x90, 0xB7]));
    assert(equal(deseret.byUTF!wchar, [0xD801, 0xDC37]));
    assert(equal(deseret.byUTF!dchar, [0x00010437]));
}
4443 
4444 ///
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.exception : assertThrown;

    // \xF0 is not followed by the continuation bytes it calls for
    string malformed = "hello\xF0betty";

    // UseReplacementDchar.yes: the broken sequence comes out as U+FFFD
    // (note that the expected text has no 'b' after the replacement character)
    assert(equal(malformed.byChar.byUTF!(dchar, UseReplacementDchar.yes), "hello\uFFFDetty"));

    // UseReplacementDchar.no: iterating over the same input throws instead
    assertThrown!UTFException(equal(malformed.byChar.byUTF!(dchar, UseReplacementDchar.no), "hello betty"));
}