std.uni source code

1 // Written in the D programming language.
2 
3 /++
4     $(P The `std.uni` module provides an implementation
5     of fundamental Unicode algorithms and data structures.
6     This doesn't include UTF encoding and decoding primitives,
7     see $(REF decode, std,_utf) and $(REF encode, std,_utf) in $(MREF std, utf)
8     for this functionality. )
9 
10 $(SCRIPT inhibitQuickIndex = 1;)
11 $(DIVC quickindex,
12 $(BOOKTABLE,
13 $(TR $(TH Category) $(TH Functions))
14 $(TR $(TD Decode) $(TD
15     $(LREF byCodePoint)
16     $(LREF byGrapheme)
17     $(LREF decodeGrapheme)
18     $(LREF graphemeStride)
19 ))
20 $(TR $(TD Comparison) $(TD
21     $(LREF icmp)
22     $(LREF sicmp)
23 ))
24 $(TR $(TD Classification) $(TD
25     $(LREF isAlpha)
26     $(LREF isAlphaNum)
27     $(LREF isCodepointSet)
28     $(LREF isControl)
29     $(LREF isFormat)
30     $(LREF isGraphical)
31     $(LREF isIntegralPair)
32     $(LREF isMark)
33     $(LREF isNonCharacter)
34     $(LREF isNumber)
35     $(LREF isPrivateUse)
36     $(LREF isPunctuation)
37     $(LREF isSpace)
38     $(LREF isSurrogate)
39     $(LREF isSurrogateHi)
40     $(LREF isSurrogateLo)
41     $(LREF isSymbol)
42     $(LREF isWhite)
43 ))
44 $(TR $(TD Normalization) $(TD
45     $(LREF NFC)
46     $(LREF NFD)
47     $(LREF NFKD)
48     $(LREF NormalizationForm)
49     $(LREF normalize)
50 ))
51 $(TR $(TD Decompose) $(TD
52     $(LREF decompose)
53     $(LREF decomposeHangul)
54     $(LREF UnicodeDecomposition)
55 ))
56 $(TR $(TD Compose) $(TD
57     $(LREF compose)
58     $(LREF composeJamo)
59 ))
60 $(TR $(TD Sets) $(TD
61     $(LREF CodepointInterval)
62     $(LREF CodepointSet)
63     $(LREF InversionList)
64     $(LREF unicode)
65 ))
66 $(TR $(TD Trie) $(TD
67     $(LREF codepointSetTrie)
68     $(LREF CodepointSetTrie)
69     $(LREF codepointTrie)
70     $(LREF CodepointTrie)
71     $(LREF toTrie)
72     $(LREF toDelegate)
73 ))
74 $(TR $(TD Casing) $(TD
75     $(LREF asCapitalized)
76     $(LREF asLowerCase)
77     $(LREF asUpperCase)
78     $(LREF isLower)
79     $(LREF isUpper)
80     $(LREF toLower)
81     $(LREF toLowerInPlace)
82     $(LREF toUpper)
83     $(LREF toUpperInPlace)
84 ))
85 $(TR $(TD Utf8Matcher) $(TD
86     $(LREF isUtfMatcher)
87     $(LREF MatcherConcept)
88     $(LREF utfMatcher)
89 ))
90 $(TR $(TD Separators) $(TD
91     $(LREF lineSep)
92     $(LREF nelSep)
93     $(LREF paraSep)
94 ))
95 $(TR $(TD Building blocks) $(TD
96     $(LREF allowedIn)
97     $(LREF combiningClass)
98     $(LREF Grapheme)
99 ))
100 ))
101 
102     $(P All primitives listed operate on Unicode characters and
103         sets of characters. For functions which operate on ASCII characters
104         and ignore Unicode $(CHARACTERS), see $(MREF std, ascii).
105         For definitions of Unicode $(CHARACTER), $(CODEPOINT) and other terms
106         used throughout this module see the $(S_LINK Terminology, terminology) section
107         below.
108     )
109     $(P The focus of this module is the core needs of developing Unicode-aware
110         applications. To that effect it provides the following optimized primitives:
111     )
112     $(UL
113         $(LI Character classification by category and common properties:
114             $(LREF isAlpha), $(LREF isWhite) and others.
115         )
116         $(LI
117             Case-insensitive string comparison ($(LREF sicmp), $(LREF icmp)).
118         )
119         $(LI
120             Converting text to any of the four normalization forms via $(LREF normalize).
121         )
122         $(LI
123             Decoding ($(LREF decodeGrapheme))  and iteration ($(LREF byGrapheme), $(LREF graphemeStride))
124             by user-perceived characters, that is by $(LREF Grapheme) clusters.
125         )
126         $(LI
127             Decomposing and composing of individual character(s) according to canonical
128             or compatibility rules, see $(LREF compose) and $(LREF decompose),
129             including the specific version for Hangul syllables $(LREF composeJamo)
130             and $(LREF decomposeHangul).
131         )
132     )
133     $(P It's recognized that an application may need further enhancements
134         and extensions, such as less commonly known algorithms,
135         or tailoring existing ones for region specific needs. To help users
136         with building any extra functionality beyond the core primitives,
137         the module provides:
138     )
139     $(UL
140         $(LI
141             $(LREF CodepointSet), a type for easy manipulation of sets of characters.
142             Besides the typical set algebra it provides an unusual feature:
143             a D source code generator for detection of $(CODEPOINTS) in this set.
144             This is a boon for meta-programming parser frameworks,
145             and is used internally to power classification in small
146             sets like $(LREF isWhite).
147         )
148         $(LI
149             A way to construct optimal packed multi-stage tables also known as a
150             special case of $(LINK2 https://en.wikipedia.org/wiki/Trie, Trie).
151             The functions $(LREF codepointTrie), $(LREF codepointSetTrie)
152             construct custom tries that map dchar to value.
153             The end result is a fast and predictable $(BIGOH 1) lookup that powers
154             functions like $(LREF isAlpha) and $(LREF combiningClass),
155             but for user-defined data sets.
156         )
157         $(LI
158             A useful technique for Unicode-aware parsers that perform
159             character classification of encoded $(CODEPOINTS)
160             is to avoid unnecessary decoding at all costs.
161             $(LREF utfMatcher) provides an improvement over the usual workflow
162             of decode-classify-process, combining the decoding and classification
163             steps. By extracting necessary bits directly from encoded
164             $(S_LINK Code unit, code units) matchers achieve
165             significant performance improvements. See $(LREF MatcherConcept) for
166             the common interface of UTF matchers.
167         )
168         $(LI
169             Generally useful building blocks for customized normalization:
170             $(LREF combiningClass) for querying combining class
171             and $(LREF allowedIn) for testing the Quick_Check
172             property of a given normalization form.
173         )
174         $(LI
175             Access to a large selection of commonly used sets of $(CODEPOINTS).
176             $(S_LINK Unicode properties, Supported sets) include Script,
177             Block and General Category. The exact contents of a set can be
178             observed in the CLDR utility, on the
179             $(HTTP www.unicode.org/cldr/utility/properties.jsp, property index) page
180             of the Unicode website.
181             See $(LREF unicode) for easy and (optionally) compile-time checked set
182             queries.
183         )
184     )
185     $(SECTION Synopsis)
186     ---
187     import std.uni;
188     void main()
189     {
190         // initialize code point sets using script/block or property name
191         // now 'set' contains code points from both scripts.
192         auto set = unicode("Cyrillic") | unicode("Armenian");
193         // same thing but simpler and checked at compile-time
194         auto ascii = unicode.ASCII;
195         auto currency = unicode.Currency_Symbol;
196 
197         // easy set ops
198         auto a = set & ascii;
199         assert(a.empty); // as it has no intersection with ascii
200         a = set | ascii;
201         auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian
202 
203         // some properties of code point sets
204         assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2
205         // testing presence of a code point in a set
206         // is just fine, it is O(logN)
207         assert(!b['$']);
208         assert(!b['\u058F']); // Armenian dram sign
209         assert(b['¥']);
210 
211         // building fast lookup tables, these guarantee O(1) complexity
212         // 1-level Trie lookup table essentially a huge bit-set ~262Kb
213         auto oneTrie = toTrie!1(b);
214         // 2-level far more compact but typically slightly slower
215         auto twoTrie = toTrie!2(b);
216         // 3-level even smaller, and a bit slower yet
217         auto threeTrie = toTrie!3(b);
218         assert(oneTrie['£']);
219         assert(twoTrie['£']);
220         assert(threeTrie['£']);
221 
222         // build the trie with the most sensible trie level
223         // and bind it as a functor
224         auto cyrillicOrArmenian = toDelegate(set);
225         auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!");
226         assert(balance == "ընկեր!");
227         // compatible with bool delegate(dchar)
228         bool delegate(dchar) bindIt = cyrillicOrArmenian;
229 
230         // Normalization
231         string s = "Plain ascii (and not only), is always normalized!";
232         assert(s is normalize(s));// is the same string
233 
234         string nonS = "A\u0308ffin"; // A followed by a combining diaeresis
235         auto nS = normalize(nonS); // to NFC, the W3C endorsed standard
236         assert(nS == "Äffin");
237         assert(nS != nonS);
238         string composed = "Äffin";
239 
240         assert(normalize!NFD(composed) == "A\u0308ffin");
241         // to NFKD, compatibility decomposition useful for fuzzy matching/searching
242         assert(normalize!NFKD("2¹⁰") == "210");
243     }
244     ---
245     $(SECTION Terminology)
246     $(P The following is a list of important Unicode notions
247     and definitions. Any conventions used specifically in this
248     module alone are marked as such. The descriptions are based on the formal
249     definition as found in $(HTTP www.unicode.org/versions/Unicode6.2.0/ch03.pdf,
250     chapter three of The Unicode Standard Core Specification.)
251     )
252     $(P $(DEF Abstract character) A unit of information used for the organization,
253         control, or representation of textual data.
254         Note that:
255         $(UL
256             $(LI When representing data, the nature of that data
257                 is generally symbolic as opposed to some other
258                 kind of data (for example, visual).
259             )
260              $(LI An abstract character has no concrete form
261                 and should not be confused with a $(S_LINK Glyph, glyph).
262             )
263             $(LI An abstract character does not necessarily
264                 correspond to what a user thinks of as a “character”
265                 and should not be confused with a $(LREF Grapheme).
266             )
267             $(LI The abstract characters encoded (see Encoded character)
268                 are known as Unicode abstract characters.
269             )
270             $(LI Abstract characters not directly
271                 encoded by the Unicode Standard can often be
272                 represented by the use of combining character sequences.
273             )
274         )
275     )
276     $(P $(DEF Canonical decomposition)
277         The decomposition of a character or character sequence
278         that results from recursively applying the canonical
279         mappings found in the Unicode Character Database
280         and those described in Conjoining Jamo Behavior
281         (section 12 of
282         $(HTTP www.unicode.org/uni2book/ch03.pdf, Unicode Conformance)).
283     )
284     $(P $(DEF Canonical composition)
285         The precise definition of the Canonical composition
286         is the algorithm as specified in $(HTTP www.unicode.org/uni2book/ch03.pdf,
287         Unicode Conformance) section 11.
288         Informally it's the process that does the reverse of the canonical
289         decomposition with the addition of certain rules
290         that e.g. prevent legacy characters from appearing in the composed result.
291     )
292     $(P $(DEF Canonical equivalent)
293         Two character sequences are said to be canonical equivalents if
294         their full canonical decompositions are identical.
295     )
296     $(P $(DEF Character) Typically differs by context.
297         For the purpose of this documentation the term $(I character)
298         implies $(I encoded character), that is, a code point having
299         an assigned abstract character (a symbolic meaning).
300     )
301     $(P $(DEF Code point) Any value in the Unicode codespace;
302         that is, the range of integers from 0 to 10FFFF (hex).
303         Not all code points are assigned to encoded characters.
304     )
305     $(P $(DEF Code unit) The minimal bit combination that can represent
306         a unit of encoded text for processing or interchange.
307         Depending on the encoding this could be:
308         8-bit code units in the UTF-8 (`char`),
309         16-bit code units in the UTF-16 (`wchar`),
310         and 32-bit code units in the UTF-32 (`dchar`).
311         $(I Note that in UTF-32, a code unit is a code point
312         and is represented by the D `dchar` type.)
313     )
314     $(P $(DEF Combining character) A character with the General Category
315         of Combining Mark(M).
316         $(UL
317             $(LI All characters with non-zero canonical combining class
318             are combining characters, but the reverse is not the case:
319             there are combining characters with a zero combining class.
320             )
321             $(LI These characters are not normally used in isolation
322             unless they are being described. They include such characters
323             as accents, diacritics, Hebrew points, Arabic vowel signs,
324             and Indic matras.
325             )
326         )
327     )
328     $(P $(DEF Combining class)
329         A numerical value used by the Unicode Canonical Ordering Algorithm
330         to determine which sequences of combining marks are to be
331         considered canonically equivalent and  which are not.
332     )
333     $(P $(DEF Compatibility decomposition)
334         The decomposition of a character or character sequence that results
335         from recursively applying both the compatibility mappings and
336         the canonical mappings found in the Unicode Character Database, and those
337         described in Conjoining Jamo Behavior, until no characters
338         can be further decomposed.
339     )
340     $(P $(DEF Compatibility equivalent)
341         Two character sequences are said to be compatibility
342         equivalents if their full compatibility decompositions are identical.
343     )
344     $(P $(DEF Encoded character) An association (or mapping)
345         between an abstract character and a code point.
346     )
347     $(P $(DEF Glyph) The actual, concrete image of a glyph representation
348         having been rasterized or otherwise imaged onto some display surface.
349     )
350     $(P $(DEF Grapheme base) A character with the property
351         Grapheme_Base, or any standard Korean syllable block.
352     )
353     $(P $(DEF Grapheme cluster) Defined as the text between
354         grapheme boundaries  as specified by Unicode Standard Annex #29,
355         $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation).
356         Important general properties of a grapheme:
357         $(UL
358             $(LI The grapheme cluster represents a horizontally segmentable
359             unit of text, consisting of some grapheme base (which may
360             consist of a Korean syllable) together with any number of
361             nonspacing marks applied to it.
362             )
363             $(LI  A grapheme cluster typically starts with a grapheme base
364             and then extends across any subsequent sequence of nonspacing marks.
365             A grapheme cluster is most directly relevant to text rendering and
366             processes such as cursor placement and text selection in editing,
367             but may also be relevant to comparison and searching.
368             )
369             $(LI For many processes, a grapheme cluster behaves as if it was a
370             single character with the same properties as its grapheme base.
371             Effectively, nonspacing marks apply $(I graphically) to the base,
372             but do not change its properties.
373             )
374         )
375         $(P This module defines a number of primitives that work with graphemes:
376         $(LREF Grapheme), $(LREF decodeGrapheme) and $(LREF graphemeStride).
377         All of them are using $(I extended grapheme) boundaries
378         as defined in the aforementioned standard annex.
379         )
380     )
381     $(P $(DEF Nonspacing mark) A combining character with the
382         General Category of Nonspacing Mark (Mn) or Enclosing Mark (Me).
383     )
384     $(P $(DEF Spacing mark) A combining character that is not a nonspacing mark.
385     )
386     $(SECTION Normalization)
387     $(P The concepts of $(S_LINK Canonical equivalent, canonical equivalent)
388         or $(S_LINK Compatibility equivalent, compatibility equivalent)
389         characters in the Unicode Standard make it necessary to have a full, formal
390         definition of equivalence for Unicode strings.
391         String equivalence is determined by a process called normalization,
392         whereby strings are converted into forms which are compared
393         directly for identity. This is the primary goal of the normalization process,
394         see the function $(LREF normalize) to convert into any of
395         the four defined forms.
396     )
397     $(P A very important attribute of the Unicode Normalization Forms
398         is that they must remain stable between versions of the Unicode Standard.
399         A Unicode string normalized to a particular Unicode Normalization Form
400         in one version of the standard is guaranteed to remain in that Normalization
401         Form for implementations of future versions of the standard.
402     )
403     $(P The Unicode Standard specifies four normalization forms.
404         Informally, two of these forms are defined by maximal decomposition
405         of equivalent sequences, and two of these forms are defined
406         by maximal $(I composition) of equivalent sequences.
407             $(UL
408             $(LI Normalization Form D (NFD): The $(S_LINK Canonical decomposition,
409                 canonical decomposition) of a character sequence.)
410             $(LI Normalization Form KD (NFKD): The $(S_LINK Compatibility decomposition,
411                 compatibility decomposition) of a character sequence.)
412             $(LI Normalization Form C (NFC): The canonical composition of the
413                 $(S_LINK Canonical decomposition, canonical decomposition)
414                 of a coded character sequence.)
415             $(LI Normalization Form KC (NFKC): The canonical composition
416             of the $(S_LINK Compatibility decomposition,
417                 compatibility decomposition) of a character sequence.)
418             )
419     )
420     $(P The choice of the normalization form depends on the particular use case.
421         NFC is the best form for general text, since it's more compatible with
422         strings converted from legacy encodings. NFKC is the preferred form for
423         identifiers, especially where there are security concerns. NFD and NFKD
424         are the most useful for internal processing.
425     )
426     $(SECTION Construction of lookup tables)
427     $(P The Unicode standard describes a set of algorithms that
428         depend on having the ability to quickly look up various properties
429         of a code point. Given the codespace of about 1 million $(CODEPOINTS),
430         it is not a trivial task to provide a space-efficient solution for
431         the multitude of properties.
432     )
433     $(P Common approaches such as hash-tables or binary search over
434         sorted code point intervals (as in $(LREF InversionList)) are insufficient.
435         Hash-tables have enormous memory footprint and binary search
436         over intervals is not fast enough for some heavy-duty algorithms.
437     )
438     $(P The recommended solution (see Unicode Implementation Guidelines)
439         is using multi-stage tables that are an implementation of the
440         $(HTTP en.wikipedia.org/wiki/Trie, Trie) data structure with integer
441         keys and a fixed number of stages. For the remainder of the section
442         this will be called a fixed trie. The following describes a particular
443         implementation that is aimed for the speed of access at the expense
444         of ideal size savings.
445     )
446     $(P Taking a 2-level Trie as an example the principle of operation is as follows.
447         Split the number of bits in a key (code point, 21 bits) into 2 components
448         (e.g. 13 and 8).  The first is the number of bits in the index of the trie
449          and the other is the number of bits in each page of the trie.
450         The layout of the trie is then an array of size 2^^bits-of-index followed by
451         an array of memory chunks of size 2^^bits-of-page/bits-per-element.
452     )
453     $(P The number of pages is variable (but not less than 1)
454         unlike the number of entries in the index. The slots of the index
455         all have to contain a number of a page that is present. The lookup is then
456         just a couple of operations - slice the upper bits,
457         lookup an index for these, take a page at this index and use
458         the lower bits as an offset within this page.
459 
460         Assuming that pages are laid out consecutively
461         in one array at `pages`, the pseudo-code is:
462     )
463     ---
464     auto elemsPerPage = (2 ^^ bits_per_page) / Value.sizeOfInBits;
465     pages[index[n >> bits_per_page]][n & (elemsPerPage - 1)];
466     ---
467     $(P Where if `elemsPerPage` is a power of 2 the whole process is
468         a handful of simple instructions and 2 array reads. Subsequent levels
469         of the trie are introduced by recursing on this notion - the index array
470         is treated as values. The number of bits in index is then again
471         split into 2 parts, with pages over 'current-index' and the new 'upper-index'.
472     )
473 
474     $(P For completeness a level 1 trie is simply an array.
475         The current implementation takes advantage of bit-packing values
476         when the range is known to be limited in advance (such as `bool`).
477         See also $(LREF BitPacked) for enforcing it manually.
478         The major size advantage however comes from the fact
479         that multiple $(B identical pages on every level are merged) by construction.
480     )
481     $(P The process of constructing a trie is more involved and is hidden from
482         the user in a form of the convenience functions $(LREF codepointTrie),
483         $(LREF codepointSetTrie) and the even more convenient $(LREF toTrie).
484         In general a set or built-in AA with `dchar` type
485         can be turned into a trie. The trie object in this module
486         is read-only (immutable); it's effectively frozen after construction.
487     )
488     $(SECTION Unicode properties)
489     $(P This is a full list of Unicode properties accessible through $(LREF unicode)
490         with specific helpers per category nested within. Consult the
491         $(HTTP www.unicode.org/cldr/utility/properties.jsp, CLDR utility)
492         when in doubt about the contents of a particular set.
493     )
494     $(P General category sets listed below are only accessible with the
495         $(LREF unicode) shorthand accessor.)
496         $(BOOKTABLE $(B General category ),
497              $(TR $(TH Abb.) $(TH Long form)
498                 $(TH Abb.) $(TH Long form)$(TH Abb.) $(TH Long form))
499             $(TR $(TD L) $(TD Letter)
500                 $(TD Cn) $(TD Unassigned)  $(TD Po) $(TD Other_Punctuation))
501             $(TR $(TD Ll) $(TD Lowercase_Letter)
502                 $(TD Co) $(TD Private_Use) $(TD Ps) $(TD Open_Punctuation))
503             $(TR $(TD Lm) $(TD Modifier_Letter)
504                 $(TD Cs) $(TD Surrogate)   $(TD S) $(TD Symbol))
505             $(TR $(TD Lo) $(TD Other_Letter)
506                 $(TD N) $(TD Number)  $(TD Sc) $(TD Currency_Symbol))
507             $(TR $(TD Lt) $(TD Titlecase_Letter)
508               $(TD Nd) $(TD Decimal_Number)  $(TD Sk) $(TD Modifier_Symbol))
509             $(TR $(TD Lu) $(TD Uppercase_Letter)
510               $(TD Nl) $(TD Letter_Number)   $(TD Sm) $(TD Math_Symbol))
511             $(TR $(TD M) $(TD Mark)
512               $(TD No) $(TD Other_Number)    $(TD So) $(TD Other_Symbol))
513             $(TR $(TD Mc) $(TD Spacing_Mark)
514               $(TD P) $(TD Punctuation) $(TD Z) $(TD Separator))
515             $(TR $(TD Me) $(TD Enclosing_Mark)
516               $(TD Pc) $(TD Connector_Punctuation)   $(TD Zl) $(TD Line_Separator))
517             $(TR $(TD Mn) $(TD Nonspacing_Mark)
518               $(TD Pd) $(TD Dash_Punctuation)    $(TD Zp) $(TD Paragraph_Separator))
519             $(TR $(TD C) $(TD Other)
520               $(TD Pe) $(TD Close_Punctuation) $(TD Zs) $(TD Space_Separator))
521             $(TR $(TD Cc) $(TD Control) $(TD Pf)
522               $(TD Final_Punctuation)   $(TD -) $(TD Any))
523             $(TR $(TD Cf) $(TD Format)
524               $(TD Pi) $(TD Initial_Punctuation) $(TD -) $(TD ASCII))
525     )
526     $(P Sets for other commonly useful properties that are
527         accessible with $(LREF unicode):)
528         $(BOOKTABLE $(B Common binary properties),
529             $(TR $(TH Name) $(TH Name) $(TH Name))
530             $(TR $(TD Alphabetic)  $(TD Ideographic) $(TD Other_Uppercase))
531             $(TR $(TD ASCII_Hex_Digit) $(TD IDS_Binary_Operator) $(TD Pattern_Syntax))
532             $(TR $(TD Bidi_Control)    $(TD ID_Start)    $(TD Pattern_White_Space))
533             $(TR $(TD Cased)   $(TD IDS_Trinary_Operator)    $(TD Quotation_Mark))
534             $(TR $(TD Case_Ignorable)  $(TD Join_Control)    $(TD Radical))
535             $(TR $(TD Dash)    $(TD Logical_Order_Exception) $(TD Soft_Dotted))
536             $(TR $(TD Default_Ignorable_Code_Point)    $(TD Lowercase)   $(TD STerm))
537             $(TR $(TD Deprecated)  $(TD Math)    $(TD Terminal_Punctuation))
538             $(TR $(TD Diacritic)   $(TD Noncharacter_Code_Point) $(TD Unified_Ideograph))
539             $(TR $(TD Extender)    $(TD Other_Alphabetic)    $(TD Uppercase))
540             $(TR $(TD Grapheme_Base)   $(TD Other_Default_Ignorable_Code_Point)  $(TD Variation_Selector))
541             $(TR $(TD Grapheme_Extend) $(TD Other_Grapheme_Extend)   $(TD White_Space))
542             $(TR $(TD Grapheme_Link)   $(TD Other_ID_Continue)   $(TD XID_Continue))
543             $(TR $(TD Hex_Digit)   $(TD Other_ID_Start)  $(TD XID_Start))
544             $(TR $(TD Hyphen)  $(TD Other_Lowercase) )
545             $(TR $(TD ID_Continue) $(TD Other_Math)  )
546     )
547     $(P Below is the table with block names accepted by $(LREF unicode.block).
548         Note that the shorthand version $(LREF unicode) requires "In"
549         to be prepended to the names of blocks so as to disambiguate
550         scripts and blocks.
551     )
552     $(BOOKTABLE $(B Blocks),
553         $(TR $(TD Aegean Numbers)    $(TD Ethiopic Extended) $(TD Mongolian))
554         $(TR $(TD Alchemical Symbols)    $(TD Ethiopic Extended-A)   $(TD Musical Symbols))
555         $(TR $(TD Alphabetic Presentation Forms) $(TD Ethiopic Supplement)   $(TD Myanmar))
556         $(TR $(TD Ancient Greek Musical Notation)    $(TD General Punctuation)   $(TD Myanmar Extended-A))
557         $(TR $(TD Ancient Greek Numbers) $(TD Geometric Shapes)  $(TD New Tai Lue))
558         $(TR $(TD Ancient Symbols)   $(TD Georgian)  $(TD NKo))
559         $(TR $(TD Arabic)    $(TD Georgian Supplement)   $(TD Number Forms))
560         $(TR $(TD Arabic Extended-A) $(TD Glagolitic)    $(TD Ogham))
561         $(TR $(TD Arabic Mathematical Alphabetic Symbols)    $(TD Gothic)    $(TD Ol Chiki))
562         $(TR $(TD Arabic Presentation Forms-A)   $(TD Greek and Coptic)  $(TD Old Italic))
563         $(TR $(TD Arabic Presentation Forms-B)   $(TD Greek Extended)    $(TD Old Persian))
564         $(TR $(TD Arabic Supplement) $(TD Gujarati)  $(TD Old South Arabian))
565         $(TR $(TD Armenian)  $(TD Gurmukhi)  $(TD Old Turkic))
566         $(TR $(TD Arrows)    $(TD Halfwidth and Fullwidth Forms) $(TD Optical Character Recognition))
567         $(TR $(TD Avestan)   $(TD Hangul Compatibility Jamo) $(TD Oriya))
568         $(TR $(TD Balinese)  $(TD Hangul Jamo)   $(TD Osmanya))
569         $(TR $(TD Bamum) $(TD Hangul Jamo Extended-A)    $(TD Phags-pa))
570         $(TR $(TD Bamum Supplement)  $(TD Hangul Jamo Extended-B)    $(TD Phaistos Disc))
571         $(TR $(TD Basic Latin)   $(TD Hangul Syllables)  $(TD Phoenician))
572         $(TR $(TD Batak) $(TD Hanunoo)   $(TD Phonetic Extensions))
573         $(TR $(TD Bengali)   $(TD Hebrew)    $(TD Phonetic Extensions Supplement))
574         $(TR $(TD Block Elements)    $(TD High Private Use Surrogates)   $(TD Playing Cards))
575         $(TR $(TD Bopomofo)  $(TD High Surrogates)   $(TD Private Use Area))
576         $(TR $(TD Bopomofo Extended) $(TD Hiragana)  $(TD Rejang))
577         $(TR $(TD Box Drawing)   $(TD Ideographic Description Characters)    $(TD Rumi Numeral Symbols))
578         $(TR $(TD Brahmi)    $(TD Imperial Aramaic)  $(TD Runic))
579         $(TR $(TD Braille Patterns)  $(TD Inscriptional Pahlavi) $(TD Samaritan))
580         $(TR $(TD Buginese)  $(TD Inscriptional Parthian)    $(TD Saurashtra))
581         $(TR $(TD Buhid) $(TD IPA Extensions)    $(TD Sharada))
582         $(TR $(TD Byzantine Musical Symbols) $(TD Javanese)  $(TD Shavian))
583         $(TR $(TD Carian)    $(TD Kaithi)    $(TD Sinhala))
584         $(TR $(TD Chakma)    $(TD Kana Supplement)   $(TD Small Form Variants))
585         $(TR $(TD Cham)  $(TD Kanbun)    $(TD Sora Sompeng))
586         $(TR $(TD Cherokee)  $(TD Kangxi Radicals)   $(TD Spacing Modifier Letters))
587         $(TR $(TD CJK Compatibility) $(TD Kannada)   $(TD Specials))
588         $(TR $(TD CJK Compatibility Forms)   $(TD Katakana)  $(TD Sundanese))
589         $(TR $(TD CJK Compatibility Ideographs)  $(TD Katakana Phonetic Extensions)  $(TD Sundanese Supplement))
590         $(TR $(TD CJK Compatibility Ideographs Supplement)   $(TD Kayah Li)  $(TD Superscripts and Subscripts))
591         $(TR $(TD CJK Radicals Supplement)   $(TD Kharoshthi)    $(TD Supplemental Arrows-A))
592         $(TR $(TD CJK Strokes)   $(TD Khmer) $(TD Supplemental Arrows-B))
593         $(TR $(TD CJK Symbols and Punctuation)   $(TD Khmer Symbols) $(TD Supplemental Mathematical Operators))
594         $(TR $(TD CJK Unified Ideographs)    $(TD Lao)   $(TD Supplemental Punctuation))
595         $(TR $(TD CJK Unified Ideographs Extension A)    $(TD Latin-1 Supplement)    $(TD Supplementary Private Use Area-A))
596         $(TR $(TD CJK Unified Ideographs Extension B)    $(TD Latin Extended-A)  $(TD Supplementary Private Use Area-B))
597         $(TR $(TD CJK Unified Ideographs Extension C)    $(TD Latin Extended Additional) $(TD Syloti Nagri))
598         $(TR $(TD CJK Unified Ideographs Extension D)    $(TD Latin Extended-B)  $(TD Syriac))
599         $(TR $(TD Combining Diacritical Marks)   $(TD Latin Extended-C)  $(TD Tagalog))
600         $(TR $(TD Combining Diacritical Marks for Symbols)   $(TD Latin Extended-D)  $(TD Tagbanwa))
601         $(TR $(TD Combining Diacritical Marks Supplement)    $(TD Lepcha)    $(TD Tags))
602         $(TR $(TD Combining Half Marks)  $(TD Letterlike Symbols)    $(TD Tai Le))
603         $(TR $(TD Common Indic Number Forms) $(TD Limbu) $(TD Tai Tham))
604         $(TR $(TD Control Pictures)  $(TD Linear B Ideograms)    $(TD Tai Viet))
605         $(TR $(TD Coptic)    $(TD Linear B Syllabary)    $(TD Tai Xuan Jing Symbols))
606         $(TR $(TD Counting Rod Numerals) $(TD Lisu)  $(TD Takri))
607         $(TR $(TD Cuneiform) $(TD Low Surrogates)    $(TD Tamil))
608         $(TR $(TD Cuneiform Numbers and Punctuation) $(TD Lycian)    $(TD Telugu))
609         $(TR $(TD Currency Symbols)  $(TD Lydian)    $(TD Thaana))
610         $(TR $(TD Cypriot Syllabary) $(TD Mahjong Tiles) $(TD Thai))
611         $(TR $(TD Cyrillic)  $(TD Malayalam) $(TD Tibetan))
612         $(TR $(TD Cyrillic Extended-A)   $(TD Mandaic)   $(TD Tifinagh))
613         $(TR $(TD Cyrillic Extended-B)   $(TD Mathematical Alphanumeric Symbols) $(TD Transport And Map Symbols))
614         $(TR $(TD Cyrillic Supplement)   $(TD Mathematical Operators)    $(TD Ugaritic))
615         $(TR $(TD Deseret)   $(TD Meetei Mayek)  $(TD Unified Canadian Aboriginal Syllabics))
616         $(TR $(TD Devanagari)    $(TD Meetei Mayek Extensions)   $(TD Unified Canadian Aboriginal Syllabics Extended))
617         $(TR $(TD Devanagari Extended)   $(TD Meroitic Cursive)  $(TD Vai))
618         $(TR $(TD Dingbats)  $(TD Meroitic Hieroglyphs)  $(TD Variation Selectors))
619         $(TR $(TD Domino Tiles)  $(TD Miao)  $(TD Variation Selectors Supplement))
620         $(TR $(TD Egyptian Hieroglyphs)  $(TD Miscellaneous Mathematical Symbols-A)  $(TD Vedic Extensions))
621         $(TR $(TD Emoticons) $(TD Miscellaneous Mathematical Symbols-B)  $(TD Vertical Forms))
622         $(TR $(TD Enclosed Alphanumerics)    $(TD Miscellaneous Symbols) $(TD Yijing Hexagram Symbols))
623         $(TR $(TD Enclosed Alphanumeric Supplement)  $(TD Miscellaneous Symbols and Arrows)  $(TD Yi Radicals))
624         $(TR $(TD Enclosed CJK Letters and Months)   $(TD Miscellaneous Symbols And Pictographs) $(TD Yi Syllables))
625         $(TR $(TD Enclosed Ideographic Supplement)   $(TD Miscellaneous Technical)   )
626         $(TR $(TD Ethiopic)  $(TD Modifier Tone Letters) )
627     )
628     $(P Below is the table with script names accepted by $(LREF unicode.script)
629         and by the shorthand version $(LREF unicode):)
630         $(BOOKTABLE $(B Scripts),
631             $(TR $(TD Arabic)  $(TD Hanunoo) $(TD Old_Italic))
632             $(TR $(TD Armenian)    $(TD Hebrew)  $(TD Old_Persian))
633             $(TR $(TD Avestan) $(TD Hiragana)    $(TD Old_South_Arabian))
634             $(TR $(TD Balinese)    $(TD Imperial_Aramaic)    $(TD Old_Turkic))
635             $(TR $(TD Bamum)   $(TD Inherited)   $(TD Oriya))
636             $(TR $(TD Batak)   $(TD Inscriptional_Pahlavi)   $(TD Osmanya))
637             $(TR $(TD Bengali) $(TD Inscriptional_Parthian)  $(TD Phags_Pa))
638             $(TR $(TD Bopomofo)    $(TD Javanese)    $(TD Phoenician))
639             $(TR $(TD Brahmi)  $(TD Kaithi)  $(TD Rejang))
640             $(TR $(TD Braille) $(TD Kannada) $(TD Runic))
641             $(TR $(TD Buginese)    $(TD Katakana)    $(TD Samaritan))
642             $(TR $(TD Buhid)   $(TD Kayah_Li)    $(TD Saurashtra))
643             $(TR $(TD Canadian_Aboriginal) $(TD Kharoshthi)  $(TD Sharada))
644             $(TR $(TD Carian)  $(TD Khmer)   $(TD Shavian))
645             $(TR $(TD Chakma)  $(TD Lao) $(TD Sinhala))
646             $(TR $(TD Cham)    $(TD Latin)   $(TD Sora_Sompeng))
647             $(TR $(TD Cherokee)    $(TD Lepcha)  $(TD Sundanese))
648             $(TR $(TD Common)  $(TD Limbu)   $(TD Syloti_Nagri))
649             $(TR $(TD Coptic)  $(TD Linear_B)    $(TD Syriac))
650             $(TR $(TD Cuneiform)   $(TD Lisu)    $(TD Tagalog))
651             $(TR $(TD Cypriot) $(TD Lycian)  $(TD Tagbanwa))
652             $(TR $(TD Cyrillic)    $(TD Lydian)  $(TD Tai_Le))
653             $(TR $(TD Deseret) $(TD Malayalam)   $(TD Tai_Tham))
654             $(TR $(TD Devanagari)  $(TD Mandaic) $(TD Tai_Viet))
655             $(TR $(TD Egyptian_Hieroglyphs)    $(TD Meetei_Mayek)    $(TD Takri))
656             $(TR $(TD Ethiopic)    $(TD Meroitic_Cursive)    $(TD Tamil))
657             $(TR $(TD Georgian)    $(TD Meroitic_Hieroglyphs)    $(TD Telugu))
658             $(TR $(TD Glagolitic)  $(TD Miao)    $(TD Thaana))
659             $(TR $(TD Gothic)  $(TD Mongolian)   $(TD Thai))
660             $(TR $(TD Greek)   $(TD Myanmar) $(TD Tibetan))
661             $(TR $(TD Gujarati)    $(TD New_Tai_Lue) $(TD Tifinagh))
662             $(TR $(TD Gurmukhi)    $(TD Nko) $(TD Ugaritic))
663             $(TR $(TD Han) $(TD Ogham)   $(TD Vai))
664             $(TR $(TD Hangul)  $(TD Ol_Chiki)    $(TD Yi))
665     )
666     $(P Below is the table of names accepted by $(LREF unicode.hangulSyllableType).)
667         $(BOOKTABLE $(B Hangul syllable type),
668             $(TR $(TH Abb.) $(TH Long form))
669             $(TR $(TD L)   $(TD Leading_Jamo))
670             $(TR $(TD LV)  $(TD LV_Syllable))
671             $(TR $(TD LVT) $(TD LVT_Syllable) )
672             $(TR $(TD T)   $(TD Trailing_Jamo))
673             $(TR $(TD V)   $(TD Vowel_Jamo))
674     )
675     References:
676         $(HTTP www.digitalmars.com/d/ascii-table.html, ASCII Table),
677         $(HTTP en.wikipedia.org/wiki/Unicode, Wikipedia),
678         $(HTTP www.unicode.org, The Unicode Consortium),
679         $(HTTP www.unicode.org/reports/tr15/, Unicode normalization forms),
        $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation),
        $(HTTP www.unicode.org/uni2book/ch05.pdf,
            Unicode Implementation Guidelines),
683         $(HTTP www.unicode.org/uni2book/ch03.pdf,
684             Unicode Conformance)
685     Trademarks:
686         Unicode(tm) is a trademark of Unicode, Inc.
687 
688     Copyright: Copyright 2013 -
689     License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
690     Authors:   Dmitry Olshansky
691     Source:    $(PHOBOSSRC std/uni.d)
692     Standards: $(HTTP www.unicode.org/versions/Unicode6.2.0/, Unicode v6.2)
693 
694 Macros:
695 
696 SECTION = <h3><a id="$1">$0</a></h3>
697 DEF = <div><a id="$1"><i>$0</i></a></div>
698 S_LINK = <a href="#$1">$+</a>
699 CODEPOINT = $(S_LINK Code point, code point)
700 CODEPOINTS = $(S_LINK Code point, code points)
701 CHARACTER = $(S_LINK Character, character)
702 CHARACTERS = $(S_LINK Character, characters)
703 CLUSTER = $(S_LINK Grapheme cluster, grapheme cluster)
704 +/
705 module std.uni;
706 
707 import std.meta : AliasSeq;
708 import std.range.primitives : back, ElementEncodingType, ElementType, empty,
709     front, hasLength, hasSlicing, isForwardRange, isInputRange,
710     isRandomAccessRange, popFront, put, save;
711 import std.traits : isConvertibleToString, isIntegral, isSomeChar,
712     isSomeString, Unqual, isDynamicArray;
713 // debug = std_uni;
714 
715 debug(std_uni) import std.stdio; // writefln, writeln
716 
717 private:
718 
719 
// Element-wise copy of `src` into `dest`, walking from the last index down
// to index 0. Both slices must have equal length (asserted).
// The descending order is what MultiArray's length setter relies on when it
// shifts data toward higher indices inside one buffer.
void copyBackwards(T,U)(T[] src, U[] dest)
{
    assert(src.length == dest.length);
    size_t idx = src.length;
    while (idx != 0)
    {
        --idx;
        dest[idx] = src[idx];
    }
}
726 
// Element-wise copy of `src` into `dest`, walking from index 0 upwards.
// Both slices must have equal length (asserted).
void copyForward(T,U)(T[] src, U[] dest)
{
    assert(src.length == dest.length);
    foreach (idx; 0 .. src.length)
        dest[idx] = src[idx];
}
733 
// Compile-time switch consulted by PackedPtrImpl when it decides whether
// half-word and quarter-word elements may use the direct typed-pointer path.
// TODO: update to reflect all major CPUs supporting unaligned reads
version (X86)
{
    enum bool hasUnalignedReads = true;
}
else version (X86_64)
{
    enum bool hasUnalignedReads = true;
}
else version (SystemZ)
{
    enum bool hasUnalignedReads = true;
}
else
{
    enum bool hasUnalignedReads = false; // better be safe than sorry
}
743 
public
{
    /// Constant $(CODEPOINT) (0x2028) - line separator.
    enum dchar lineSep = '\u2028';
    /// Constant $(CODEPOINT) (0x2029) - paragraph separator.
    enum dchar paraSep = '\u2029';
    /// Constant $(CODEPOINT) (0x0085) - next line.
    enum dchar nelSep  = '\u0085';
}
747 
// test the intro example
// Keep this body in step with the example in the module documentation:
// it exists to prove that the documented snippet compiles and passes.
@safe unittest
{
    import std.algorithm.searching : find;
    // initialize code point sets using script/block or property name
    // set contains code points from both scripts.
    auto set = unicode("Cyrillic") | unicode("Armenian");
    // or simpler and statically-checked look
    auto ascii = unicode.ASCII;
    auto currency = unicode.Currency_Symbol;

    // easy set ops
    auto a = set & ascii;
    assert(a.empty); // as it has no intersection with ascii
    a = set | ascii;
    auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian

    // some properties of code point sets
    assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2
    // testing presence of a code point in a set
    // is just fine, it is O(logN)
    assert(!b['$']);
    assert(!b['\u058F']); // Armenian dram sign
    assert(b['¥']);

    // building fast lookup tables, these guarantee O(1) complexity
    // 1-level Trie lookup table is essentially a huge bit-set ~262Kb
    auto oneTrie = toTrie!1(b);
    // 2-level far more compact but typically slightly slower
    auto twoTrie = toTrie!2(b);
    // 3-level even smaller, and a bit slower yet
    auto threeTrie = toTrie!3(b);
    assert(oneTrie['£']);
    assert(twoTrie['£']);
    assert(threeTrie['£']);

    // build the trie with the most sensible trie level
    // and bind it as a functor
    auto cyrillicOrArmenian = toDelegate(set);
    auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!");
    assert(balance == "ընկեր!");
    // compatible with bool delegate(dchar)
    bool delegate(dchar) bindIt = cyrillicOrArmenian;

    // Normalization
    string s = "Plain ascii (and not only), is always normalized!";
    assert(s is normalize(s));// is the same string

    // 'A' followed by a combining diaeresis (U+0308), i.e. not yet composed
    string nonS = "A\u0308ffin";
    auto nS = normalize(nonS); // to NFC, the W3C endorsed standard
    assert(nS == "Äffin");
    assert(nS != nonS);
    string composed = "Äffin";

    assert(normalize!NFD(composed) == "A\u0308ffin");
    // to NFKD, compatibility decomposition useful for fuzzy matching/searching
    assert(normalize!NFKD("2¹⁰") == "210");
}
806 
// Numerically equal to dchar.max; kept as a plain int constant.
enum int lastDchar = 0x10_FFFF;
808 
// Narrowing/widening conversion to an integral type T.
// The value is asserted to lie within [T.min, T.max] and is then cast.
auto force(T, F)(F from)
if (isIntegral!T && !is(T == F))
{
    assert(T.min <= from && from <= T.max);
    return cast(T) from;
}
815 
// Conversion to a BitPacked type T: asserts that the value does not exceed
// 2^^bitSizeOf!T-1, then wraps it after a cast to T's underlying type.
auto force(T, F)(F from)
if (isBitPacked!T && !is(T == F))
{
    alias Raw = TypeOfBitPacked!T;
    assert(from <= 2^^bitSizeOf!T-1);
    return T(cast(Raw) from);
}
822 
// Identity case: source and target types coincide, nothing to convert.
auto force(T, F)(F from)
if (is(T == F))
{
    F same = from;
    return same;
}
828 
// Repeat `times` times the bit-pattern in `val`, assuming its length is `bits`.
// The copies are laid out back to back starting at the least significant bit.
// Params:
//     times = number of copies to produce (compile-time)
//     bits  = width of the pattern in bits (compile-time)
//     val   = the pattern itself, expected to fit in `bits` bits
// Returns: a word holding `times` adjacent copies of `val`.
size_t replicateBits(size_t times, size_t bits)(size_t val) @safe pure nothrow @nogc
{
    static if (times == 1)
        return val;
    else static if (bits == 1)
    {
        static if (times == size_t.sizeof*8)
            return val ? size_t.max : 0;
        else
            // Shift a size_t, not an int literal: a run of 32 or more
            // single bits on a 64-bit target does not fit an int shift.
            return val ? (size_t(1) << times)-1 : 0;
    }
    else static if (times % 2)
        // odd count: build times-1 copies, then append one more
        return (replicateBits!(times-1, bits)(val)<<bits) | val;
    else
        // even count: double the pattern width and halve the count
        return replicateBits!(times/2, bits*2)((val << bits) | val);
}
846 
@safe pure nothrow @nogc unittest // for replicate
{
    import std.algorithm.iteration : sum, map;
    import std.range : iota;
    immutable size_t threeOnes = 0b111; // 3-bit pattern, all bits set
    immutable size_t lowOfTwo = 0b01;   // 2-bit pattern, low bit set
    static foreach (count; AliasSeq!(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
    {
        // `count` copies of 0b111 form a run of 3*count ones
        assert(replicateBits!(count, 3)(threeOnes)+1 == (1<<(3*count)));
        // `count` copies of 0b01 set every even bit below 2*count
        assert(replicateBits!(count, 2)(lowOfTwo) == iota(0, count).map!"2^^(2*a)"().sum());
    }
}
859 
// Multiple arrays squashed into one memory block.
// Level i holds sz[i] items of type Types[i], packed into whole size_t words
// (see spaceFor) that start offsets[i] words into `storage`.
struct MultiArray(Types...)
{
    import std.range.primitives : isOutputRange;

    // Allocates zero-initialized storage for one array per type,
    // sizes[i] being the item count of level i.
    this(size_t[] sizes...) @safe pure nothrow
    {
        assert(dim == sizes.length);
        size_t full_size;
        foreach (i, v; Types)
        {
            full_size += spaceFor!(bitSizeOf!v)(sizes[i]);
            sz[i] = sizes[i];
            static if (i >= 1)
                offsets[i] = offsets[i-1] +
                    spaceFor!(bitSizeOf!(Types[i-1]))(sizes[i-1]);
        }

        storage = new size_t[full_size];
    }

    // Wraps already laid out data (offsets, sizes and raw words) without
    // copying the words.
    this(const(size_t)[] raw_offsets,
        const(size_t)[] raw_sizes, const(size_t)[] data)const @safe pure nothrow @nogc
    {
        offsets[] = raw_offsets[];
        sz[] = raw_sizes[];
        storage = data;
    }

    // Bounds-aware view over level n.
    @property auto slice(size_t n)()inout pure nothrow @nogc
    {
        auto ptr = raw_ptr!n;
        return packedArrayView!(Types[n])(ptr, sz[n]);
    }

    // Unchecked packed pointer to the start of level n.
    @property auto ptr(size_t n)()inout pure nothrow @nogc
    {
        auto ptr = raw_ptr!n;
        return inout(PackedPtr!(Types[n]))(ptr);
    }

    template length(size_t n)
    {
        // Number of items in level n.
        @property size_t length()const @safe pure nothrow @nogc{ return sz[n]; }

        // Resizes level n to new_size items, shifting all later levels so
        // that their contents are preserved.
        @property void length(size_t new_size)
        {
            if (new_size > sz[n])
            {// extend
                size_t delta = (new_size - sz[n]);
                sz[n] += delta;
                delta = spaceFor!(bitSizeOf!(Types[n]))(delta);
                storage.length +=  delta;// extend space at end
                // raw_slice!x must follow resize as it could be moved!
                // next stmts move all data past this array, last-one-goes-first
                static if (n != dim-1)
                {
                    auto start = raw_ptr!(n+1);
                    // len includes delta
                    size_t len = (storage.ptr+storage.length-start);

                    copyBackwards(start[0 .. len-delta], start[delta .. len]);

                    start[0 .. delta] = 0;
                    // offsets are used for raw_slice, ptr etc.
                    foreach (i; n+1 .. dim)
                        offsets[i] += delta;
                }
            }
            else if (new_size < sz[n])
            {// shrink
                // Word range [first, last) currently reserved for level n.
                // Level 0 always starts at word 0 (raw_ptr!0 ignores offsets).
                static if (n == 0)
                    enum size_t first = 0;
                else
                    immutable size_t first = offsets[n];
                static if (n != dim-1)
                    immutable size_t last = offsets[n+1];
                else
                    immutable size_t last = storage.length;
                immutable size_t needed = spaceFor!(bitSizeOf!(Types[n]))(new_size);
                sz[n] = new_size;
                // Release only the whole words that the remaining items no
                // longer need; converting the item delta to words directly
                // could free a word that still holds live items.
                if (last - first > needed)
                {
                    immutable size_t delta = last - first - needed;
                    // move all data past this array down by delta words;
                    // forward direction is safe as the target lies lower
                    static if (n != dim-1)
                    {
                        copyForward(storage[last .. $],
                            storage[last - delta .. $ - delta]);

                        // adjust offsets last, they affect raw_slice
                        foreach (i; n+1 .. dim)
                            offsets[i] -= delta;
                    }
                    storage.length -= delta;
                }
            }
            // else - NOP
        }
    }

    // Size in bytes of level n, or of the whole storage when n is omitted.
    @property size_t bytes(size_t n=size_t.max)() const @safe
    {
        static if (n == size_t.max)
            return storage.length*size_t.sizeof;
        else static if (n != Types.length-1)
            return (raw_ptr!(n+1)-raw_ptr!n)*size_t.sizeof;
        else
            return (storage.ptr+storage.length - raw_ptr!n)*size_t.sizeof;
    }

    // Writes offsets, sizes and raw words to `sink` as three bracketed,
    // comma-separated lists of hexadecimal numbers.
    void store(OutRange)(scope OutRange sink) const
        if (isOutputRange!(OutRange, char))
    {
        import std.format : formattedWrite;
        formattedWrite(sink, "[%( 0x%x, %)]", offsets[]);
        formattedWrite(sink, ", [%( 0x%x, %)]", sz[]);
        formattedWrite(sink, ", [%( 0x%x, %)]", storage);
    }

private:
    import std.meta : staticMap;
    // Pointer to the first word of level n.
    @property auto raw_ptr(size_t n)()inout pure nothrow @nogc
    {
        static if (n == 0)
            return storage.ptr;
        else
        {
            return storage.ptr+offsets[n];
        }
    }
    enum dim = Types.length;
    size_t[dim] offsets;// offset for level x
    size_t[dim] sz;// size of level x
    alias bitWidth = staticMap!(bitSizeOf, Types);
    size_t[] storage;
}
986 
// Resizes the levels of a MultiArray in various orders and checks that the
// contents of every level survive; the same scenario runs at compile time
// and at run time.
@system unittest
{
    import std.conv : text;
    enum dg = (){
        // sizes are:
        // lvl0: 3, lvl1 : 2, lvl2: 1
        auto m = MultiArray!(int, ubyte, int)(3,2,1);

        // expects level k to hold 1, 2, .. n
        static void check(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                assert(m.slice!(k)[i] == i+1, text("level:",k," index:",i," : ",m.slice!(k)[0 .. n]));
        }

        // expects level k to hold n, n-1, .. 1
        static void checkB(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                assert(m.slice!(k)[i] == n-i, text("level:",k," index:",i," : ",m.slice!(k)[0 .. n]));
        }

        // stores 1, 2, .. n into level k
        static void fill(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                m.slice!(k)[i] = force!ubyte(i+1);
        }

        // stores n, n-1, .. 1 into level k
        static void fillB(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                m.slice!(k)[i] = force!ubyte(n-i);
        }

        m.length!1 = 100;
        fill!1(m, 100);
        check!1(m, 100);

        m.length!0 = 220;
        fill!0(m, 220);
        check!1(m, 100);
        check!0(m, 220);

        m.length!2 = 17;
        fillB!2(m, 17);
        checkB!2(m, 17);
        check!0(m, 220);
        check!1(m, 100);

        m.length!2 = 33;
        checkB!2(m, 17);
        fillB!2(m, 33);
        checkB!2(m, 33);
        check!0(m, 220);
        check!1(m, 100);

        m.length!1 = 195;
        fillB!1(m, 195);
        checkB!1(m, 195);
        checkB!2(m, 33);
        check!0(m, 220);

        auto marr = MultiArray!(BitPacked!(uint, 4), BitPacked!(uint, 6))(20, 10);
        marr.length!0 = 15;
        marr.length!1 = 30;
        fill!1(marr, 30);
        fill!0(marr, 15);
        check!1(marr, 30);
        check!0(marr, 15);
        return 0;
    };
    enum ct = dg();
    auto rt = dg();
}
1059 
@system unittest
{// more bitpacking tests
    import std.conv : text;

    alias Bitty =
      MultiArray!(BitPacked!(size_t, 3)
                , BitPacked!(size_t, 4)
                , BitPacked!(size_t, 3)
                , BitPacked!(size_t, 6)
                , bool);
    alias fn1 = sliceBits!(13, 16);
    alias fn2 = sliceBits!( 9, 13);
    alias fn3 = sliceBits!( 6,  9);
    alias fn4 = sliceBits!( 0,  6);

    // every item of level `lvl` must equal its own index
    static void check(size_t lvl, MA)(ref MA arr){
        foreach (pos; 0 .. arr.length!lvl)
            assert(arr.slice!(lvl)[pos] == pos, text("Mismatch on lvl ", lvl, " idx ", pos, " value: ", arr.slice!(lvl)[pos]));
    }

    // store its own index into every item of level `lvl`
    static void fillIdx(size_t lvl, MA)(ref MA arr){
        foreach (pos; 0 .. arr.length!lvl)
            arr.slice!(lvl)[pos] = pos;
    }

    enum size_t total = 2^^16;
    Bitty packed;

    // grow the levels last-to-first so that each resize shifts later ones
    packed.length!4 = 10;
    packed.length!3 = 2^^6;
    packed.length!2 = 2^^3;
    packed.length!1 = 2^^4;
    packed.length!0 = 2^^3;

    packed.length!4 = total;

    foreach (pos; 0 .. packed.length!4)
        packed.slice!(4)[pos] = pos % 2;

    fillIdx!1(packed);
    check!1(packed);
    fillIdx!2(packed);
    check!2(packed);
    fillIdx!3(packed);
    check!3(packed);
    fillIdx!0(packed);
    check!0(packed);
    check!3(packed);
    check!2(packed);
    check!1(packed);

    // write through all five levels using the bit fields of one counter
    foreach (size_t key; 0 .. total)
    {
        packed.slice!(4)[key] = key % 2;
        packed.slice!(0)[fn1(key)] = fn1(key);
        packed.slice!(1)[fn2(key)] = fn2(key);
        packed.slice!(2)[fn3(key)] = fn3(key);
        packed.slice!(3)[fn4(key)] = fn4(key);
    }
    // and read everything back
    foreach (size_t key; 0 .. total)
    {
        assert(packed.slice!(4)[key] == key % 2);
        assert(packed.slice!(0)[fn1(key)] == fn1(key));
        assert(packed.slice!(1)[fn2(key)] == fn2(key));
        assert(packed.slice!(2)[fn3(key)] == fn3(key));
        assert(packed.slice!(3)[fn4(key)] == fn4(key));
    }
}
1124 
// Number of size_t words needed to hold `new_len` items of `_bits` bits each.
// The item width is first rounded up to a power of two (see PackedArrayView).
size_t spaceFor(size_t _bits)(size_t new_len) @safe pure nothrow @nogc
{
    import std.math : nextPow2;
    enum size_t wordBits = 8*size_t.sizeof;
    static if (_bits == 1)
        enum size_t bits = 1;
    else
        enum size_t bits = nextPow2(_bits - 1);

    static if (bits > wordBits)
    {
        // a single item spans several whole words
        static assert(bits % wordBits == 0);
        return new_len * bits/wordBits;
    }
    else
    {
        // several items share one word
        enum size_t perWord = wordBits/bits;
        return (new_len+perWord-1)/perWord; // rounded up
    }
}
1140 
// True for BitPacked instances, integral types, bool and character types.
template isBitPackableType(T)
{
    static if (isBitPacked!T)
        enum bool isBitPackableType = true;
    else
        enum bool isBitPackableType =
            isIntegral!T || is(T == bool) || isSomeChar!T;
}
1146 
1147 //============================================================================
// Selects the PackedArrayViewImpl instance for T: the slot width is
// bitSizeOf!T rounded up to a power of two (a 1-bit type keeps 1 bit).
template PackedArrayView(T)
if ((is(T dummy == BitPacked!(U, sz), U, size_t sz)
    && isBitPackableType!U) || isBitPackableType!T)
{
    import std.math : nextPow2;
    private enum rawBits = bitSizeOf!T;
    static if (rawBits > 1)
        private enum size_t slotBits = nextPow2(rawBits - 1);
    else
        private enum size_t slotBits = 1;
    alias PackedArrayView = PackedArrayViewImpl!(T, slotBits);
}
1156 
// Unsafe and fast access to a chunk of RAM as if it contains packed values.
// Selects the PackedPtrImpl instance for T: the slot width is bitSizeOf!T
// rounded up to a power of two (a 1-bit type keeps 1 bit).
template PackedPtr(T)
if ((is(T dummy == BitPacked!(U, sz), U, size_t sz)
    && isBitPackableType!U) || isBitPackableType!T)
{
    import std.math : nextPow2;
    private enum rawBits = bitSizeOf!T;
    static if (rawBits > 1)
        private enum size_t slotBits = nextPow2(rawBits - 1);
    else
        private enum size_t slotBits = 1;
    alias PackedPtr = PackedPtrImpl!(T, slotBits);
}
1166 
// Unchecked indexed access to a run of size_t words as if it stored values
// of type T packed `bits` bits apiece. No length is tracked here; bounds are
// the caller's responsibility (PackedArrayViewImpl adds offset and limit).
struct PackedPtrImpl(T, size_t bits)
{
pure nothrow:
    static assert(isPow2OrZero(bits));

    // Wraps `ptr` without copying; it becomes the word holding element 0.
    this(inout(size_t)* ptr)inout @safe @nogc
    {
        origin = ptr;
    }

    // Generic read: element n lives in word n / factor, at bit offset
    // bits * (n % factor) counted from the least significant bit.
    private T simpleIndex(size_t n) inout
    {
        immutable q = n / factor;
        immutable r = n % factor;
        return cast(T)((origin[q] >> bits*r) & mask);
    }

    // Generic write: clears the slot of element n in its word and ORs in
    // `val`. For integral T the contract requires `val` to fit the mask.
    private void simpleWrite(TypeOfBitPacked!T val, size_t n)
    in
    {
        static if (isIntegral!T)
            assert(val <= mask);
    }
    do
    {
        immutable q = n / factor;
        immutable r = n % factor;
        immutable tgt_shift = bits*r;
        immutable word = origin[q];
        origin[q] = (word & ~(mask << tgt_shift))
            | (cast(size_t) val << tgt_shift);
    }

    // Fast path: the slot width equals the width of a built-in unsigned
    // type U, so outside of CTFE on little-endian targets `origin` can be
    // reinterpreted as U* and indexed directly.
    static if (factor == bytesPerWord// can safely pack by byte
         || factor == 1 // a whole word at a time
         || ((factor == bytesPerWord/2 || factor == bytesPerWord/4)
                && hasUnalignedReads)) // this needs unaligned reads
    {
        // U is the unsigned type that is exactly one slot wide
        static if (factor == bytesPerWord)
            alias U = ubyte;
        else static if (factor == bytesPerWord/2)
            alias U = ushort;
        else static if (factor == bytesPerWord/4)
            alias U = uint;
        else static if (size_t.sizeof == 8 && factor == bytesPerWord/8)
            alias U = ulong;

        // Reads element idx; CTFE and big-endian builds use the shift/mask
        // path instead of the pointer reinterpretation.
        T opIndex(size_t idx) inout
        {
            T ret;
            version (LittleEndian)
                ret = __ctfe ? simpleIndex(idx) :
                    cast(inout(T))(cast(U*) origin)[idx];
            else
                ret = simpleIndex(idx);
            return ret;
        }

        static if (isBitPacked!T) // lack of user-defined implicit conversion
        {
            // Forwards to the overload taking the underlying value type.
            void opIndexAssign(T val, size_t idx)
            {
                return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
            }
        }

        // Writes element idx; same path selection as opIndex.
        void opIndexAssign(TypeOfBitPacked!T val, size_t idx)
        {
            version (LittleEndian)
            {
                if (__ctfe)
                    simpleWrite(val, idx);
                else
                    (cast(U*) origin)[idx] = cast(U) val;
            }
            else
                simpleWrite(val, idx);
        }
    }
    else
    {
        // Sub-byte (or otherwise unsuitable) slot widths: always go through
        // the shift/mask implementation.
        T opIndex(size_t n) inout
        {
            return simpleIndex(n);
        }

        static if (isBitPacked!T) // lack of user-defined implicit conversion
        {
            // Forwards to the overload taking the underlying value type.
            void opIndexAssign(T val, size_t idx)
            {
                return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
            }
        }

        void opIndexAssign(TypeOfBitPacked!T val, size_t n)
        {
            return simpleWrite(val, n);
        }
    }

private:
    // factor - number of elements in one machine word
    // mask - `bits` low-order ones, selects a single element
    enum factor = size_t.sizeof*8/bits, mask = 2^^bits-1;
    enum bytesPerWord =  size_t.sizeof;
    // first word of the packed data, not owned
    size_t* origin;
}
1273 
// data is packed only by power of two sized packs per word,
// thus avoiding mul/div overhead at the cost of ultimate packing
// this construct doesn't own memory, only provides access, see MultiArray for usage
//
// A view covers `limit` items that start `ofs` items past `ptr.origin`;
// every index taken by the public members is relative to `ofs`.
struct PackedArrayViewImpl(T, size_t bits)
{
pure nothrow:

    // Creates a view of `items` elements starting `offset` elements past
    // the first element stored at `origin`.
    this(inout(size_t)* origin, size_t offset, size_t items) inout @safe
    {
        ptr = inout(PackedPtr!(T))(origin);
        ofs = offset;
        limit = items;
    }

    // Returns: true if every element in [s, e) is zero.
    bool zeros(size_t s, size_t e)
    in
    {
        assert(s <= e);
    }
    do
    {
        s += ofs;
        e += ofs;
        immutable pad_s = roundUp(s);
        // The range ends before the next word boundary: test it element by
        // element, never looking at items beyond e.
        if (pad_s >= e)
        {
            foreach (i; s .. e)
                if (ptr[i])
                    return false;
            return true;
        }
        immutable pad_e = roundDown(e);
        size_t i;
        // leading items up to the first word boundary
        for (i=s; i<pad_s; i++)
            if (ptr[i])
                return false;
        // all in between is x*factor elements
        for (size_t j=i/factor; i<pad_e; i+=factor, j++)
            if (ptr.origin[j])
                return false;
        // trailing items after the last whole word
        for (; i<e; i++)
            if (ptr[i])
                return false;
        return true;
    }

    // Reads element idx of the view.
    T opIndex(size_t idx) inout
    in
    {
        assert(idx < limit);
    }
    do
    {
        return ptr[ofs + idx];
    }

    static if (isBitPacked!T) // lack of user-defined implicit conversion
    {
        void opIndexAssign(T val, size_t idx)
        {
            return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
        }
    }

    // Writes element idx of the view.
    void opIndexAssign(TypeOfBitPacked!T val, size_t idx)
    in
    {
        assert(idx < limit);
    }
    do
    {
        ptr[ofs + idx] = val;
    }

    static if (isBitPacked!T) // lack of user-defined implicit conversions
    {
        void opSliceAssign(T val, size_t start, size_t end)
        {
            opSliceAssign(cast(TypeOfBitPacked!T) val, start, end);
        }
    }

    // Fills elements [start, end) with val, writing whole words wherever
    // the range covers them completely.
    void opSliceAssign(TypeOfBitPacked!T val, size_t start, size_t end)
    in
    {
        assert(start <= end);
        assert(end <= limit);
    }
    do
    {
        // account for ofsetted view
        start += ofs;
        end += ofs;
        // rounded to factor granularity
        immutable pad_start = roundUp(start);// rounded up
        if (pad_start >= end) //rounded up >= then end of slice
        {
            //nothing to gain, use per element assignment
            foreach (i; start .. end)
                ptr[i] = val;
            return;
        }
        immutable pad_end = roundDown(end); // rounded down
        size_t i;
        for (i=start; i<pad_start; i++)
            ptr[i] = val;
        // all in between is x*factor elements
        if (pad_start != pad_end)
        {
            immutable repval = replicateBits!(factor, bits)(val);
            for (size_t j=i/factor; i<pad_end; i+=factor, j++)
                ptr.origin[j] = repval;// so speed it up by factor
        }
        for (; i<end; i++)
            ptr[i] = val;
    }

    // Sub-view over elements [from, to) of this view.
    auto opSlice(size_t from, size_t to)inout
    in
    {
        assert(from <= to);
        // `to` is relative to this view, exactly like the index in opIndex,
        // so it is bounded by limit alone
        assert(to <= limit);
    }
    do
    {
        return typeof(this)(ptr.origin, ofs + from, to - from);
    }

    auto opSlice(){ return opSlice(0, length); }

    // Element-wise equality; compares whole words when both views are
    // word-aligned and span a whole number of words.
    bool opEquals(T)(auto ref T arr) const
    {
        if (limit != arr.limit)
           return false;
        size_t s1 = ofs, s2 = arr.ofs;
        size_t e1 = s1 + limit, e2 = s2 + limit;
        if (s1 % factor == 0 && s2 % factor == 0 && length % factor == 0)
        {
            return ptr.origin[s1/factor .. e1/factor]
                == arr.ptr.origin[s2/factor .. e2/factor];
        }
        for (size_t i=0;i<limit; i++)
            if (this[i] != arr[i])
                return false;
        return true;
    }

    @property size_t length()const{ return limit; }

private:
    // nearest multiple of factor that is >= val
    auto roundUp()(size_t val){ return (val+factor-1)/factor*factor; }
    // nearest multiple of factor that is <= val
    auto roundDown()(size_t val){ return val/factor*factor; }
    // factor - number of elements in one machine word
    enum factor = size_t.sizeof*8/bits;
    PackedPtr!(T) ptr;
    size_t ofs, limit;
}
1431 
1432 
// Random-access window [from, to) over any indexable container T that is
// reached through a pointer; the container itself is never copied.
private struct SliceOverIndexed(T)
{
    enum assignableIndex = is(typeof((){ T.init[0] = Item.init; }));
    enum assignableSlice = is(typeof((){ T.init[0 .. 0] = Item.init; }));

    // Element idx of the window.
    auto opIndex(size_t idx)const
    in
    {
        assert(idx < to - from);
    }
    do
    {
        return (*arr)[from+idx];
    }

    static if (assignableIndex)
    {
        void opIndexAssign(Item val, size_t idx)
        in
        {
            assert(idx < to - from);
        }
        do
        {
           (*arr)[from+idx] = val;
        }
    }

    // Narrower window; a and b are relative to the current start.
    auto opSlice(size_t a, size_t b)
    {
        return typeof(this)(from+a, from+b, arr);
    }

    // static if (assignableSlice)
    // Forwards a slice assignment to the underlying container.
    void opSliceAssign(Value)(Value val, size_t start, size_t end)
    {
        (*arr)[start+from .. end+from] = val;
    }

    auto opSlice()
    {
        return typeof(this)(from, to, arr);
    }

    @property size_t length()const { return to-from;}

    auto opDollar()const { return length; }

    @property bool empty()const { return from == to; }

    @property auto front()const { return (*arr)[from]; }

    static if (assignableIndex)
    {
        @property void front(Item val) { (*arr)[from] = val; }
    }

    @property auto back()const { return (*arr)[to-1]; }

    static if (assignableIndex)
    {
        @property void back(Item val) { (*arr)[to-1] = val; }
    }

    @property auto save() inout { return this; }

    void popFront() {   from++; }

    void popBack() {    to--; }

    // Element-wise comparison with anything that has length and indexing.
    // Note: the parameter is named like the `arr` member and hides it here.
    bool opEquals(Other)(auto ref Other arr) const
    {
        if (arr.length != length)
            return false;
        foreach (pos; 0 .. length)
        {
            if (this[pos] != arr[pos])
                return false;
        }
        return true;
    }
private:
    alias Item = typeof(T.init[0]);
    size_t from, to;
    T* arr;
}
1510 
1511 @safe pure nothrow @nogc unittest
1512 {
1513     static assert(isRandomAccessRange!(SliceOverIndexed!(int[])));
1514 }
1515 
// Builds a read-only window [a, b) over the container pointed to by `x`.
SliceOverIndexed!(const(T)) sliceOverIndexed(T)(size_t a, size_t b, const(T)* x)
if (is(Unqual!T == T))
{
    alias View = typeof(return);
    return View(a, b, x);
}
1521 
1522 // BUG? inout is out of reach
1523 //...SliceOverIndexed.arr only parameters or stack based variables can be inout
// Builds a mutable window [a, b) over the container pointed to by `x`.
SliceOverIndexed!T sliceOverIndexed(T)(size_t a, size_t b, T* x)
if (is(Unqual!T == T))
{
    alias View = typeof(return);
    return View(a, b, x);
}
1529 
// Exercises SliceOverIndexed through sliceOverIndexed: range primitives,
// indexing, re-slicing, comparison and slice assignment.
@system unittest
{
    int[] idxArray = [2, 3, 5, 8, 13];
    auto sliced = sliceOverIndexed(0, idxArray.length, &idxArray);

    assert(!sliced.empty);
    assert(sliced.front == 2);
    sliced.front = 1;
    assert(sliced.front == 1);
    assert(sliced.back == 13);
    sliced.popFront();
    assert(sliced.front == 3);
    assert(sliced.back == 13);
    sliced.back = 11;
    assert(sliced.back == 11);
    sliced.popBack();

    assert(sliced.front == 3);
    assert(sliced[$-1] == 8);
    sliced = sliced[];
    assert(sliced[0] == 3);
    assert(sliced.back == 8);
    sliced = sliced[1..$];
    assert(sliced.front == 5);
    sliced = sliced[0..$-1];
    assert(sliced[$-1] == 5);

    int[] other = [2, 5];
    assert(sliced[] == sliceOverIndexed(1, 2, &other));
    sliceOverIndexed(0, 2, &idxArray)[0 .. 2] = -1;
    assert(idxArray[0 .. 2] == [-1, -1]);
    // An empty window over a null array must report itself as empty.
    // (The slice was previously taken over idxArray, leaving nullArr unused.)
    uint[] nullArr = null;
    auto nullSlice = sliceOverIndexed(0, 0, &nullArr);
    assert(nullSlice.empty);
    assert(nullSlice.length == 0);
}
1565 
// Wraps `items` packed elements stored at `ptr` in a view that starts at offset 0.
private inout(PackedArrayView!T) packedArrayView(T)(inout(size_t)* ptr, size_t items)
{
    alias View = typeof(return);
    return View(ptr, 0, items);
}
1570 
1571 
1572 //============================================================================
1573 // Partially unrolled binary search using Shar's method
1574 //============================================================================
1575 
/*
    Generates the source of a fully unrolled binary-search tail, meant to be
    mixed into a scope that declares `idx`, `m`, `range`, `needle` and `pred`.

    The generated code switches on the number of significant bits of `m` and
    falls through one probe per remaining power of two, ending with a final
    probe of `range[idx]`.

    Params:
        size = upper bound (a power of two, or zero) on the step `m`;
               one `case` is emitted for every power of two below it.
    Returns: D source code as a string.
*/
string genUnrolledSwitchSearch(size_t size) @safe pure nothrow
{
    import core.bitop : bsr;
    import std.array : replace;
    import std.conv : to;
    assert(isPow2OrZero(size));
    // bsr(0) is undefined, so both the generator and the generated code
    // treat a zero argument explicitly as "no halving steps left".
    string code = `
    import core.bitop : bsr;
    auto power = m ? bsr(m)+1 : 0;
    switch (power){`;
    immutable levels = size ? bsr(size) : 0;
    size_t i = levels;
    foreach_reverse (val; 0 .. levels)
    {
        auto v = size_t(1) << val; // step probed by this case
        code ~= `
        case pow:
            if (pred(range[idx+m], needle))
                idx +=  m;
            goto case;
        `.replace("m", to!string(v))
        .replace("pow", to!string(i));
        i--;
    }
    // Final probe of the element at idx itself.
    code ~= `
        case 0:
            if (pred(range[idx], needle))
                idx += 1;
            goto default;
        `;
    code ~= `
        default:
    }`;
    return code;
}
1610 
// True when `sz` has at most one bit set, i.e. it is zero or a power of two.
bool isPow2OrZero(size_t sz) @safe pure nothrow @nogc
{
    // See also: std.math.isPowerOf2()
    // Clearing the lowest set bit leaves nothing behind iff at most one bit was set.
    immutable size_t withoutLowestBit = sz & (sz - 1);
    return !withoutLowestBit;
}
1616 
/*
    Branch-light lower bound over a range whose length is a power of two.

    Params:
        pred = binary predicate called as `pred(element, needle)`
        range = random-access range, partitioned with respect to `pred`
        needle = value to search for
    Returns: the number of leading elements for which `pred(element, needle)`
        holds; 0 for an empty range.
*/
size_t uniformLowerBound(alias pred, Range, T)(Range range, T needle)
if (is(T : ElementType!Range))
{
    assert(isPow2OrZero(range.length));
    // Zero passes the assert above, but the final probe below would then
    // read range[0] out of bounds.
    if (range.length == 0)
        return 0;
    size_t idx = 0, m = range.length/2;
    // Halve the step each round, advancing idx past elements satisfying pred.
    while (m != 0)
    {
        if (pred(range[idx+m], needle))
            idx += m;
        m /= 2;
    }
    // idx now names the last candidate; count it too if it satisfies pred.
    if (pred(range[idx], needle))
        idx += 1;
    return idx;
}
1632 
/*
    Same contract as a power-of-two lower bound, but the last steps of the
    search are unrolled into a generated `switch`.

    Params:
        pred = binary predicate called as `pred(element, needle)`
        range = random-access range whose length is a power of two (or zero)
        needle = value to search for
    Returns: the number of leading elements for which `pred(element, needle)`
        holds; 0 for an empty range.
*/
size_t switchUniformLowerBound(alias pred, Range, T)(Range range, T needle)
if (is(T : ElementType!Range))
{
    assert(isPow2OrZero(range.length));
    // Zero passes the assert above, but the unrolled tail always ends by
    // probing range[idx], which does not exist for an empty range.
    if (range.length == 0)
        return 0;
    size_t idx = 0, m = range.length/2;
    enum max = 1 << 10;
    // Plain loop while the step is too large for the unrolled switch.
    while (m >= max)
    {
        if (pred(range[idx+m], needle))
            idx += m;
        m /= 2;
    }
    // Here m < max; the mixed-in code reads m and updates idx.
    mixin(genUnrolledSwitchSearch(max));
    return idx;
}
1648 
/*
    Lifts a lower-bound search that only handles power-of-two lengths
    (`uniLowerBound`) to ranges of arbitrary length: one probe decides
    whether the answer lies in the leading power-of-two block or in a
    power-of-two block aligned to the end of the range.
*/
template sharMethod(alias uniLowerBound)
{
    size_t sharMethod(alias _pred="a<b", Range, T)(Range range, T needle)
        if (is(T : ElementType!Range))
    {
        import std.functional : binaryFun;
        import std.math : nextPow2, truncPow2;
        alias pred = binaryFun!_pred;

        if (range.length == 0)
            return 0;
        if (isPow2OrZero(range.length))
            return uniLowerBound!pred(range, needle);

        // Largest power of two that fits into the range.
        size_t head = truncPow2(range.length);
        if (!pred(range[head-1], needle))
            return uniLowerBound!pred(range[0 .. head], needle);

        // search in another 2^^k area that fully covers the tail of range
        size_t tail = nextPow2(range.length - head + 1);
        return range.length - tail + uniLowerBound!pred(range[$-tail..$], needle);
    }
}
1671 
1672 alias sharLowerBound = sharMethod!uniformLowerBound;
1673 alias sharSwitchLowerBound = sharMethod!switchUniformLowerBound;
1674 
// Cross-checks both Shar-method searches against std.range's lowerBound
// on a sorted array of multiples of 5, then on an empty array.
@safe unittest
{
    import std.array : array;
    import std.range : assumeSorted, iota;

    // Reference implementation: length of the standard lower bound.
    auto stdLowerBound(T)(T[] range, T needle)
    {
        return assumeSorted(range).lowerBound(needle).length;
    }
    immutable MAX = 5*1173;
    auto arr = array(iota(5, MAX, 5));
    assert(arr.length == MAX/5-1);
    // Probe every value from below the first element to beyond the last.
    foreach (i; 0 .. MAX+5)
    {
        auto st = stdLowerBound(arr, i);
        assert(st == sharLowerBound(arr, i));
        assert(st == sharSwitchLowerBound(arr, i));
    }
    // Empty input must be handled as well.
    arr = [];
    auto st = stdLowerBound(arr, 33);
    assert(st == sharLowerBound(arr, 33));
    assert(st == sharSwitchLowerBound(arr, 33));
}
1698 //============================================================================
1699 
1700 @safe
1701 {
// hope to see similar stuff in public interface... once Allocators are out
1703 //@@@BUG moveFront and friends? dunno, for now it's POD-only
1704 
/*
    Replaces dest[from .. to] with the contents of `stuff`, resizing `dest`
    as needed (through `dest.length` when Policy is void, otherwise through
    `Policy.realloc`).

    Returns: the index just past the inserted elements, `from + stuff.length`.
*/
@trusted size_t genericReplace(Policy=void, T, Range)
    (ref T dest, size_t from, size_t to, Range stuff)
{
    import std.algorithm.mutation : copy;
    size_t removed = to - from;
    size_t stuff_end = from+stuff.length;

    if (stuff.length == removed)
    {// same size: overwrite in place
        copy(stuff, dest[from .. to]);
        return stuff_end;
    }

    if (stuff.length > removed)
    {// replace increases length
        size_t growth = stuff.length - removed;
        static if (is(Policy == void))
            dest.length = dest.length+growth;//@@@BUG lame @property
        else
            dest = Policy.realloc(dest, dest.length+growth);
        // shift the old tail right to open a gap, then fill the gap
        copyBackwards(dest[to .. dest.length-growth],
            dest[to+growth .. dest.length]);
        copyForward(stuff, dest[from .. stuff_end]);
        return stuff_end;
    }

    // replace decreases length
    size_t shrink = removed - stuff.length;
    copy(stuff, dest[from .. stuff_end]);
    // pull the old tail left over the leftover gap before trimming
    copyForward(dest[to .. dest.length],
        dest[stuff_end .. dest.length-shrink]);
    static if (is(Policy == void))
        dest.length = dest.length - shrink;//@@@BUG lame @property
    else
        dest = Policy.realloc(dest, dest.length-shrink);
    return stuff_end;
}
1739 
1740 
// Storage policy backed by the garbage collector: plain D array operations.
@safe private struct GcPolicy
{
    import std.traits : isDynamicArray;

    // Fresh mutable copy of `arr`.
    static T[] dup(T)(const T[] arr)
    {
        return arr.dup;
    }

    // New array of `size` default-initialized elements.
    static T[] alloc(T)(size_t size)
    {
        return new T[size];
    }

    // Resize `arr` to `sz` elements and return the result.
    static T[] realloc(T)(T[] arr, size_t sz)
    {
        arr.length = sz;
        return arr;
    }

    // Replace dest[from .. to] with the elements of `stuff`.
    static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
    {
        replaceInPlace(dest, from, to, stuff);
    }

    // Append either a whole input range or a single value.
    static void append(T, V)(ref T[] arr, V value)
    {
        static if (isInputRange!V)
            insertInPlace(arr, arr.length, value);
        else
            arr ~= force!T(value);
    }

    // Drop the reference to the array; the memory itself is left to the GC.
    static void destroy(T)(ref T arr) pure // pure required for -dip25, inferred for -dip1000
        if (isDynamicArray!T)
    {
        static if (is(Unqual!T == T))
        {
            // debug builds overwrite the old contents with a marker value first
            debug
            {
                arr[] = cast(typeof(T.init[0]))(0xdead_beef);
            }
        }
        arr = null;
    }
}
1795 
1796 // ditto
/*
    Storage policy backed by the C heap (malloc/realloc/free wrappers).
    Arrays handed out here must be released with `destroy`.
*/
@safe struct ReallocPolicy
{
    import std.range.primitives : hasLength;

    // Fresh heap copy of `arr`.
    static T[] dup(T)(const T[] arr)
    {
        auto result = alloc!T(arr.length);
        result[] = arr[];
        return result;
    }

    // Allocate room for `size` elements (contents uninitialized).
    // Aborts via assert(0) if the byte count overflows.
    static T[] alloc(T)(size_t size) @trusted
    {
        import std.internal.memory : enforceMalloc;

        // A zero-byte malloc may legitimately return null, which must not be
        // mistaken for an allocation failure; an empty array needs no storage.
        if (!size)
            return null;

        import core.checkedint : mulu;
        bool overflow;
        size_t nbytes = mulu(size, T.sizeof, overflow);
        if (overflow) assert(0);

        auto ptr = cast(T*) enforceMalloc(nbytes);
        return ptr[0 .. size];
    }

    // Resize `arr` to `size` elements; size 0 frees the block and yields null.
    static T[] realloc(T)(scope T[] arr, size_t size) @trusted
    {
        import std.internal.memory : enforceRealloc;
        if (!size)
        {
            destroy(arr);
            return null;
        }

        import core.checkedint : mulu;
        bool overflow;
        size_t nbytes = mulu(size, T.sizeof, overflow);
        if (overflow) assert(0);

        auto ptr = cast(T*) enforceRealloc(arr.ptr, nbytes);
        return ptr[0 .. size];
    }

    // Replace dest[from .. to] with the elements of `stuff`.
    static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
    {
        genericReplace!(ReallocPolicy)(dest, from, to, stuff);
    }

    // Append a single value.
    static void append(T, V)(ref T[] arr, V value)
        if (!isInputRange!V)
    {
        if (arr.length == size_t.max) assert(0);
        arr = realloc(arr, arr.length+1);
        arr[$-1] = force!T(value);
    }

    pure @safe unittest
    {
        int[] arr;
        // the buffer is malloc'ed, so release it explicitly
        scope(exit) ReallocPolicy.destroy(arr);
        ReallocPolicy.append(arr, 3);

        import std.algorithm.comparison : equal;
        assert(equal(arr, [3]));
    }

    // Append all elements of a range with known length.
    static void append(T, V)(ref T[] arr, V value)
        if (isInputRange!V && hasLength!V)
    {
        import core.checkedint : addu;
        bool overflow;
        size_t nelems = addu(arr.length, value.length, overflow);
        if (overflow) assert(0);

        arr = realloc(arr, nelems);

        import std.algorithm.mutation : copy;
        copy(value, arr[$-value.length..$]);
    }

    pure @safe unittest
    {
        int[] arr;
        // the buffer is malloc'ed, so release it explicitly
        scope(exit) ReallocPolicy.destroy(arr);
        ReallocPolicy.append(arr, [1,2,3]);

        import std.algorithm.comparison : equal;
        assert(equal(arr, [1,2,3]));
    }

    // Free the block (if any) and reset `arr` to null.
    static void destroy(T)(scope ref T[] arr) @trusted
    {
        import core.memory : pureFree;
        if (arr.ptr)
            pureFree(arr.ptr);
        arr = null;
    }
}
1892 
1893 //build hack
1894 alias _RealArray = CowArray!ReallocPolicy;
1895 
// Checks ReallocPolicy.replaceImpl for growing, shrinking and
// same-position replacements.
pure @safe unittest
{
    import std.algorithm.comparison : equal;

    with(ReallocPolicy)
    {
        // Replaces orig[from .. to] with toReplace, compares against result,
        // and frees orig on the way out.
        bool test(T, U, V)(T orig, size_t from, size_t to, U toReplace, V result,
                   string file = __FILE__, size_t line = __LINE__)
        {
            {
                replaceImpl(orig, from, to, toReplace);
                scope(exit) destroy(orig);
                if (!equal(orig, result))
                    return false;
            }
            return true;
        }
        // Copies the arguments into a policy-allocated array.
        static T[] arr(T)(T[] args... )
        {
            return dup(args);
        }

        assert(test(arr([1, 2, 3, 4]), 0, 0, [5, 6, 7], [5, 6, 7, 1, 2, 3, 4]));
        assert(test(arr([1, 2, 3, 4]), 0, 2, cast(int[])[], [3, 4]));
        assert(test(arr([1, 2, 3, 4]), 0, 4, [5, 6, 7], [5, 6, 7]));
        assert(test(arr([1, 2, 3, 4]), 0, 2, [5, 6, 7], [5, 6, 7, 3, 4]));
        assert(test(arr([1, 2, 3, 4]), 2, 3, [5, 6, 7], [1, 2, 5, 6, 7, 4]));
    }
}
1925 
/**
    Tests if `T` is some kind of set of code points, i.e. an instance of
    $(LREF InversionList). Intended for template constraints.
*/
public template isCodepointSet(T)
{
    enum isCodepointSet = is(T == InversionList!(Args), Args...);
}
1936 
/**
    Tests if `T` is a pair of integers that implicitly convert to `V`.
    The following code must compile for any pair `T`:
    ---
    (T x){ V a = x[0]; V b = x[1];}
    ---
    The following must not compile:
     ---
    (T x){ V c = x[2];}
    ---
*/
public template isIntegralPair(T, V=uint)
{
    // Both components must be readable as V ...
    static if (!is(typeof((T x){ V a = x[0]; V b = x[1];})))
        enum isIntegralPair = false;
    else
        // ... and there must be no third component.
        enum isIntegralPair = !is(typeof((T x){ V c = x[2]; }));
}
1953 
1954 
1955 /**
1956     The recommended default type for set of $(CODEPOINTS).
1957     For details, see the current implementation: $(LREF InversionList).
1958 */
1959 public alias CodepointSet = InversionList!GcPolicy;
1960 
1961 
1962 //@@@BUG: std.typecons tuples depend on std.format to produce fields mixin
1963 // which relies on std.uni.isGraphical and this chain blows up with Forward reference error
1964 // hence below doesn't seem to work
1965 // public alias CodepointInterval = Tuple!(uint, "a", uint, "b");
1966 
/**
    The recommended pair type for representing [a, b$(RPAREN) intervals of
    $(CODEPOINTS), as used in $(LREF InversionList). It plays the role a
    $(REF Tuple, std,_typecons) of two `uint`s would.
    Any interval type should pass $(LREF isIntegralPair) trait.
*/
public struct CodepointInterval
{
pure:
    // Storage: [0] is the inclusive lower bound, [1] the exclusive upper bound.
    uint[2] _tuple;
    alias _tuple this;

@safe pure nothrow @nogc:

    /// Builds the interval [low, high$(RPAREN).
    this(uint low, uint high)
    {
        _tuple[0] = low;
        _tuple[1] = high;
    }
    /// Component-wise comparison with anything indexable at 0 and 1.
    bool opEquals(T)(T val) const
    {
        if (_tuple[0] != val[0])
            return false;
        return _tuple[1] == val[1];
    }
    /// Lower bound, by reference.
    @property ref inout(uint) a() inout { return _tuple[0]; }
    /// Upper bound, by reference.
    @property ref inout(uint) b() inout { return _tuple[1]; }
}
1992 
1993 /**
1994     $(P
1995     `InversionList` is a set of $(CODEPOINTS)
1996     represented as an array of open-right [a, b$(RPAREN)
1997     intervals (see $(LREF CodepointInterval) above).
1998     The name comes from the way the representation reads left to right.
1999     For instance a set of all values [10, 50$(RPAREN), [80, 90$(RPAREN),
2000     plus a singular value 60 looks like this:
2001     )
2002     ---
2003     10, 50, 60, 61, 80, 90
2004     ---
2005     $(P
    The way to read this is: start with negative meaning that all numbers
    smaller than the next one are not present in this set (and positive -
2008     the contrary). Then switch positive/negative after each
2009     number passed from left to right.
2010     )
2011     $(P This way negative spans until 10, then positive until 50,
2012     then negative until 60, then positive until 61, and so on.
2013     As seen this provides a space-efficient storage of highly redundant data
2014     that comes in long runs. A description which Unicode $(CHARACTER)
2015     properties fit nicely. The technique itself could be seen as a variation
2016     on $(LINK2 https://en.wikipedia.org/wiki/Run-length_encoding, RLE encoding).
2017     )
2018 
2019     $(P Sets are value types (just like `int` is) thus they
2020         are never aliased.
2021     )
2022         Example:
2023         ---
2024         auto a = CodepointSet('a', 'z'+1);
2025         auto b = CodepointSet('A', 'Z'+1);
2026         auto c = a;
2027         a = a | b;
2028         assert(a == CodepointSet('A', 'Z'+1, 'a', 'z'+1));
2029         assert(a != c);
2030         ---
2031     $(P See also $(LREF unicode) for simpler construction of sets
2032         from predefined ones.
2033     )
2034 
2035     $(P Memory usage is 8 bytes per each contiguous interval in a set.
2036     The value semantics are achieved by using the
2037     $(HTTP en.wikipedia.org/wiki/Copy-on-write, COW) technique
2038     and thus it's $(RED not) safe to cast this type to $(D_KEYWORD shared).
2039     )
2040 
2041     Note:
2042     $(P It's not recommended to rely on the template parameters
2043     or the exact type of a current $(CODEPOINT) set in `std.uni`.
2044     The type and parameters may change when the standard
2045     allocators design is finalized.
2046     Use $(LREF isCodepointSet) with templates or just stick with the default
2047     alias $(LREF CodepointSet) throughout the whole code base.
2048     )
2049 */
2050 public struct InversionList(SP=GcPolicy)
2051 {
2052     import std.range : assumeSorted;
2053 
2054     /**
2055         Construct from another code point set of any type.
2056     */
2057     this(Set)(Set set) pure
2058         if (isCodepointSet!Set)
2059     {
2060         uint[] arr;
2061         foreach (v; set.byInterval)
2062         {
2063             arr ~= v.a;
2064             arr ~= v.b;
2065         }
2066         data = CowArray!(SP).reuse(arr);
2067     }
2068 
2069     /**
2070         Construct a set from a forward range of code point intervals.
2071     */
2072     this(Range)(Range intervals) pure
2073         if (isForwardRange!Range && isIntegralPair!(ElementType!Range))
2074     {
2075         uint[] arr;
2076         foreach (v; intervals)
2077         {
2078             SP.append(arr, v.a);
2079             SP.append(arr, v.b);
2080         }
2081         data = CowArray!(SP).reuse(arr);
2082         sanitize(); //enforce invariant: sort intervals etc.
2083     }
2084 
2085     //helper function that avoids sanity check to be CTFE-friendly
2086     private static fromIntervals(Range)(Range intervals) pure
2087     {
2088         import std.algorithm.iteration : map;
2089         import std.range : roundRobin;
2090         auto flattened = roundRobin(intervals.save.map!"a[0]"(),
2091             intervals.save.map!"a[1]"());
2092         InversionList set;
2093         set.data = CowArray!(SP)(flattened);
2094         return set;
2095     }
2096     //ditto untill sort is CTFE-able
2097     private static fromIntervals()(uint[] intervals...) pure
2098     in
2099     {
2100         import std.conv : text;
2101         assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
2102         for (uint i = 0; i < intervals.length; i += 2)
2103         {
2104             auto a = intervals[i], b = intervals[i+1];
2105             assert(a < b, text("illegal interval [a, b): ", a, " > ", b));
2106         }
2107     }
2108     do
2109     {
2110         InversionList set;
2111         set.data = CowArray!(SP)(intervals);
2112         return set;
2113     }
2114 
2115     /**
2116         Construct a set from plain values of code point intervals.
2117     */
2118     this()(uint[] intervals...)
2119     in
2120     {
2121         import std.conv : text;
2122         assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
2123         for (uint i = 0; i < intervals.length; i += 2)
2124         {
2125             auto a = intervals[i], b = intervals[i+1];
2126             assert(a < b, text("illegal interval [a, b): ", a, " > ", b));
2127         }
2128     }
2129     do
2130     {
2131         data = CowArray!(SP)(intervals);
2132         sanitize(); //enforce invariant: sort intervals etc.
2133     }
2134 
    ///
    pure @safe unittest
    {
        import std.algorithm.comparison : equal;

        // Latin and Cyrillic lowercase intervals
        auto set = CodepointSet('a', 'z'+1, 'а', 'я'+1);
        foreach (v; 'a'..'z'+1)
            assert(set[v]);
        // Cyrillic lowercase interval
        foreach (v; 'а'..'я'+1)
            assert(set[v]);
        //specific order is not required, intervals may intersect
        auto set2 = CodepointSet('а', 'я'+1, 'a', 'd', 'b', 'z'+1);
        //the same end result
        assert(set2.byInterval.equal(set.byInterval));
        // test constructor this(Range)(Range intervals)
        auto chessPiecesWhite = CodepointInterval(9812, 9818);
        auto chessPiecesBlack = CodepointInterval(9818, 9824);
        auto set3 = CodepointSet([chessPiecesWhite, chessPiecesBlack]);
        foreach (v; '♔'..'♟'+1)
            assert(set3[v]);
    }
2157 
    /**
        Get range that spans all of the $(CODEPOINT) intervals in this $(LREF InversionList).

        Each element exposes the interval bounds as `a` (inclusive) and
        `b` (exclusive).
    */
    @property auto byInterval() scope
    {
        // TODO: change this to data[] once the -dip1000 errors have been fixed
        // see e.g. https://github.com/dlang/phobos/pull/6638
        import std.array : array;
        // NOTE(review): `data.array` presumably produces a copy of the stored
        // bounds rather than a view into them - confirm against CowArray.
        return Intervals!(typeof(data.array))(data.array);
    }
2168 
2169     @safe unittest
2170     {
2171         import std.algorithm.comparison : equal;
2172         import std.typecons : tuple;
2173 
2174         auto set = CodepointSet('A', 'D'+1, 'a', 'd'+1);
2175 
2176         assert(set.byInterval.equal([tuple('A','E'), tuple('a','e')]));
2177     }
2178 
    // Returns all intervals of this set as a newly built array
    // (the interval range over `data[]` is materialized with `array`).
    package(std) @property const(CodepointInterval)[] intervals() const
    {
        import std.array : array;
        return Intervals!(typeof(data[]))(data[]).array;
    }
2184 
2185     /**
2186         Tests the presence of code point `val` in this set.
2187     */
2188     bool opIndex(uint val) const
2189     {
2190         // the <= ensures that searching in  interval of [a, b) for 'a' you get .length == 1
2191         // return assumeSorted!((a,b) => a <= b)(data[]).lowerBound(val).length & 1;
2192         return sharSwitchLowerBound!"a <= b"(data[], val) & 1;
2193     }
2194 
2195     ///
2196     pure @safe unittest
2197     {
2198         auto gothic = unicode.Gothic;
2199         // Gothic letter ahsa
2200         assert(gothic['\U00010330']);
2201         // no ascii in Gothic obviously
2202         assert(!gothic['$']);
2203     }
2204 
2205 
2206     // Linear scan for `ch`. Useful only for small sets.
2207     // TODO:
2208     // used internally in std.regex
2209     // should be properly exposed in a public API ?
2210     package(std) auto scanFor()(dchar ch) const
2211     {
2212         immutable len = data.length;
2213         for (size_t i = 0; i < len; i++)
2214             if (ch < data[i])
2215                 return i & 1;
2216         return 0;
2217     }
2218 
2219     /// Number of $(CODEPOINTS) in this set
2220     @property size_t length()
2221     {
2222         size_t sum = 0;
2223         foreach (iv; byInterval)
2224         {
2225             sum += iv.b - iv.a;
2226         }
2227         return sum;
2228     }
2229 
2230 // bootstrap full set operations from 4 primitives (suitable as a template mixin):
2231 // addInterval, skipUpTo, dropUpTo & byInterval iteration
2232 //============================================================================
2233 public:
2234     /**
2235         $(P Sets support natural syntax for set algebra, namely: )
2236         $(BOOKTABLE ,
2237             $(TR $(TH Operator) $(TH Math notation) $(TH Description) )
2238             $(TR $(TD &) $(TD a ∩ b) $(TD intersection) )
2239             $(TR $(TD |) $(TD a ∪ b) $(TD union) )
2240             $(TR $(TD -) $(TD a ∖ b) $(TD subtraction) )
2241             $(TR $(TD ~) $(TD a ~ b) $(TD symmetric set difference i.e. (a ∪ b) \ (a ∩ b)) )
2242         )
2243     */
2244     This opBinary(string op, U)(U rhs)
2245         if (isCodepointSet!U || is(U:dchar))
2246     {
2247         static if (op == "&" || op == "|" || op == "~")
2248         {// symmetric ops thus can swap arguments to reuse r-value
2249             static if (is(U:dchar))
2250             {
2251                 auto tmp = this;
2252                 mixin("tmp "~op~"= rhs; ");
2253                 return tmp;
2254             }
2255             else
2256             {
2257                 static if (is(Unqual!U == U))
2258                 {
2259                     // try hard to reuse r-value
2260                     mixin("rhs "~op~"= this;");
2261                     return rhs;
2262                 }
2263                 else
2264                 {
2265                     auto tmp = this;
2266                     mixin("tmp "~op~"= rhs;");
2267                     return tmp;
2268                 }
2269             }
2270         }
2271         else static if (op == "-") // anti-symmetric
2272         {
2273             auto tmp = this;
2274             tmp -= rhs;
2275             return tmp;
2276         }
2277         else
2278             static assert(0, "no operator "~op~" defined for Set");
2279     }
2280 
2281     ///
2282     pure @safe unittest
2283     {
2284         import std.algorithm.comparison : equal;
2285         import std.range : iota;
2286 
2287         auto lower = unicode.LowerCase;
2288         auto upper = unicode.UpperCase;
2289         auto ascii = unicode.ASCII;
2290 
2291         assert((lower & upper).empty); // no intersection
2292         auto lowerASCII = lower & ascii;
2293         assert(lowerASCII.byCodepoint.equal(iota('a', 'z'+1)));
2294         // throw away all of the lowercase ASCII
2295         assert((ascii - lower).length == 128 - 26);
2296 
2297         auto onlyOneOf = lower ~ ascii;
2298         assert(!onlyOneOf['Δ']); // not ASCII and not lowercase
2299         assert(onlyOneOf['$']); // ASCII and not lowercase
2300         assert(!onlyOneOf['a']); // ASCII and lowercase
2301         assert(onlyOneOf['я']); // not ASCII but lowercase
2302 
2303         // throw away all cased letters from ASCII
2304         auto noLetters = ascii - (lower | upper);
2305         assert(noLetters.length == 128 - 26*2);
2306     }
2307 
2308     /// The 'op=' versions of the above overloaded operators.
2309     ref This opOpAssign(string op, U)(U rhs)
2310         if (isCodepointSet!U || is(U:dchar))
2311     {
2312         static if (op == "|")    // union
2313         {
2314             static if (is(U:dchar))
2315             {
2316                 this.addInterval(rhs, rhs+1);
2317                 return this;
2318             }
2319             else
2320                 return this.add(rhs);
2321         }
2322         else static if (op == "&")   // intersection
2323                 return this.intersect(rhs);// overloaded
2324         else static if (op == "-")   // set difference
2325                 return this.sub(rhs);// overloaded
2326         else static if (op == "~")   // symmetric set difference
2327         {
2328             auto copy = this & rhs;
2329             this |= rhs;
2330             this -= copy;
2331             return this;
2332         }
2333         else
2334             static assert(0, "no operator "~op~" defined for Set");
2335     }
2336 
2337     /**
2338         Tests the presence of codepoint `ch` in this set,
2339         the same as $(LREF opIndex).
2340     */
2341     bool opBinaryRight(string op: "in", U)(U ch) const
2342         if (is(U : dchar))
2343     {
2344         return this[ch];
2345     }
2346 
2347     ///
2348     pure @safe unittest
2349     {
2350         assert('я' in unicode.Cyrillic);
2351         assert(!('z' in unicode.Cyrillic));
2352     }
2353 
2354 
2355 
2356     /**
2357      * Obtains a set that is the inversion of this set.
2358      *
2359      * See_Also: $(LREF inverted)
2360      */
2361     auto opUnary(string op: "!")()
2362     {
2363         return this.inverted;
2364     }
2365 
2366     /**
2367         A range that spans each $(CODEPOINT) in this set.
2368     */
2369     @property auto byCodepoint()
2370     {
2371         static struct CodepointRange
2372         {
2373             this(This set)
2374             {
2375                 r = set.byInterval;
2376                 if (!r.empty)
2377                     cur = r.front.a;
2378             }
2379 
2380             @property dchar front() const
2381             {
2382                 return cast(dchar) cur;
2383             }
2384 
2385             @property bool empty() const
2386             {
2387                 return r.empty;
2388             }
2389 
2390             void popFront()
2391             {
2392                 cur++;
2393                 while (cur >= r.front.b)
2394                 {
2395                     r.popFront();
2396                     if (r.empty)
2397                         break;
2398                     cur = r.front.a;
2399                 }
2400             }
2401         private:
2402             uint cur;
2403             typeof(This.init.byInterval) r;
2404         }
2405 
2406         return CodepointRange(this);
2407     }
2408 
2409     ///
2410     pure @safe unittest
2411     {
2412         import std.algorithm.comparison : equal;
2413         import std.range : iota;
2414 
2415         auto set = unicode.ASCII;
2416         set.byCodepoint.equal(iota(0, 0x80));
2417     }
2418 
2419     /**
        $(P Obtain textual representation of this set in the form of
2421         open-right intervals and feed it to `sink`.
2422         )
2423         $(P Used by various standard formatting facilities such as
2424          $(REF formattedWrite, std,format), $(REF write, std,stdio),
2425          $(REF writef, std,stdio), $(REF to, std,conv) and others.
2426         )
2427         Example:
2428         ---
2429         import std.conv;
2430         assert(unicode.ASCII.to!string == "[0..128$(RPAREN)");
2431         ---
2432     */
2433 
2434     private import std.format : FormatSpec;
2435 
2436     /***************************************
2437      * Obtain a textual representation of this InversionList
2438      * in form of open-right intervals.
2439      *
2440      * The formatting flag is applied individually to each value, for example:
2441      * $(LI $(B %s) and $(B %d) format the intervals as a [low .. high$(RPAREN) range of integrals)
2442      * $(LI $(B %x) formats the intervals as a [low .. high$(RPAREN) range of lowercase hex characters)
2443      * $(LI $(B %X) formats the intervals as a [low .. high$(RPAREN) range of uppercase hex characters)
2444      */
2445     void toString(Writer)(scope Writer sink, scope const ref FormatSpec!char fmt) /* const */
2446     {
2447         import std.format : formatValue;
2448         auto range = byInterval;
2449         if (range.empty)
2450             return;
2451 
2452         while (1)
2453         {
2454             auto i = range.front;
2455             range.popFront();
2456 
2457             put(sink, "[");
2458             formatValue(sink, i.a, fmt);
2459             put(sink, "..");
2460             formatValue(sink, i.b, fmt);
2461             put(sink, ")");
2462             if (range.empty) return;
2463             put(sink, " ");
2464         }
2465     }
2466 
2467     ///
2468     pure @safe unittest
2469     {
2470         import std.conv : to;
2471         import std.format : format;
2472         import std.uni : unicode;
2473 
2474         assert(unicode.Cyrillic.to!string ==
2475             "[1024..1157) [1159..1320) [7467..7468) [7544..7545) [11744..11776) [42560..42648) [42655..42656)");
2476 
2477         // The specs '%s' and '%d' are equivalent to the to!string call above.
2478         assert(format("%d", unicode.Cyrillic) == unicode.Cyrillic.to!string);
2479 
2480         assert(format("%#x", unicode.Cyrillic) ==
2481             "[0x400..0x485) [0x487..0x528) [0x1d2b..0x1d2c) [0x1d78..0x1d79) [0x2de0..0x2e00) "
2482             ~"[0xa640..0xa698) [0xa69f..0xa6a0)");
2483 
2484         assert(format("%#X", unicode.Cyrillic) ==
2485             "[0X400..0X485) [0X487..0X528) [0X1D2B..0X1D2C) [0X1D78..0X1D79) [0X2DE0..0X2E00) "
2486             ~"[0XA640..0XA698) [0XA69F..0XA6A0)");
2487     }
2488 
2489     pure @safe unittest
2490     {
2491         import std.exception : assertThrown;
2492         import std.format : format, FormatException;
2493         assertThrown!FormatException(format("%a", unicode.ASCII));
2494     }
2495 
2496 
    /**
        Add an interval [a, b$(RPAREN) to this set.

        Params:
            a = lower bound of the interval (inclusive)
            b = upper bound of the interval (exclusive)

        Returns: a reference to this set, which allows calls to be chained.
    */
    ref add()(uint a, uint b)
    {
        addInterval(a, b);
        return this;
    }
2505 
2506     ///
2507     pure @safe unittest
2508     {
2509         CodepointSet someSet;
2510         someSet.add('0', '5').add('A','Z'+1);
2511         someSet.add('5', '9'+1);
2512         assert(someSet['0']);
2513         assert(someSet['5']);
2514         assert(someSet['9']);
2515         assert(someSet['Z']);
2516     }
2517 
2518 private:
2519 
2520   package(std)  // used from: std.regex.internal.parser
2521     ref intersect(U)(U rhs)
2522         if (isCodepointSet!U)
2523     {
2524         Marker mark;
2525         foreach ( i; rhs.byInterval)
2526         {
2527             mark = this.dropUpTo(i.a, mark);
2528             mark = this.skipUpTo(i.b, mark);
2529         }
2530         this.dropUpTo(uint.max, mark);
2531         return this;
2532     }
2533 
2534     ref intersect()(dchar ch)
2535     {
2536         foreach (i; byInterval)
2537             if (i.a <= ch && ch < i.b)
2538                 return this = This.init.add(ch, ch+1);
2539         this = This.init;
2540         return this;
2541     }
2542 
2543     pure @safe unittest
2544     {
2545         assert(unicode.Cyrillic.intersect('-').byInterval.empty);
2546     }
2547 
    // Set difference with a single code point; forwards to subChar and
    // returns whatever subChar returns (a reference to this set).
    ref sub()(dchar ch)
    {
        return subChar(ch);
    }
2552 
2553     // same as the above except that skip & drop parts are swapped
2554   package(std)  // used from: std.regex.internal.parser
2555     ref sub(U)(U rhs)
2556         if (isCodepointSet!U)
2557     {
2558         Marker mark;
2559         foreach (i; rhs.byInterval)
2560         {
2561             mark = this.skipUpTo(i.a, mark);
2562             mark = this.dropUpTo(i.b, mark);
2563         }
2564         return this;
2565     }
2566 
2567   package(std)  // used from: std.regex.internal.parse
2568     ref add(U)(U rhs)
2569         if (isCodepointSet!U)
2570     {
2571         Marker start;
2572         foreach (i; rhs.byInterval)
2573         {
2574             start = addInterval(i.a, i.b, start);
2575         }
2576         return this;
2577     }
2578 
2579 // end of mixin-able part
2580 //============================================================================
2581 public:
2582     /**
2583         Obtains a set that is the inversion of this set.
2584 
2585         See the '!' $(LREF opUnary) for the same but using operators.
2586     */
2587     @property auto inverted()
2588     {
2589         InversionList inversion = this;
2590         if (inversion.data.length == 0)
2591         {
2592             inversion.addInterval(0, lastDchar+1);
2593             return inversion;
2594         }
2595         if (inversion.data[0] != 0)
2596             genericReplace(inversion.data, 0, 0, [0]);
2597         else
2598             genericReplace(inversion.data, 0, 1, cast(uint[]) null);
2599         if (data[data.length-1] != lastDchar+1)
2600             genericReplace(inversion.data,
2601                 inversion.data.length, inversion.data.length, [lastDchar+1]);
2602         else
2603             genericReplace(inversion.data,
2604                 inversion.data.length-1, inversion.data.length, cast(uint[]) null);
2605 
2606         return inversion;
2607     }
2608 
2609     ///
2610     pure @safe unittest
2611     {
2612         auto set = unicode.ASCII;
2613         // union with the inverse gets all of the code points in the Unicode
2614         assert((set | set.inverted).length == 0x110000);
2615         // no intersection with the inverse
2616         assert((set & set.inverted).empty);
2617     }
2618 
    // Generates D source text for a `bool name(dchar ch)` predicate that
    // returns true for code points inside any of the given [a, b) intervals.
    // Params:
    //   range    = intervals to test against; the emitted comparisons assume
    //              they are sorted and non-overlapping (NOTE(review): not
    //              checked here, confirm against callers)
    //   funcName = name for the generated function; "function" is emitted
    //              in its place when the name is empty
    // Returns: the generated source as a string.
    package(std) static string toSourceCode(const(CodepointInterval)[] range, string funcName)
    {
        import std.algorithm.searching : countUntil;
        import std.format : format;
        // below this many intervals a linear chain of ifs is emitted
        enum maxBinary = 3;
        // Emits a braced block testing the intervals one after another.
        static string linearScope(R)(R ivals, string indent)
        {
            string result = indent~"{\n";
            string deeper = indent~"    ";
            foreach (ival; ivals)
            {
                immutable span = ival[1] - ival[0];
                assert(span != 0);
                if (span == 1)
                {
                    // single code point: one equality test
                    result ~= format("%sif (ch == %s) return true;\n", deeper, ival[0]);
                }
                else if (span == 2)
                {
                    // two code points: two equality tests
                    result ~= format("%sif (ch == %s || ch == %s) return true;\n",
                        deeper, ival[0], ival[0]+1);
                }
                else
                {
                    if (ival[0] != 0) // dchar is unsigned and  < 0 is useless
                        result ~= format("%sif (ch < %s) return false;\n", deeper, ival[0]);
                    result ~= format("%sif (ch < %s) return true;\n", deeper, ival[1]);
                }
            }
            result ~= format("%sreturn false;\n%s}\n", deeper, indent); // including empty range of intervals
            return result;
        }

        // Chooses between the linear chain and a further bisection step,
        // depending on how many intervals are left.
        static string binaryScope(R)(R ivals, string indent) @safe
        {
            // time to do unrolled comparisons?
            if (ivals.length < maxBinary)
                return linearScope(ivals, indent);
            else
                return bisect(ivals, ivals.length/2, indent);
        }

        // not used yet: if/else binary search is far better with DMD as of 2.061
        // and GDC is doing fine job either way
        // Emits a switch statement with one case (or case range) per interval.
        static string switchScope(R)(R ivals, string indent)
        {
            string result = indent~"switch (ch){\n";
            string deeper = indent~"    ";
            foreach (ival; ivals)
            {
                if (ival[0]+1 == ival[1])
                {
                    result ~= format("%scase %s: return true;\n",
                        deeper, ival[0]);
                }
                else
                {
                    result ~= format("%scase %s: .. case %s: return true;\n",
                         deeper, ival[0], ival[1]-1);
                }
            }
            result ~= deeper~"default: return false;\n"~indent~"}\n";
            return result;
        }

        // Emits a three-way branch around the interval at `idx`: the
        // intervals before it, the interval itself, the intervals after it.
        static string bisect(R)(R range, size_t idx, string indent)
        {
            string deeper = indent ~ "    ";
            // bisect on one [a, b) interval at idx
            string result = indent~"{\n";
            // less branch, < a
            result ~= format("%sif (ch < %s)\n%s",
                deeper, range[idx][0], binaryScope(range[0 .. idx], deeper));
            // middle point,  >= a && < b
            result ~= format("%selse if (ch < %s) return true;\n",
                deeper, range[idx][1]);
            // greater or equal branch,  >= b
            result ~= format("%selse\n%s",
                deeper, binaryScope(range[idx+1..$], deeper));
            return result~indent~"}\n";
        }

        string code = format("bool %s(dchar ch) @safe pure nothrow @nogc\n",
            funcName.empty ? "function" : funcName);
        // special case first bisection to be on ASCII vs beyond
        // (index of the first interval whose lower bound is above 0x80)
        auto tillAscii = countUntil!"a[0] > 0x80"(range);
        if (tillAscii <= 0) // everything is ASCII or nothing is ascii (-1 & 0)
            code ~= binaryScope(range, "");
        else
            code ~= bisect(range, tillAscii, "");
        return code;
    }
2711 
2712     /**
2713         Generates string with D source code of unary function with name of
2714         `funcName` taking a single `dchar` argument. If `funcName` is empty
2715         the code is adjusted to be a lambda function.
2716 
2717         The function generated tests if the $(CODEPOINT) passed
2718         belongs to this set or not. The result is to be used with string mixin.
2719         The intended usage area is aggressive optimization via meta programming
2720         in parser generators and the like.
2721 
2722         Note: Use with care for relatively small or regular sets. It
2723         could end up being slower than just using multi-staged tables.
2724 
2725         Example:
2726         ---
2727         import std.stdio;
2728 
2729         // construct set directly from [a, b$RPAREN intervals
2730         auto set = CodepointSet(10, 12, 45, 65, 100, 200);
2731         writeln(set);
2732         writeln(set.toSourceCode("func"));
2733         ---
2734 
2735         The above outputs something along the lines of:
2736         ---
2737         bool func(dchar ch)  @safe pure nothrow @nogc
2738         {
2739             if (ch < 45)
2740             {
2741                 if (ch == 10 || ch == 11) return true;
2742                 return false;
2743             }
2744             else if (ch < 65) return true;
2745             else
2746             {
2747                 if (ch < 100) return false;
2748                 if (ch < 200) return true;
2749                 return false;
2750             }
2751         }
2752         ---
2753     */
2754     string toSourceCode(string funcName="")
2755     {
2756         import std.array : array;
2757         auto range = byInterval.array();
2758         return toSourceCode(range, funcName);
2759     }
2760 
2761     /**
2762         True if this set doesn't contain any $(CODEPOINTS).
2763     */
2764     @property bool empty() const
2765     {
2766         return data.length == 0;
2767     }
2768 
2769     ///
2770     pure @safe unittest
2771     {
2772         CodepointSet emptySet;
2773         assert(emptySet.length == 0);
2774         assert(emptySet.empty);
2775     }
2776 
2777 private:
2778     alias This = typeof(this);
2779     alias Marker = size_t;
2780 
2781     // a random-access range of integral pairs
2782     static struct Intervals(Range)
2783     {
2784         import std.range.primitives : hasAssignableElements;
2785 
2786         this(Range sp) scope
2787         {
2788             slice = sp;
2789             start = 0;
2790             end = sp.length;
2791         }
2792 
2793         this(Range sp, size_t s, size_t e) scope
2794         {
2795             slice = sp;
2796             start = s;
2797             end = e;
2798         }
2799 
2800         @property auto front()const
2801         {
2802             immutable a = slice[start];
2803             immutable b = slice[start+1];
2804             return CodepointInterval(a, b);
2805         }
2806 
2807         //may break sorted property - but we need std.sort to access it
2808         //hence package(std) protection attribute
2809         static if (hasAssignableElements!Range)
2810         package(std) @property void front(CodepointInterval val)
2811         {
2812             slice[start] = val.a;
2813             slice[start+1] = val.b;
2814         }
2815 
2816         @property auto back()const
2817         {
2818             immutable a = slice[end-2];
2819             immutable b = slice[end-1];
2820             return CodepointInterval(a, b);
2821         }
2822 
2823         //ditto about package
2824         static if (hasAssignableElements!Range)
2825         package(std) @property void back(CodepointInterval val)
2826         {
2827             slice[end-2] = val.a;
2828             slice[end-1] = val.b;
2829         }
2830 
2831         void popFront()
2832         {
2833             start += 2;
2834         }
2835 
2836         void popBack()
2837         {
2838             end -= 2;
2839         }
2840 
2841         auto opIndex(size_t idx) const
2842         {
2843             immutable a = slice[start+idx*2];
2844             immutable b = slice[start+idx*2+1];
2845             return CodepointInterval(a, b);
2846         }
2847 
2848         //ditto about package
2849         static if (hasAssignableElements!Range)
2850         package(std) void opIndexAssign(CodepointInterval val, size_t idx)
2851         {
2852             slice[start+idx*2] = val.a;
2853             slice[start+idx*2+1] = val.b;
2854         }
2855 
2856         auto opSlice(size_t s, size_t e)
2857         {
2858             return Intervals(slice, s*2+start, e*2+start);
2859         }
2860 
2861         @property size_t length()const {  return slice.length/2; }
2862 
2863         @property bool empty()const { return start == end; }
2864 
2865         @property auto save(){ return this; }
2866     private:
2867         size_t start, end;
2868         Range slice;
2869     }
2870 
    // called after construction from intervals
    // to make sure invariants hold:
    // sorts the intervals by lower bound, merges those that overlap or
    // touch, and shrinks `data` to the merged result.
    void sanitize()
    {
        import std.algorithm.comparison : max;
        import std.algorithm.mutation : SwapStrategy;
        import std.algorithm.sorting : sort;
        if (data.length == 0)
            return;
        alias Ival = CodepointInterval;
        //intervals wrapper for a _range_ over packed array
        auto ivals = Intervals!(typeof(data[]))(data[]);
        //@@@BUG@@@ can't use "a.a < b.a" see
        // https://issues.dlang.org/show_bug.cgi?id=12265
        sort!((a,b) => a.a < b.a, SwapStrategy.stable)(ivals);
        // what follows is a variation on stable remove
        // differences:
        // - predicate is binary, and is tested against
        //   the last kept element (at 'i').
        // - predicate mutates lhs (merges rhs into lhs)
        size_t len = ivals.length;
        size_t i = 0; // last kept interval
        size_t j = 1; // next interval to examine
        while (j < len)
        {
            // ivals[j] starts at or before the end of ivals[i]: merge it in
            if (ivals[i].b >= ivals[j].a)
            {
                ivals[i] = Ival(ivals[i].a, max(ivals[i].b, ivals[j].b));
                j++;
            }
            else //unmergable
            {
                // check if there is a hole after merges
                // (in the best case we do 0 writes to ivals)
                if (j != i+1)
                    ivals[i+1] = ivals[j]; //copy over
                i++;
                j++;
            }
        }
        // number of intervals kept
        len = i + 1;
        // sanity check: kept intervals are non-empty and strictly separated
        // (the last interval is only checked as a right-hand neighbour)
        for (size_t k=0; k + 1 < len; k++)
        {
            assert(ivals[k].a < ivals[k].b);
            assert(ivals[k].b < ivals[k+1].a);
        }
        // two boundaries per interval
        data.length = len * 2;
    }
2919 
2920     // special case for normal InversionList
2921     ref subChar(dchar ch)
2922     {
2923         auto mark = skipUpTo(ch);
2924         if (mark != data.length
2925             && data[mark] == ch && data[mark-1] == ch)
2926         {
2927             // it has split, meaning that ch happens to be in one of intervals
2928             data[mark] = data[mark]+1;
2929         }
2930         return this;
2931     }
2932 
    // Inserts the interval [a, b) into the sorted boundary array `data`,
    // merging it with the intervals it overlaps.
    // Params:
    //   a, b = bounds of the interval, a <= b
    //   hint = index into `data` from which the search for `a` starts
    // Returns: a marker (index into `data`) that add(U) passes back in as
    //   the hint for its next insertion.
    // NOTE(review): the result is computed from genericReplace's return
    //   value, whose exact meaning is defined elsewhere - confirm there.
    Marker addInterval(int a, int b, Marker hint=Marker.init) scope
    in
    {
        assert(a <= b);
    }
    do
    {
        import std.range : assumeSorted, SearchPolicy;
        auto range = assumeSorted(data[]);
        size_t pos;
        // a_idx: number of boundaries below `a` (searching from `hint` on).
        // An odd index means `a` falls inside an existing interval
        // ("positive"), an even one that it falls into a gap ("negative").
        size_t a_idx = hint + range[hint..$].lowerBound!(SearchPolicy.gallop)(a).length;
        if (a_idx == range.length)
        {
            //  [---+++----++++----++++++]
            //  [                         a  b]
            // the new interval lies beyond everything stored: append it
            data.append(a, b);
            return data.length-1;
        }
        // b_idx: number of boundaries below `b`, same parity convention
        size_t b_idx = range[a_idx .. range.length].lowerBound!(SearchPolicy.gallop)(b).length+a_idx;
        // replacement boundaries; only the first to_insert entries are used
        uint[3] buf = void;
        uint to_insert;
        debug(std_uni)
        {
            writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx);
        }
        if (b_idx == range.length)
        {
            //  [-------++++++++----++++++-]
            //  [      s     a                 b]
            // `b` lies beyond the last stored boundary
            if (a_idx & 1)// a in positive
            {
                buf[0] = b;
                to_insert = 1;
            }
            else// a in negative
            {
                buf[0] = a;
                buf[1] = b;
                to_insert = 2;
            }
            pos = genericReplace(data, a_idx, b_idx, buf[0 .. to_insert]);
            return pos - 1;
        }

        // first stored boundary that is not below `b`
        uint top = data[b_idx];

        debug(std_uni)
        {
            writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx);
            writefln("a=%s; b=%s; top=%s;", a, b, top);
        }
        if (a_idx & 1)
        {// a in positive
            if (b_idx & 1)// b in positive
            {
                //  [-------++++++++----++++++-]
                //  [       s    a        b    ]
                buf[0] = top;
                to_insert = 1;
            }
            else // b in negative
            {
                //  [-------++++++++----++++++-]
                //  [       s    a   b         ]
                if (top == b)
                {
                    // `b` coincides with the start of the next interval:
                    // merge with it by taking over its end boundary
                    assert(b_idx+1 < data.length);
                    buf[0] = data[b_idx+1];
                    pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 1]);
                    return pos - 1;
                }
                buf[0] = b;
                buf[1] = top;
                to_insert = 2;
            }
        }
        else
        { // a in negative
            if (b_idx & 1) // b in positive
            {
                //  [----------+++++----++++++-]
                //  [     a     b              ]
                buf[0] = a;
                buf[1] = top;
                to_insert = 2;
            }
            else// b in negative
            {
                //  [----------+++++----++++++-]
                //  [  a       s      b        ]
                if (top == b)
                {
                    // `b` coincides with the start of the next interval:
                    // merge with it by taking over its end boundary
                    assert(b_idx+1 < data.length);
                    buf[0] = a;
                    buf[1] = data[b_idx+1];
                    pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 2]);
                    return pos - 1;
                }
                buf[0] = a;
                buf[1] = b;
                buf[2] = top;
                to_insert = 3;
            }
        }
        // common tail: boundaries a_idx .. b_idx (inclusive) are replaced
        pos = genericReplace(data, a_idx, b_idx+1, buf[0 .. to_insert]);
        debug(std_uni)
        {
            writefln("marker idx: %d; length=%d", pos, data[pos], data.length);
            writeln("inserting ", buf[0 .. to_insert]);
        }
        return pos - 1;
    }
3046 
3047     //
3048     Marker dropUpTo(uint a, Marker pos=Marker.init)
3049     in
3050     {
3051         assert(pos % 2 == 0); // at start of interval
3052     }
3053     do
3054     {
3055         auto range = assumeSorted!"a <= b"(data[pos .. data.length]);
3056         if (range.empty)
3057             return pos;
3058         size_t idx = pos;
3059         idx += range.lowerBound(a).length;
3060 
3061         debug(std_uni)
3062         {
3063             writeln("dropUpTo full length=", data.length);
3064             writeln(pos,"~~~", idx);
3065         }
3066         if (idx == data.length)
3067             return genericReplace(data, pos, idx, cast(uint[])[]);
3068         if (idx & 1)
3069         {   // a in positive
3070             //[--+++----++++++----+++++++------...]
3071             //      |<---si       s  a  t
3072             genericReplace(data, pos, idx, [a]);
3073         }
3074         else
3075         {   // a in negative
3076             //[--+++----++++++----+++++++-------+++...]
3077             //      |<---si              s  a  t
3078             genericReplace(data, pos, idx, cast(uint[])[]);
3079         }
3080         return pos;
3081     }
3082 
    // Advances a marker past everything this set holds below `a`, leaving
    // that content in place. If `a` falls strictly inside an interval, the
    // interval is split at `a`.
    // Params:
    //   a   = code point to skip up to
    //   pos = index into `data` at which the search starts
    // Returns: an even index into `data` (an interval start), or
    //   data.length when nothing is left at or above `a`.
    Marker skipUpTo(uint a, Marker pos=Marker.init)
    out(result)
    {
        assert(result % 2 == 0);// always start of interval
        //(may be  0-width after-split)
    }
    do
    {
        assert(data.length % 2 == 0);
        auto range = assumeSorted!"a <= b"(data[pos .. data.length]);
        // index of the first boundary above `a`
        size_t idx = pos+range.lowerBound(a).length;

        if (idx >= data.length) // could have Marker point to recently removed stuff
            return data.length;

        if (idx & 1)// inside of interval, check for split
        {

            // `top` is the end of the interval containing `a`
            immutable top = data[idx];
            if (top == a)// no need to split, it's end
                return idx+1;
            immutable start = data[idx-1];
            // `a` is the first code point of the interval: no split either
            if (a == start)
                return idx-1;
            // split it up
            // [start, top) becomes [start, a) and [a, top)
            genericReplace(data, idx, idx+1, [a, a, top]);
            return idx+1;        // avoid odd index
        }
        return idx;
    }
3114 
3115     CowArray!SP data;
3116 }
3117 
pure @system unittest
{
    import std.conv : to;
    // ASCII is the single interval [0, 128)
    auto ascii = unicode.ASCII;
    assert(to!string(ascii) == "[0..128)");
}
3123 
// pedantic version for ctfe, and aligned-access only architectures:
// reads the idx-th 3-byte element at `ptr`, one byte at a time
@system private uint safeRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    immutable base = idx * 3;
    immutable uint first = ptr[base];
    immutable uint second = ptr[base+1];
    immutable uint third = ptr[base+2];
    // the bytes occupy disjoint bit ranges, so they can simply be or-ed
    version (LittleEndian)
        return first | (second << 8) | (third << 16);
    else
        return (first << 16) | (second << 8) | third;
}
3135 
// byte-wise counterpart of safeRead24: stores the low 24 bits of `val`
// as the idx-th 3-byte element at `ptr`
@system private void safeWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    immutable base = idx * 3;
    immutable ubyte low = val & 0xFF;
    immutable ubyte middle = (val >> 8) & 0xFF;
    immutable ubyte high = (val >> 16) & 0xFF;
    version (LittleEndian)
    {
        ptr[base] = low;
        ptr[base+1] = middle;
        ptr[base+2] = high;
    }
    else
    {
        ptr[base] = high;
        ptr[base+1] = middle;
        ptr[base+2] = low;
    }
}
3153 
// unaligned x86-like read/write functions
// Reads the idx-th 3-byte element with a single 32-bit load starting at its
// first byte, then discards the one byte that is not part of the element.
@system private uint unalignedRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    immutable uint word = *cast(uint*)(ptr + 3*idx);
    version (LittleEndian)
        return word & 0xFF_FFFF;
    else
        return word >> 8;
}
3163 
// Writes the idx-th 3-byte element with a single 32-bit read-modify-write
// starting at its first byte; the fourth byte touched by the store keeps
// the value it had before.
@system private void unalignedWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    uint* dest = cast(uint*)(ptr + 3*idx);
    immutable uint previous = *dest;
    version (LittleEndian)
        *dest = (previous & 0xFF00_0000) | val;
    else
        *dest = (previous & 0xFF) | (val << 8);
}
3173 
// Reads the idx-th 3-byte element at `ptr`. The single-load variant is used
// only at run time and only where hasUnalignedReads is set; compile-time
// evaluation and all other targets go through the byte-wise variant.
@system private uint read24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    static if (hasUnalignedReads)
    {
        if (!__ctfe)
            return unalignedRead24(ptr, idx);
    }
    return safeRead24(ptr, idx);
}
3181 
// Stores `val` as the idx-th 3-byte element at `ptr`. The single-store
// variant is used only at run time and only where hasUnalignedReads is set;
// compile-time evaluation and all other targets use the byte-wise variant.
@system private void write24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    static if (hasUnalignedReads)
    {
        if (!__ctfe)
        {
            unalignedWrite24(ptr, val, idx);
            return;
        }
    }
    safeWrite24(ptr, val, idx);
}
3189 
// Reference-counted array of uint with copy-on-write behaviour: copies
// share one block of memory until one of them is written to or resized.
// The reference count lives in one extra slot after the payload, so `data`
// is always one element longer than what `length` reports (or empty).
// Memory is obtained and released through the policy `SP`.
struct CowArray(SP=GcPolicy)
{
    import std.range.primitives : hasLength;

  @safe:
    // Wraps `arr` as the payload, using SP.append to add the slot that
    // holds the reference count.
    static auto reuse(uint[] arr)
    {
        CowArray cow;
        cow.data = arr;
        SP.append(cow.data, 1);
        assert(cow.refCount == 1);
        assert(cow.length == arr.length);
        return cow;
    }

    // Construct from a range whose length is known up front.
    this(Range)(Range range)
        if (isInputRange!Range && hasLength!Range)
    {
        import std.algorithm.mutation : copy;
        length = range.length;
        copy(range, data[0..$-1]);
    }

    // Construct from a forward range without length; it is walked once to
    // size the storage and once more to copy the elements.
    this(Range)(Range range)
        if (isForwardRange!Range && !hasLength!Range)
    {
        import std.algorithm.mutation : copy;
        import std.range.primitives : walkLength;
        immutable len = walkLength(range.save);
        length = len;
        copy(range, data[0..$-1]);
    }

    // A copy shares the storage and counts as one more reference.
    this(this)
    {
        if (!empty)
        {
            refCount = refCount + 1;
        }
    }

    // Gives up this reference; the last owner destroys the storage.
    ~this()
    {
        if (!empty)
        {
            immutable cnt = refCount;
            if (cnt == 1)
                SP.destroy(data);
            else
                refCount = cnt - 1;
        }
    }

    // no ref-count for an empty array
    @property bool empty() const { return data.length == 0; }

    // report one less then actual size
    @property size_t length() const
    {
        return data.length ? data.length - 1 : 0;
    }

    //+ an extra slot for ref-count
    // Resizes the payload to `len` elements. Shared storage is left to the
    // other owners and this instance moves to a fresh allocation.
    @property void length(size_t len)
    {
        import std.algorithm.comparison : min;
        import std.algorithm.mutation : copy;
        if (len == 0)
        {
            if (!empty)
                freeThisReference();
            return;
        }
        immutable total = len + 1; // including ref-count
        if (empty)
        {
            data = SP.alloc!uint(total);
            refCount = 1;
            return;
        }
        immutable cur_cnt = refCount;
        if (cur_cnt != 1) // have more references to this memory
        {
            refCount = cur_cnt - 1;
            auto new_data = SP.alloc!uint(total);
            // take shrinking into account
            auto to_copy = min(total, data.length) - 1;
            copy(data[0 .. to_copy], new_data[0 .. to_copy]);
            data = new_data; // before setting refCount!
            refCount = 1;
        }
        else // 'this' is the only reference
        {
            // use the realloc (hopefully in-place operation)
            data = SP.realloc(data, total);
            refCount = 1; // setup a ref-count in the new end of the array
        }
    }

    alias opDollar = length;

    uint opIndex()(size_t idx)const
    {
        return data[idx];
    }

    // Writing to shared storage first gives this instance its own copy.
    void opIndexAssign(uint val, size_t idx)
    {
        auto cnt = refCount;
        if (cnt != 1)
            dupThisReference(cnt);
        data[idx] = val;
    }

    // Mutable slice: the caller may write through it, so shared storage is
    // duplicated before the slice is handed out.
    auto opSlice(size_t from, size_t to)
    {
        if (!empty)
        {
            auto cnt = refCount;
            if (cnt != 1)
                dupThisReference(cnt);
        }
        return data[from .. to];

    }

    // Read-only slice, no duplication.
    auto opSlice(size_t from, size_t to) const
    {
        return data[from .. to];
    }

    // length slices before the ref count
    auto opSlice()
    {
        return opSlice(0, length);
    }

    // ditto
    auto opSlice() const
    {
        return opSlice(0, length);
    }

    // Appends all elements of `range`.
    void append(Range)(Range range)
        if (isInputRange!Range && hasLength!Range && is(ElementType!Range : uint))
    {
        // imported locally, as in every other member that uses `copy`
        import std.algorithm.mutation : copy;
        size_t nl = length + range.length;
        length = nl;
        copy(range, this[nl-range.length .. nl]);
    }

    // Appends the given values.
    void append()(uint[] val...)
    {
        length = length + val.length;
        data[$-val.length-1 .. $-1] = val[];
    }

    // Compares payloads only; the reference counts do not take part.
    bool opEquals()(auto const ref CowArray rhs)const
    {
        if (empty ^ rhs.empty)
            return false; // one is empty and the other isn't
        return empty || data[0..$-1] == rhs.data[0..$-1];
    }

private:
    // ref-count is right after the data
    @property uint refCount() const
    {
        return data[$-1];
    }

    @property void refCount(uint cnt)
    {
        data[$-1] = cnt;
    }

    // Detaches this instance from its storage, destroying the storage when
    // this was the last reference.
    void freeThisReference()
    {
        immutable count = refCount;
        if (count != 1) // have more references to this memory
        {
            // dec shared ref-count
            refCount = count - 1;
            data = [];
        }
        else
            SP.destroy(data);
        assert(!data.ptr);
    }

    // Moves this instance from shared storage to a private copy of it.
    // `count` is the current (shared) reference count.
    void dupThisReference(uint count)
    in
    {
        assert(!empty && count != 1 && count == refCount);
    }
    do
    {
        import std.algorithm.mutation : copy;
        // dec shared ref-count
        refCount = count - 1;
        // copy to the new chunk of RAM
        auto new_data = SP.alloc!uint(data.length);
        // bit-blit old stuff except the counter
        copy(data[0..$-1], new_data[0..$-1]);
        data = new_data; // before setting refCount!
        refCount = 1; // so that this updates the right one
    }

    uint[] data;
}
3402 
// Exercises CowArray (the `u24` names and the "Uint24" label look historical):
// element access, copy-on-write isolation between copies, append, slicing
// and equality, for both storage policies.
pure @safe unittest// Uint24 tests
{
    import std.algorithm.comparison : equal;
    import std.algorithm.mutation : copy;
    import std.conv : text;
    import std.range : iota, chain;
    import std.range.primitives : isBidirectionalRange, isOutputRange;
    // Mutates the argument in place; `u24_c` is a copy taken midway and must
    // keep seeing the old contents no matter what happens to `u24` afterwards.
    void funcRef(T)(ref T u24)
    {
        u24.length = 2;
        u24[1] = 1024;
        T u24_c = u24; // snapshot sharing (or copying) the current contents
        assert(u24[1] == 1024);
        u24.length = 0;
        assert(u24.empty);
        u24.append([1, 2]);
        assert(equal(u24[], [1, 2]));
        u24.append(111);
        assert(equal(u24[], [1, 2, 111]));
        // the snapshot is unaffected by the shrink and the appends
        assert(!u24_c.empty && u24_c[1] == 1024);
        u24.length = 3;
        copy(iota(0, 3), u24[]); // write through the slice as an output range
        assert(equal(u24[], iota(0, 3)));
        assert(u24_c[1] == 1024);
    }

    // Takes the array by value, so nothing done here may leak to the caller.
    void func2(T)(T u24)
    {
        T u24_2 = u24;
        T u24_3;
        u24_3 = u24_2;
        assert(u24_2 == u24_3);
        assert(equal(u24[], u24_2[]));
        assert(equal(u24_2[], u24_3[]));
        funcRef(u24_3); // leaves u24_3 == [0, 1, 2]

        assert(equal(u24_3[], iota(0, 3)));
        // only u24_3 changed; the two other copies still agree
        assert(!equal(u24_2[], u24_3[]));
        assert(equal(u24_2[], u24[]));
        u24_2 = u24_3;
        assert(equal(u24_2[], iota(0, 3)));
        // to test that passed arg is intact outside
        // plus try out opEquals
        u24 = u24_3;
        u24 = T.init;
        u24_3 = T.init;
        assert(u24.empty);
        assert(u24 == u24_3);
        assert(u24 != u24_2);
    }

    static foreach (Policy; AliasSeq!(GcPolicy, ReallocPolicy))
    {{
        alias Range = typeof(CowArray!Policy.init[]);
        alias U24A = CowArray!Policy;
        // the full slice must be usable as a random-access output range
        static assert(isForwardRange!Range);
        static assert(isBidirectionalRange!Range);
        static assert(isOutputRange!(Range, uint));
        static assert(isRandomAccessRange!(Range));

        auto arr = U24A([42u, 36, 100]);
        assert(arr[0] == 42);
        assert(arr[1] == 36);
        arr[0] = 72;
        arr[1] = 0xFE_FEFE;
        assert(arr[0] == 72);
        assert(arr[1] == 0xFE_FEFE);
        assert(arr[2] == 100);
        U24A arr2 = arr;
        assert(arr2[0] == 72);
        arr2[0] = 11;
        // test COW-ness
        assert(arr[0] == 72);
        assert(arr2[0] == 11);
        // set this to about 100M to stress-test COW memory management
        foreach (v; 0 .. 10_000)
            func2(arr);
        // func2 received copies only, so `arr` is unchanged
        assert(equal(arr[], [72, 0xFE_FEFE, 100]));

        // construction from a range, then overwriting a sub-slice
        auto r2 = U24A(iota(0, 100));
        assert(equal(r2[], iota(0, 100)), text(r2[]));
        copy(iota(10, 170, 2), r2[10 .. 90]);
        assert(equal(r2[], chain(iota(0, 10), iota(10, 170, 2), iota(90, 100)))
               , text(r2[]));
    }}
}
3489 
// Checks `add` (interval union into an existing set) and the
// `skipUpTo`/`dropUpTo` helpers for both storage policies.
// Sets are written as sequences of half-open intervals [a, b).
pure @safe unittest// core set primitives test
{
    import std.conv : text;
    alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy);
    foreach (CodeList; AllSets)
    {
        CodeList a;
        //"plug a hole" test
        a.add(10, 20).add(25, 30).add(15, 27);
        assert(a == CodeList(10, 30), text(a));

        auto x = CodeList.init;
        x.add(10, 20).add(30, 40).add(50, 60);

        a = x;
        a.add(20, 49);//[10, 49) [50, 60)
        assert(a == CodeList(10, 49, 50 ,60));

        a = x;
        a.add(20, 50); // bridges every gap: [10, 60)
        assert(a == CodeList(10, 60), text(a));

        // simple unions, mostly edge effects
        x = CodeList.init;
        x.add(10, 20).add(40, 60);

        a = x;
        a.add(10, 25); //[10, 25) [40, 60)
        assert(a == CodeList(10, 25, 40, 60));

        a = x;
        a.add(5, 15); //[5, 20) [40, 60)
        assert(a == CodeList(5, 20, 40, 60));

        a = x;
        a.add(0, 10); // [0, 20) [40, 60)
        assert(a == CodeList(0, 20, 40, 60));

        a = x;
        a.add(0, 5); // prepend
        assert(a == CodeList(0, 5, 10, 20, 40, 60), text(a));

        a = x;
        a.add(5, 20); // [5, 20) [40, 60)
        assert(a == CodeList(5, 20, 40, 60));

        a = x;
        a.add(3, 37); // swallows [10, 20): [3, 37) [40, 60)
        assert(a == CodeList(3, 37, 40, 60));

        a = x;
        a.add(37, 65); // swallows [40, 60): [10, 20) [37, 65)
        assert(a == CodeList(10, 20, 37, 65));

        // some tests on helpers for set intersection
        x = CodeList.init.add(10, 20).add(40, 60).add(100, 120);
        a = x;

        // keep everything below 60, then drop from that mark up to 110
        auto m = a.skipUpTo(60);
        a.dropUpTo(110, m);
        assert(a == CodeList(10, 20, 40, 60, 110, 120), text(a.data[]));

        a = x;
        a.dropUpTo(100);
        assert(a == CodeList(100, 120), text(a.data[]));

        a = x;
        m = a.skipUpTo(50);
        a.dropUpTo(140, m);
        assert(a == CodeList(10, 20, 40, 50), text(a.data[]));
        a = x;
        a.dropUpTo(60);
        assert(a == CodeList(100, 120), text(a.data[]));
    }
}
3565 
3566 
// Test that the constructor works with any order of intervals:
// arguments are (start, end) pairs of half-open intervals which may be
// unsorted, adjacent or overlapping; the result must be normalized.
pure @safe unittest
{
    import std.algorithm.comparison : equal;
    import std.conv : text, to;
    import std.range : chain, iota;
    import std.typecons : tuple;
    //ensure constructor handles bad ordering and overlap
    // (the lowercase Cyrillic range is passed before the uppercase one)
    auto c1 = CodepointSet('а', 'я'+1, 'А','Я'+1);
    foreach (ch; chain(iota('а', 'я'+1), iota('А','Я'+1)))
        assert(ch in c1, to!string(ch));

    //contiguous
    assert(CodepointSet(1000, 1006, 1006, 1009)
        .byInterval.equal([tuple(1000, 1009)]));
    //contains
    assert(CodepointSet(900, 1200, 1000, 1100)
        .byInterval.equal([tuple(900, 1200)]));
    //intersect left
    assert(CodepointSet(900, 1100, 1000, 1200)
        .byInterval.equal([tuple(900, 1200)]));
    //intersect right
    assert(CodepointSet(1000, 1200, 900, 1100)
        .byInterval.equal([tuple(900, 1200)]));

    //ditto with extra items at end
    assert(CodepointSet(1000, 1200, 900, 1100, 800, 850)
        .byInterval.equal([tuple(800, 850), tuple(900, 1200)]));
    assert(CodepointSet(900, 1100, 1000, 1200, 800, 850)
        .byInterval.equal([tuple(800, 850), tuple(900, 1200)]));

    //"plug a hole" test
    // the second and third rows fill exactly the gaps left by the first
    auto c2 = CodepointSet(20, 40,
        60, 80, 100, 140, 150, 200,
        40, 60, 80, 100, 140, 150
    );
    assert(c2.byInterval.equal([tuple(20, 200)]));

    // mixed overlaps: everything merges except the gap [140, 150)
    auto c3 = CodepointSet(
        20, 40, 60, 80, 100, 140, 150, 200,
        0, 10, 15, 100, 10, 20, 200, 220);
    assert(c3.byInterval.equal([tuple(0, 140), tuple(150, 220)]));
}
3610 
3611 
// Checks the binary set operators for both storage policies:
// `|` (union), `&` (intersection), `-` (difference) and
// `~` (symmetric difference), each in both operand orders.
pure @safe unittest
{   // full set operations
    import std.conv : text;
    alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy);
    foreach (CodeList; AllSets)
    {
        CodeList a, b, c, d;

        //"plug a hole"
        a.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b.add(40, 60).add(80, 100).add(140, 150);
        c = a | b;
        d = b | a;
        assert(c == CodeList(20, 200), text(CodeList.stringof," ", c));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(25, 45).add(65, 85).add(95,110).add(150, 210);
        c = a | b; //[20,45) [60, 85) [95, 140) [150, 210)
        d = b | a;
        assert(c == CodeList(20, 45, 60, 85, 95, 140, 150, 210), text(c));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(10, 20).add(30,100).add(145,200);
        c = a | b;//[10, 140) [145, 200)
        d = b | a;
        assert(c == CodeList(10, 140, 145, 200));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(0, 10).add(15, 100).add(10, 20).add(200, 220);
        c = a | b;//[0, 140) [150, 220)
        d = b | a;
        assert(c == CodeList(0, 140, 150, 220));
        assert(c == d, text(c," vs ", d));


        // intersection: b's intervals lie strictly inside a's
        a = CodeList.init.add(20, 40).add(60, 80);
        b = CodeList.init.add(25, 35).add(65, 75);
        c = a & b;
        d = b & a;
        assert(c == CodeList(25, 35, 65, 75), text(c));
        assert(c == d, text(c," vs ", d));

        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(25, 35).add(65, 75).add(110, 130).add(160, 180);
        c = a & b;
        d = b & a;
        assert(c == CodeList(25, 35, 65, 75, 110, 130, 160, 180), text(c));
        assert(c == d, text(c," vs ", d));

        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(10, 30).add(60, 120).add(135, 160);
        c = a & b;//[20, 30)[60, 80) [100, 120) [135, 140) [150, 160)
        d = b & a;

        assert(c == CodeList(20, 30, 60, 80, 100, 120, 135, 140, 150, 160),text(c));
        assert(c == d, text(c, " vs ",d));
        // intersecting a result with either operand (or itself) changes nothing
        assert((c & a) == c);
        assert((d & b) == d);
        assert((c & d) == d);

        // b fills a's gaps and overlaps only its last interval
        b = CodeList.init.add(40, 60).add(80, 100).add(140, 200);
        c = a & b;
        d = b & a;
        assert(c == CodeList(150, 200), text(c));
        assert(c == d, text(c, " vs ",d));
        assert((c & a) == c);
        assert((d & b) == d);
        assert((c & d) == d);

        assert((a & a) == a);
        assert((b & b) == b);

        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(30, 60).add(75, 120).add(190, 300);
        c = a - b;// [20, 30) [60, 75) [120, 140) [150, 190)
        d = b - a;// [40, 60) [80, 100) [200, 300)
        assert(c == CodeList(20, 30, 60, 75, 120, 140, 150, 190), text(c));
        assert(d == CodeList(40, 60, 80, 100, 200, 300), text(d));
        // c and d are disjoint, so subtracting one from the other is a no-op
        assert(c - d == c, text(c-d, " vs ", c));
        assert(d - c == d, text(d-c, " vs ", d));
        assert(c - c == CodeList.init);
        assert(d - d == CodeList.init);

        a = CodeList.init.add(20, 40).add( 60, 80).add(100, 140).add(150,            200);
        b = CodeList.init.add(10,  50).add(60,                           160).add(190, 300);
        c = a - b;// [160, 190)
        d = b - a;// [10, 20) [40, 50) [80, 100) [140, 150) [200, 300)
        assert(c == CodeList(160, 190), text(c));
        assert(d == CodeList(10, 20, 40, 50, 80, 100, 140, 150, 200, 300), text(d));
        assert(c - d == c, text(c-d, " vs ", c));
        assert(d - c == d, text(d-c, " vs ", d));
        assert(c - c == CodeList.init);
        assert(d - d == CodeList.init);

        a = CodeList.init.add(20,    40).add(60, 80).add(100,      140).add(150,  200);
        b = CodeList.init.add(10, 30).add(45,         100).add(130,             190);
        c = a ~ b; // [10, 20) [30, 40) [45, 60) [80, 130) [140, 150) [190, 200)
        d = b ~ a;
        assert(c == CodeList(10, 20, 30, 40, 45, 60, 80, 130, 140, 150, 190, 200),
               text(c));
        assert(c == d, text(c, " vs ", d));
    }
}
3715 
3716 }
3717 
pure @safe unittest// vs single dchar
{
    import std.conv : text;
    auto set = CodepointSet(10, 100, 120, 200);

    // removing 'A' (65) punches a one-code-point hole into [10, 100)
    auto withoutA = set - 'A';
    assert(withoutA == CodepointSet(10, 65, 66, 100, 120, 200), text(withoutA));

    // intersecting with 'B' (66) leaves exactly that code point
    auto onlyB = set & 'B';
    assert(onlyB == CodepointSet(66, 67));
}
3725 
// Checks `byInterval`, `byCodepoint` and membership via `opIndex`.
pure @safe unittest// iteration & opIndex
{
    import std.algorithm.comparison : equal;
    import std.conv : text;
    import std.typecons : tuple, Tuple;

    // only the ReallocPolicy flavour is listed here
    static foreach (CodeList; AliasSeq!(InversionList!(ReallocPolicy)))
    {{
        // every code point covered by ['A', 'N') and ['a', 'n'), in order
        auto arr = "ABCDEFGHIJKLMabcdefghijklm"d;
        auto a = CodeList('A','N','a', 'n');
        assert(equal(a.byInterval,
                [tuple(cast(uint)'A', cast(uint)'N'), tuple(cast(uint)'a', cast(uint)'n')]
            ), text(a.byInterval));

        // same @@@BUG as in https://issues.dlang.org/show_bug.cgi?id=8949 ?
        // (reverse iteration is only compiled when `bug8949` is set)
        version (bug8949)
        {
            import std.range : retro;
            assert(equal(retro(a.byInterval),
                [tuple(cast(uint)'a', cast(uint)'n'), tuple(cast(uint)'A', cast(uint)'N')]
            ), text(retro(a.byInterval)));
        }
        auto achr = a.byCodepoint;
        assert(equal(achr, arr), text(a.byCodepoint));
        // every code point that is iterated must also test as a member
        foreach (ch; a.byCodepoint)
            assert(a[ch]);
        auto x = CodeList(100, 500, 600, 900, 1200, 1500);
        assert(equal(x.byInterval, [ tuple(100, 500), tuple(600, 900), tuple(1200, 1500)]), text(x.byInterval));
        foreach (ch; x.byCodepoint)
            assert(x[ch]);
        // round-trip through byInterval; compiled only if CodeList is CodepointSet
        // NOTE(review): with the single ReallocPolicy entry above this branch
        // may never be compiled - confirm what CodepointSet aliases to
        static if (is(CodeList == CodepointSet))
        {
            auto y = CodeList(x.byInterval);
            assert(equal(x.byInterval, y.byInterval));
        }
        // an empty set yields empty interval and code point ranges
        assert(equal(CodepointSet.init.byInterval, cast(Tuple!(uint, uint)[])[]));
        assert(equal(CodepointSet.init.byCodepoint, cast(dchar[])[]));
    }}
}
3765 
3766 //============================================================================
3767 // Generic Trie template and various ways to build it
3768 //============================================================================
3769 
// Debug helper: textual dump of an array-like value. Anything longer than
// 32 elements is shortened to its first and last 16 elements joined by "~...~".
auto arrayRepr(T)(T x)
{
    import std.conv : text;
    enum edge = 16; // number of elements kept at each end
    immutable len = x.length;
    if (len <= 2 * edge)
        return text(x);
    return text(x[0 .. edge],"~...~", x[len-edge .. len]);
}
3781 
/**
    Maps `Key` to a suitable integer index within the range of `size_t`.
    Each predicate in `Prefix` is applied to the key, left to right, and
    the resulting bit groups are concatenated.

    The first (leftmost) predicate supplies the most significant bits of
    the resulting index.
 */
template mapTrieIndex(Prefix...)
{
    size_t mapTrieIndex(Key)(Key key)
        if (isValidPrefixForTrie!(Key, Prefix))
    {
        size_t index;
        foreach (level, _; Prefix[0..$-1])
        {
            // append this level's bits, then make room for the next level
            index |= Prefix[level](key);
            index <<= Prefix[level+1].bitSize;
        }
        // the last level occupies the least significant bits
        index |= Prefix[$-1](key);
        return index;
    }
}
3806 
/*
    `TrieBuilder` is a type used for incremental construction
    of $(LREF Trie)s.

    Values must be supplied in order of non-decreasing index (see `errMsg`);
    gaps between supplied keys are filled with the filler value given to
    the constructor. Call `build` once all values are in.

    `Args` is either a list of prefix predicates (one per trie level), or an
    upper bound on `Key` followed by such a list.

    See $(LREF buildTrie) for generic helpers built on top of it.
*/
@trusted private struct TrieBuilder(Value, Key, Args...)
if (isBitPackableType!Value && isValidArgsForTrie!(Key, Args))
{
    import std.exception : enforce;

private:
    // last index is not stored in table, it is used as an offset to values in a block.
    static if (is(Value == bool))// always pack bool
        alias V = BitPacked!(Value, 1);
    else
        alias V = Value;
    // Size of the index space implied by the predicates alone:
    // the product of 2^^bitSize over all levels.
    static auto deduceMaxIndex(Preds...)()
    {
        size_t idx = 1;
        foreach (v; Preds)
            idx *= 2^^v.bitSize;
        return idx;
    }

    static if (is(typeof(Args[0]) : Key)) // Args start with upper bound on Key
    {
        alias Prefix = Args[1..$];
        enum lastPageSize = 2^^Prefix[$-1].bitSize;
        enum translatedMaxIndex = mapTrieIndex!(Prefix)(Args[0]);
        // the bound's index rounded up to a whole number of last-level pages
        enum roughedMaxIndex =
            (translatedMaxIndex + lastPageSize-1)/lastPageSize*lastPageSize;
        // check wrap-around - if wrapped, use the default deduction rule
        enum maxIndex = roughedMaxIndex < translatedMaxIndex ?
            deduceMaxIndex!(Prefix)() : roughedMaxIndex;
    }
    else
    {
        alias Prefix = Args;
        enum maxIndex = deduceMaxIndex!(Prefix)();
    }

    // key -> full-width index
    alias getIndex = mapTrieIndex!(Prefix);

    enum lastLevel = Prefix.length-1;
    // Per-level bookkeeping; size_t.max means "no such page known yet".
    // NOTE(review): idx_ones is initialized but never read in this struct.
    struct ConstructState
    {
        size_t idx_zeros, idx_ones;
    }
    // iteration over levels of Trie, each indexes its own level and thus a shortened domain
    size_t[Prefix.length] indices;
    // default filler value to use
    Value defValue;
    // this is a full-width index of next item
    size_t curIndex;
    // all-zeros page index, all-ones page index (+ indicator if there is such a page)
    ConstructState[Prefix.length] state;
    // the table being constructed
    MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), V) table;

    // a builder always needs a filler value, see this(Value)
    @disable this();

    //shortcut for index variable at level 'level'
    @property ref idx(size_t level)(){ return indices[level]; }

    // Writes `numVals` copies of `val` at level `level`, starting at that
    // level's current write index, and spills to the level above every
    // time a page of this level gets completed.
    //
    // this function assumes no holes in the input so
    // indices are going one by one
    void addValue(size_t level, T)(T val, size_t numVals)
    {
        alias j = idx!level;
        enum pageSize = 1 << Prefix[level].bitSize;
        if (numVals == 0)
            return;
        auto ptr = table.slice!(level);
        if (numVals == 1)
        {
            static if (level == Prefix.length-1)
                ptr[j] = val;
            else
            {// can incur narrowing conversion
                assert(j < ptr.length);
                ptr[j] = force!(typeof(ptr[j]))(val);
            }
            j++;
            // just completed a page?
            if (j % pageSize == 0)
                spillToNextPage!level(ptr);
            return;
        }
        // longer row of values
        // get to the next page boundary
        immutable nextPB = (j + pageSize) & ~(pageSize-1);
        immutable n =  nextPB - j;// can fill right in this page
        if (numVals < n) //fits in current page
        {
            ptr[j .. j+numVals]  = val;
            j += numVals;
            return;
        }
        static if (level != 0)//on the first level it always fits
        {
            numVals -= n;
            //write till the end of current page
            ptr[j .. j+n]  = val;
            j += n;
            //spill to the next page
            spillToNextPage!level(ptr);
            // page at once loop
            // shortcut: if an all-zeros page is already known and val is
            // T.init, reference that page from the level above
            // numVals/pageSize times instead of writing the pages out
            if (state[level].idx_zeros != size_t.max && val == T.init)
            {
                alias NextIdx = typeof(table.slice!(level-1)[0]);
                addValue!(level-1)(force!NextIdx(state[level].idx_zeros),
                    numVals/pageSize);
                ptr = table.slice!level; //table structure might have changed
                numVals %= pageSize;
            }
            else
            {
                // write whole pages one by one, spilling after each
                while (numVals >= pageSize)
                {
                    numVals -= pageSize;
                    ptr[j .. j+pageSize]  = val;
                    j += pageSize;
                    spillToNextPage!level(ptr);
                }
            }
            if (numVals)
            {
                // the leftovers, an incomplete page
                ptr[j .. j+numVals]  = val;
                j += numVals;
            }
        }
    }

    // Called when a page at `level` has just been completed.
    void spillToNextPage(size_t level, Slice)(ref Slice ptr)
    {
        // last level (i.e. topmost) has 1 "page"
        // thus it need not to add a new page on upper level
        static if (level != 0)
            spillToNextPageImpl!(level)(ptr);
    }

    // this can re-use the current page if duplicate or allocate a new one
    // it also makes sure that previous levels point to the correct page in this level
    void spillToNextPageImpl(size_t level, Slice)(ref Slice ptr)
    {
        alias NextIdx = typeof(table.slice!(level-1)[0]);
        NextIdx next_lvl_index;
        enum pageSize = 1 << Prefix[level].bitSize;
        assert(idx!level % pageSize == 0);
        // start offset of the page that was just completed
        immutable last = idx!level-pageSize;
        const slice = ptr[idx!level - pageSize .. idx!level];
        size_t j;
        // linear scan over all earlier pages looking for identical contents
        for (j=0; j<last; j+=pageSize)
        {
            if (ptr[j .. j+pageSize] == slice)
            {
                // get index to it, reuse ptr space for the next block
                next_lvl_index = force!NextIdx(j/pageSize);
                version (none)
                {
                import std.stdio : writefln, writeln;
                writefln("LEVEL(%s) page mapped idx: %s: 0..%s  ---> [%s..%s]"
                        ,level
                        ,indices[level-1], pageSize, j, j+pageSize);
                writeln("LEVEL(", level
                        , ") mapped page is: ", slice, ": ", arrayRepr(ptr[j .. j+pageSize]));
                writeln("LEVEL(", level
                        , ") src page is :", ptr, ": ", arrayRepr(slice[0 .. pageSize]));
                }
                idx!level -= pageSize; // reuse this page, it is duplicate
                break;
            }
        }
        // j == last means the scan found no duplicate: keep the page
        if (j == last)
        {
    L_allocate_page:
            next_lvl_index = force!NextIdx(idx!level/pageSize - 1);
            // remember the first all-zeros page so addValue can reference it
            if (state[level].idx_zeros == size_t.max && ptr.zeros(j, j+pageSize))
            {
                state[level].idx_zeros = next_lvl_index;
            }
            // allocate next page
            version (none)
            {
            import std.stdio : writefln;
            writefln("LEVEL(%s) page allocated: %s"
                     , level, arrayRepr(slice[0 .. pageSize]));
            writefln("LEVEL(%s) index: %s ; page at this index %s"
                     , level
                     , next_lvl_index
                     , arrayRepr(
                         table.slice!(level)
                          [pageSize*next_lvl_index..(next_lvl_index+1)*pageSize]
                        ));
            }
            table.length!level = table.length!level + pageSize;
        }
    L_know_index:
        // for the previous level, values are indices to the pages in the current level
        addValue!(level-1)(next_lvl_index, 1);
        ptr = table.slice!level; //re-load the slice after moves
    }

    // idx - full-width index to fill with v (full-width index != key)
    // fills everything in the range of [curIndex, idx) with filler
    void putAt(size_t idx, Value v)
    {
        assert(idx >= curIndex);
        immutable numFillers = idx - curIndex;
        addValue!lastLevel(defValue, numFillers);
        addValue!lastLevel(v, 1);
        curIndex = idx + 1;
    }

    // ditto, but sets the range of [idxA, idxB) to v
    void putRangeAt(size_t idxA, size_t idxB, Value v)
    {
        assert(idxA >= curIndex);
        assert(idxB >= idxA);
        size_t numFillers = idxA - curIndex;
        addValue!lastLevel(defValue, numFillers);
        addValue!lastLevel(v, idxB - idxA);
        curIndex = idxB; // open-right
    }

    // message for the `enforce` checks in putRange/putValue
    enum errMsg = "non-monotonic prefix function(s), an unsorted range or "~
        "duplicate key->value mapping";

public:
    /**
        Construct a builder, where `filler` is a value
        to indicate empty slots (or "not found" condition).
    */
    this(Value filler)
    {
        curIndex = 0;
        defValue = filler;
        // zeros-page index, ones-page index
        foreach (ref v; state)
            v = ConstructState(size_t.max, size_t.max);
        table = typeof(table)(indices);
        // one page per level is a bootstrap minimum
        foreach (i, Pred; Prefix)
            table.length!i = (1 << Pred.bitSize);
    }

    /**
        Put a value `v` into interval as
        mapped by keys from `a` to `b`.
        The interval is half-open: the slot of `b` itself is not written.
        All slots prior to `a` are filled with
        the default filler.

        Throws: an exception (via `enforce`) if the mapped indices do not
        grow, i.e. `b` maps below `a` or `a` maps below the current position.
    */
    void putRange(Key a, Key b, Value v)
    {
        auto idxA = getIndex(a), idxB = getIndex(b);
        // indexes of key should always grow
        enforce(idxB >= idxA && idxA >= curIndex, errMsg);
        putRangeAt(idxA, idxB, v);
    }

    /**
        Put a value `v` into slot mapped by `key`.
        All slots prior to `key` are filled with the
        default filler.

        Throws: an exception (via `enforce`) if `key` maps below the
        current position.
    */
    void putValue(Key key, Value v)
    {
        auto idx = getIndex(key);
        enforce(idx >= curIndex, errMsg);
        putAt(idx, v);
    }

    /// Finishes construction of Trie, yielding an immutable Trie instance.
    auto build()
    {
        // pad the remaining index space with the filler
        static if (maxIndex != 0) // doesn't cover full range of size_t
        {
            assert(curIndex <= maxIndex);
            addValue!lastLevel(defValue, maxIndex - curIndex);
        }
        else
        {
            if (curIndex != 0 // couldn't wrap around
                || (Prefix.length != 1 && indices[lastLevel] == 0)) // can be just empty
            {
                // size_t.max - curIndex + 1 slots remain; the count is split
                // in two because it may not fit in size_t as a single value
                addValue!lastLevel(defValue, size_t.max - curIndex);
                addValue!lastLevel(defValue, 1);
            }
            // else curIndex already completed the full range of size_t by wrapping around
        }
        return Trie!(V, Key, maxIndex, Prefix)(table);
    }
}
4101 
/**
    $(P A generic Trie data-structure for a fixed number of stages.
    The design goal is optimal speed with smallest footprint size.
    )
    $(P It's intentionally read-only and doesn't provide constructors.
     To construct one use a special builder,
     see $(LREF TrieBuilder) and $(LREF buildTrie).
    )

*/
@trusted private struct Trie(Value, Key, Args...)
if (isValidPrefixForTrie!(Key, Args)
    || (isValidPrefixForTrie!(Key, Args[1..$])
    && is(typeof(Args[0]) : size_t)))
{
    import std.range.primitives : isOutputRange;
    // `Args` optionally starts with the exclusive upper bound on the mapped
    // index; the rest are the per-level prefix predicates.
    static if (is(typeof(Args[0]) : size_t))
    {
        private enum maxIndex = Args[0];
        private enum hasBoundsCheck = true;
        private alias Prefix = Args[1..$];
    }
    else
    {
        private enum hasBoundsCheck = false;
        private alias Prefix = Args;
    }

    // wraps an already built table (this is what TrieBuilder.build uses)
    private this()(typeof(_table) table)
    {
        _table = table;
    }

    // only for constant Tries constructed from precompiled tables
    private this()(const(size_t)[] offsets, const(size_t)[] sizes,
        const(size_t)[] data) const
    {
        _table = typeof(_table)(offsets, sizes, data);
    }

    /**
        $(P Lookup the `key` in this `Trie`. )

        $(P The lookup always succeeds if key fits the domain
        provided during construction. The whole domain defined
        is covered so instead of not found condition
        the sentinel (filler) value could be used. )

        $(P See $(LREF buildTrie), $(LREF TrieBuilder) for how to
        define a domain of `Trie` keys and the sentinel value. )

        Note:
        Domain range-checking is done with an `assert`, so it is only
        active in builds that keep assertions enabled, and then results
        in an assertion failure.
    */
    TypeOfBitPacked!Value opIndex()(Key key) const
    {
        static if (hasBoundsCheck)
            assert(mapTrieIndex!Prefix(key) < maxIndex);
        size_t idx;
        alias p = Prefix;
        // start with the top-level prefix of the key
        idx = cast(size_t) p[0](key);
        // each level stores a page number for the next level: shift it by
        // the next level's bit width and add the key's offset within the page
        foreach (i, v; p[0..$-1])
            idx = cast(size_t)((_table.ptr!i[idx]<<p[i+1].bitSize) + p[i+1](key));
        // the last level holds the values themselves
        return _table.ptr!(p.length-1)[idx];
    }

    /// Forwards to the underlying table's `bytes!n`; presumably the storage
    /// size in bytes of level `n` (all levels for the default `n`) - confirm
    /// against `MultiArray`.
    @property size_t bytes(size_t n=size_t.max)() const
    {
        return _table.bytes!n;
    }

    /// `bytes!n` divided by the page size of level `n` (2^^bitSize),
    /// rounded to the nearest whole number.
    @property size_t pages(size_t n)() const
    {
        return (bytes!n+2^^(Prefix[n].bitSize-1))
                /2^^Prefix[n].bitSize;
    }

    /// Forwards to the underlying table's `store`, writing to the
    /// character output range `sink`.
    void store(OutRange)(scope OutRange sink) const
        if (isOutputRange!(OutRange, char))
    {
        _table.store(sink);
    }

private:
    // index levels followed by the value level
    MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), Value) _table;
}
4192 
// Builds a tuple of `sliceBits` predicates that cut the top `top` bits into
// consecutive pieces of widths `sizes`, left to right, most significant first.
template GetBitSlicing(size_t top, sizes...)
{
    static if (sizes.length == 0)
    {
        // nothing left to slice
        alias GetBitSlicing = AliasSeq!();
    }
    else
    {
        // first piece covers bits [top - sizes[0], top); the remaining
        // sizes are sliced from what lies below it
        alias GetBitSlicing =
            AliasSeq!(sliceBits!(top - sizes[0], top),
                      GetBitSlicing!(top - sizes[0], sizes[1..$]));
    }
}
4204 
// `callableWith!T` is itself a predicate template: `callableWith!T!Pred` is
// true when `Pred` can be called with a `T` and the result, once unwrapped
// by `TypeOfBitPacked`, is a bit-packable type.
template callableWith(T)
{
    template callableWith(alias Pred)
    {
        static if (is(typeof(Pred(T.init)) Result))
            enum callableWith = isBitPackableType!(TypeOfBitPacked!(Result));
        else
            enum callableWith = false; // not callable with a T at all
    }
}
4218 
/*
    Check if `Prefix` is a valid set of predicates
    for `Trie` template having `Key` as the type of keys.
    Every predicate must be callable with a single argument of
    type `Key` and return a bit-packable value (see `callableWith`).
*/
template isValidPrefixForTrie(Key, Prefix...)
{
    import std.meta : allSatisfy;
    alias acceptsKey = callableWith!Key;
    enum isValidPrefixForTrie = allSatisfy!(acceptsKey, Prefix); // TODO: tighten the screws
}
4230 
4231 /*
4232     Check if `Args` is a set of maximum key value followed by valid predicates
4233     for `Trie` template having `Key` as the type of keys.
4234 */
template isValidArgsForTrie(Key, Args...)
{
    static if (Args.length > 1)
    {
        // Either every argument is a predicate, or the first one is
        // the upper bound on keys and the rest are predicates.
        enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args)
            || (isValidPrefixForTrie!(Key, Args[1..$]) && is(typeof(Args[0]) : Key));
    }
    else static if (Args.length == 1)
    {
        // A lone argument can only be a predicate. `Key` must be forwarded
        // explicitly: otherwise the predicate itself gets bound to the `Key`
        // parameter of `isValidPrefixForTrie` and is never checked.
        enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args);
    }
    else
    {
        // no predicates at all - there is nothing to build an index from
        enum isValidArgsForTrie = false;
    }
}
4245 
// Sum of all compile-time integers in `ints`; 0 for an empty list.
@property size_t sumOfIntegerTuple(ints...)()
{
    size_t total = 0;
    static foreach (i; 0 .. ints.length)
        total += ints[i];
    return total;
}
4253 
4254 /**
4255     A shorthand for creating a custom multi-level fixed Trie
4256     from a `CodepointSet`. `sizes` are numbers of bits per level,
4257     with the most significant bits used first.
4258 
4259     Note: The sum of `sizes` must be equal to 21.
4260 
4261     See_Also: $(LREF toTrie), which is even simpler.
4262 
4263     Example:
4264     ---
4265     {
4266         import std.stdio;
4267         auto set = unicode("Number");
4268         auto trie = codepointSetTrie!(8, 5, 8)(set);
4269         writeln("Input code points to test:");
4270         foreach (line; stdin.byLine)
4271         {
4272             int count=0;
4273             foreach (dchar ch; line)
4274                 if (trie[ch])// is number
4275                     count++;
4276             writefln("Contains %d number code points.", count);
4277         }
4278     }
4279     ---
4280 */
public template codepointSetTrie(sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    auto codepointSetTrie(Set)(Set set)
        if (isCodepointSet!Set)
    {
        alias Levels = GetBitSlicing!(21, sizes);
        auto result = TrieBuilder!(bool, dchar, lastDchar+1, Levels)(false);
        // mark every interval of the set as `true`
        foreach (interval; set.byInterval)
            result.putRange(interval[0], interval[1], true);
        return result.build();
    }
}
4293 
4294 /// Type of Trie generated by codepointSetTrie function.
public template CodepointSetTrie(sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    alias Prefix = GetBitSlicing!(21, sizes);
    // same builder instantiation as used by `codepointSetTrie`
    alias Builder = TrieBuilder!(bool, dchar, lastDchar+1, Prefix);
    alias CodepointSetTrie = typeof(Builder(false).build());
}
4301 
4302 /**
4303     A slightly more general tool for building fixed `Trie`
4304     for the Unicode data.
4305 
4306     Specifically, unlike `codepointSetTrie`, it allows creating mappings
4307     of `dchar` to an arbitrary type `T`.
4308 
4309     Note: Overload taking `CodepointSet`s will naturally convert
4310     only to bool mapping `Trie`s.
4311 
4312     CodepointTrie is the type of Trie as generated by codepointTrie function.
4313 */
public template codepointTrie(T, sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    alias Prefix = GetBitSlicing!(21, sizes);

    static if (is(TypeOfBitPacked!T == bool))
    {
        // Boolean tries can be built directly from a set of code points.
        // NOTE(review): `set` is taken as `const scope`; confirm that
        // `codepointSetTrie` accepts a const-qualified set.
        auto codepointTrie(Set)(const scope Set set)
            if (isCodepointSet!Set)
        {
            // `codepointSetTrie` is a template over the level sizes; they must be
            // forwarded explicitly, otherwise its constraint (sizes sum up to 21)
            // is checked against an empty list and the call cannot compile
            return codepointSetTrie!sizes(set);
        }
    }

    ///
    auto codepointTrie()(T[dchar] map, T defValue=T.init)
    {
        // keys of the associative array become trie keys,
        // every other code point maps to `defValue`
        return buildTrie!(T, dchar, Prefix)(map, defValue);
    }

    // unsorted range of pairs
    ///
    auto codepointTrie(R)(R range, T defValue=T.init)
        if (isInputRange!R
            && is(typeof(ElementType!R.init[0]) : T)
            && is(typeof(ElementType!R.init[1]) : dchar))
    {
        // build from unsorted array of pairs
        // TODO: expose index sorting functions for Trie
        return buildTrie!(T, dchar, Prefix)(range, defValue, true);
    }
}
4346 
4347 @system pure unittest
4348 {
4349     import std.algorithm.comparison : max;
4350     import std.algorithm.searching : count;
4351 
4352     // pick characters from the Greek script
4353     auto set = unicode.Greek;
4354 
4355     // a user-defined property (or an expensive function)
4356     // that we want to look up
4357     static uint luckFactor(dchar ch)
4358     {
4359         // here we consider a character lucky
4360         // if its code point has a lot of identical hex-digits
4361         // e.g. arabic letter DDAL (\u0688) has a "luck factor" of 2
4362         ubyte[6] nibbles; // 6 4-bit chunks of code point
4363         uint value = ch;
4364         foreach (i; 0 .. 6)
4365         {
4366             nibbles[i] = value & 0xF;
4367             value >>= 4;
4368         }
4369         uint luck;
4370         foreach (n; nibbles)
4371             luck = cast(uint) max(luck, count(nibbles[], n));
4372         return luck;
4373     }
4374 
4375     // only unsigned built-ins are supported at the moment
4376     alias LuckFactor = BitPacked!(uint, 3);
4377 
4378     // create a temporary associative array (AA)
4379     LuckFactor[dchar] map;
4380     foreach (ch; set.byCodepoint)
4381         map[ch] = LuckFactor(luckFactor(ch));
4382 
4383     // bits per stage are chosen randomly, feel free to optimize
4384     auto trie = codepointTrie!(LuckFactor, 8, 5, 8)(map);
4385 
4386     // from now on the AA is not needed
4387     foreach (ch; set.byCodepoint)
4388         assert(trie[ch] == luckFactor(ch)); // verify
4389     // CJK is not Greek, thus it has the default value
4390     assert(trie['\u4444'] == 0);
4391     // and here is a couple of quite lucky Greek characters:
4392     // Greek small letter epsilon with dasia
4393     assert(trie['\u1F11'] == 3);
4394     // Ancient Greek metretes sign
4395     assert(trie['\U00010181'] == 3);
4396 
4397 }
4398 
4399 /// ditto
public template CodepointTrie(T, sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    alias Prefix = GetBitSlicing!(21, sizes);
    // the type is whatever the builder produces for a `T.init` filler
    alias Builder = TrieBuilder!(T, dchar, lastDchar+1, Prefix);
    alias CodepointTrie = typeof(Builder(T.init).build());
}
4406 
// "Less than" over (value, key) tuples that compares only `Pred` of the key.
package(std) template cmpK0(alias Pred)
{
    import std.typecons : Tuple;
    static bool cmpK0(Value, Key)
        (Tuple!(Value, Key) a, Tuple!(Value, Key) b)
    {
        // the key is the second field of each pair
        auto leftIndex = Pred(a[1]);
        auto rightIndex = Pred(b[1]);
        return leftIndex < rightIndex;
    }
}
4416 
4417 /**
4418     The most general utility for construction of `Trie`s
4419     short of using `TrieBuilder` directly.
4420 
4421     Provides a number of convenience overloads.
4422     `Args` is tuple of maximum key value followed by
4423     predicates to construct index from key.
4424 
4425     Alternatively if the first argument is not a value convertible to `Key`
4426     then the whole tuple of `Args` is treated as predicates
4427     and the maximum Key is deduced from predicates.
4428 */
4429 private template buildTrie(Value, Key, Args...)
4430 if (isValidArgsForTrie!(Key, Args))
4431 {
4432     static if (is(typeof(Args[0]) : Key)) // prefix starts with upper bound on Key
4433     {
4434         alias Prefix = Args[1..$];
4435     }
4436     else
4437         alias Prefix = Args;
4438 
4439     alias getIndex = mapTrieIndex!(Prefix); // NOTE(review): not referenced by any overload in this template
4440 
4441     // for multi-sort: one cmpK0 comparator per predicate, in the order of Prefix
4442     template GetComparators(size_t n)
4443     {
4444         static if (n > 0)
4445             alias GetComparators =
4446                 AliasSeq!(GetComparators!(n-1), cmpK0!(Prefix[n-1]));
4447         else
4448             alias GetComparators = AliasSeq!();
4449     }
4450 
4451     /*
4452         Build `Trie` from a range of a Key-Value pairs,
4453         assuming it is sorted by Key as defined by the following lambda:
4454         ------
4455         (a, b) => mapTrieIndex!(Prefix)(a) < mapTrieIndex!(Prefix)(b)
4456         ------
4457         Exception is thrown if it's detected that the above order doesn't hold.
4458 
4459         In other words $(LREF mapTrieIndex) should be a
4460         monotonically increasing function that maps `Key` to an integer.
4461 
4462         See_Also: $(REF sort, std,_algorithm),
4463         $(REF SortedRange, std,range),
4464         $(REF setUnion, std,_algorithm).
4465     */
4466     auto buildTrie(Range)(Range range, Value filler=Value.init)
4467         if (isInputRange!Range && is(typeof(Range.init.front[0]) : Value)
4468             && is(typeof(Range.init.front[1]) : Key))
4469     {
4470         auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
4471         foreach (v; range)
4472             builder.putValue(v[1], v[0]); // each element is a (value, key) pair
4473         return builder.build();
4474     }
4475 
4476     /*
4477         If `Value` is bool (or BitPacked!(bool, x)) then it's possible
4478         to build `Trie` from a range of open-right intervals of `Key`s.
4479         The requirement on the ordering of keys (and the behavior on the
4480         violation of it) is the same as for Key-Value range overload.
4481 
4482         Intervals denote ranges of !`filler` i.e. the opposite of filler.
4483         If no filler provided keys inside of the intervals map to true,
4484         and `filler` is false.
4485     */
4486     auto buildTrie(Range)(Range range, Value filler=Value.init)
4487         if (is(TypeOfBitPacked!Value ==  bool)
4488             && isInputRange!Range && is(typeof(Range.init.front[0]) : Key)
4489             && is(typeof(Range.init.front[1]) : Key))
4490     {
4491         auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
4492         foreach (ival; range)
4493             builder.putRange(ival[0], ival[1], !filler);
4494         return builder.build();
4495     }
4496 
4497     auto buildTrie(Range)(Range range, Value filler, bool unsorted) // pair overload that can sort its input first
4498         if (isInputRange!Range
4499             && is(typeof(Range.init.front[0]) : Value)
4500             && is(typeof(Range.init.front[1]) : Key))
4501     {
4502         import std.algorithm.sorting : multiSort;
4503         alias Comps = GetComparators!(Prefix.length);
4504         if (unsorted)
4505             multiSort!(Comps)(range); // sort by the predicates of Prefix, first predicate most significant
4506         return buildTrie(range, filler); // continue with the sorted-pairs overload
4507     }
4508 
4509     /*
4510         If `Value` is bool (or BitPacked!(bool, x)) then it's possible
4511         to build `Trie` simply from an input range of `Key`s.
4512         The requirement on the ordering of keys (and the behavior on the
4513         violation of it) is the same as for Key-Value range overload.
4514 
4515         Keys found in range denote !`filler` i.e. the opposite of filler.
4516         If no filler provided keys map to true, and `filler` is false.
4517     */
4518     auto buildTrie(Range)(Range range, Value filler=Value.init)
4519         if (is(TypeOfBitPacked!Value ==  bool)
4520             && isInputRange!Range && is(typeof(Range.init.front) : Key))
4521     {
4522         auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
4523         foreach (v; range)
4524             builder.putValue(v, !filler);
4525         return builder.build();
4526     }
4527 
4528     /*
4529         If `Key` is unsigned integer `Trie` could be constructed from array
4530         of values where array index serves as key.
4531     */
4532     auto buildTrie()(Value[] array, Value filler=Value.init)
4533         if (isUnsigned!Key)
4534     {
4535         auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
4536         foreach (idx, v; array)
4537             builder.putValue(idx, v); // the position in the array is the key
4538         return builder.build();
4539     }
4540 
4541     /*
4542         Builds `Trie` from associative array.
4543     */
4544     auto buildTrie(Key, Value)(Value[Key] map, Value filler=Value.init) // NOTE: `Key` and `Value` here shadow the outer template parameters
4545     {
4546         import std.array : array;
4547         import std.range : zip;
4548         auto range = array(zip(map.values, map.keys)); // (value, key) pairs, the layout the pair overloads read
4549         return buildTrie(range, filler, true); // sort it
4550     }
4551 }
4552 
4553 // helper in place of assumeSize to
4554 //reduce mangled name & help DMD inline Trie functors
struct clamp(size_t bits)
{
    // number of bits this stage is declared to occupy
    enum bitSize = bits;

    // hands `arg` back converted to `size_t`; no masking is performed
    static size_t opCall(T)(T arg)
    {
        return arg;
    }
}
4560 
struct clampIdx(size_t idx, size_t bits)
{
    // number of bits this stage is declared to occupy
    enum bitSize = bits;

    // picks element `idx` of `arg` and returns it as `size_t`; no masking is performed
    static size_t opCall(T)(T arg)
    {
        return arg[idx];
    }
}
4566 
4567 /**
4568     Conceptual type that outlines the common properties of all UTF Matchers.
4569 
4570     Note: For illustration purposes only, every method
4571     call results in assertion failure.
4572     Use $(LREF utfMatcher) to obtain a concrete matcher
4573     for UTF-8 or UTF-16 encodings.
4574 */
4575 public struct MatcherConcept
4576 {
4577     /**
4578         $(P Perform the semantic equivalent of 2 operations:
4579         decoding a $(CODEPOINT) at front of `inp` and testing if
4580         it belongs to the set of $(CODEPOINTS) of this matcher. )
4581 
4582         $(P The effect on `inp` depends on the kind of function called:)
4583 
4584         $(P Match. If the codepoint is found in the set then range `inp`
4585         is advanced by its size in $(S_LINK Code unit, code units),
4586         otherwise the range is not modified.)
4587 
4588         $(P Skip. The range is always advanced by the size
4589         of the tested $(CODEPOINT) regardless of the result of test.)
4590 
4591         $(P Test. The range is left unaffected regardless
4592         of the result of test.)
4593     */
4594     public bool match(Range)(ref Range inp)
4595         if (isRandomAccessRange!Range && is(ElementType!Range : char))
4596     {
4597        assert(false);
4598     }
4599 
4600     ///ditto
4601     public bool skip(Range)(ref Range inp)
4602         if (isRandomAccessRange!Range && is(ElementType!Range : char))
4603     {
4604         assert(false);
4605     }
4606 
4607     ///ditto
4608     public bool test(Range)(ref Range inp)
4609         if (isRandomAccessRange!Range && is(ElementType!Range : char))
4610     {
4611         assert(false);
4612     }
4613     ///
4614     pure @safe unittest
4615     {
4616         string truth = "2² = 4";
4617         auto m = utfMatcher!char(unicode.Number);
4618         assert(m.match(truth)); // '2' is a number all right
4619         assert(truth == "² = 4"); // skips on match
4620         assert(m.match(truth)); // so is the superscript '2'
4621         assert(!m.match(truth)); // space is not a number
4622         assert(truth == " = 4"); // unaffected on no match
4623         assert(!m.skip(truth)); // same test ...
4624         assert(truth == "= 4"); // but skips a codepoint regardless
4625         assert(!m.test(truth)); // '=' is not a number
4626         assert(truth == "= 4"); // test never affects argument
4627     }
4628 
4629     /**
4630         Advanced feature - provide direct access to a subset of matcher based on a
4631         set of known encoding lengths. Lengths are provided in
4632         $(S_LINK Code unit, code units). The sub-matcher then may do fewer
4633         operations per `test`/`match`.
4634 
4635         Use with care as the sub-matcher won't match
4636         any $(CODEPOINTS) that have encoded length that doesn't belong
4637         to the selected set of lengths. Also the sub-matcher object references
4638         the parent matcher and must not be used past the lifetime
4639         of the latter.
4640 
4641         Another caveat of using sub-matcher is that skip is not available
4642         precisely because sub-matcher doesn't detect all lengths.
4643     */
4644     @property auto subMatcher(Lengths...)()
4645     {
4646         assert(0);
4647         return this;
4648     }
4649 
4650     pure @safe unittest
4651     {
4652         auto m = utfMatcher!char(unicode.Number);
4653         string square = "2²";
4654         // about sub-matchers
4655         assert(!m.subMatcher!(2,3,4).test(square)); // ASCII not covered
4656         assert(m.subMatcher!1.match(square)); // ASCII-only, works
4657         assert(!m.subMatcher!1.test(square)); // unicode '²'
4658         assert(m.subMatcher!(2,3,4).match(square));  //
4659         assert(square == "");
4660         wstring wsquare = "2²";
4661         auto m16 = utfMatcher!wchar(unicode.Number);
4662         // may keep ref, but the original (m16) must be kept alive
4663         auto bmp = m16.subMatcher!1;
4664         assert(bmp.match(wsquare)); // Okay, in basic multilingual plane
4665         assert(bmp.match(wsquare)); // And '²' too
4666     }
4667 }
4668 
/**
    Test if `M` is an UTF Matcher for ranges of `C`.

    `M` qualifies when `match` and `test` (and `skip`, if it is provided at all)
    return `bool` both for the code unit range obtained via `decoder`
    and for plain arrays of `C`.
*/
public enum isUtfMatcher(M, C) = __traits(compiles, (){
    C[] s;
    auto d = s.decoder;
    M m;
    // These have to be `static assert`s: a run-time `assert` of a false
    // `is` expression still compiles, so it could never make this lambda
    // (and hence the whole trait) fail.
    static assert(is(typeof(m.match(d)) == bool));
    static assert(is(typeof(m.test(d)) == bool));
    // `skip` is optional, but when present it must follow the same contract
    static if (is(typeof(m.skip(d))))
    {
        static assert(is(typeof(m.skip(d)) == bool));
        static assert(is(typeof(m.skip(s)) == bool));
    }
    static assert(is(typeof(m.match(s)) == bool));
    static assert(is(typeof(m.test(s)) == bool));
});
4686 
4687 pure @safe unittest
4688 {
4689     alias CharMatcher = typeof(utfMatcher!char(CodepointSet.init)); // matcher type produced for `char`
4690     alias WcharMatcher = typeof(utfMatcher!wchar(CodepointSet.init)); // matcher type produced for `wchar`
4691     static assert(isUtfMatcher!(CharMatcher, char));
4692     static assert(isUtfMatcher!(CharMatcher, immutable(char))); // string element type
4693     static assert(isUtfMatcher!(WcharMatcher, wchar));
4694     static assert(isUtfMatcher!(WcharMatcher, immutable(wchar))); // wstring element type
4695 }
4696 
4697 enum Mode {
4698     alwaysSkip, // used by `skip`: consume the code point whether or not it matched
4699     neverSkip, // used by `test`: leave the input untouched
4700     skipOnMatch // used by `match`: consume the code point only when it matched
4701 }
4702 
4703 mixin template ForwardStrings()
4704 {
4705     private bool fwdStr(string fn, C)(ref C[] str) const @trusted // `fn` is the name of the member to call
4706     {
4707         import std.utf : byCodeUnit;
4708         alias type = typeof(byCodeUnit(str));
4709         return mixin(fn~"(*cast(type*)&str)"); // NOTE(review): reinterprets `str` in place as its byCodeUnit wrapper, which presumes both share one layout
4710     }
4711 }
4712 
4713 template Utf8Matcher()
4714 {
4715     enum validSize(int sz) = sz >= 1 && sz <= 4;
4716 
4717     void badEncoding() pure @safe
4718     {
4719         import std.utf : UTFException;
4720         throw new UTFException("Invalid UTF-8 sequence");
4721     }
4722 
4723     //for 1-stage ASCII
4724     alias AsciiSpec = AliasSeq!(bool, char, clamp!7);
4725     //for 2-stage lookup of 2 byte UTF-8 sequences
4726     alias Utf8Spec2 = AliasSeq!(bool, char[2],
4727         clampIdx!(0, 5), clampIdx!(1, 6));
4728     //ditto for 3 byte
4729     alias Utf8Spec3 = AliasSeq!(bool, char[3],
4730         clampIdx!(0, 4),
4731         clampIdx!(1, 6),
4732         clampIdx!(2, 6)
4733     );
4734     //ditto for 4 byte
4735     alias Utf8Spec4 = AliasSeq!(bool, char[4],
4736         clampIdx!(0, 3), clampIdx!(1, 6),
4737         clampIdx!(2, 6), clampIdx!(3, 6)
4738     );
4739     alias Tables = AliasSeq!(
4740         typeof(TrieBuilder!(AsciiSpec)(false).build()),
4741         typeof(TrieBuilder!(Utf8Spec2)(false).build()),
4742         typeof(TrieBuilder!(Utf8Spec3)(false).build()),
4743         typeof(TrieBuilder!(Utf8Spec4)(false).build())
4744     );
4745     alias Table(int size) = Tables[size-1]; // table type for sequences of `size` code units
4746 
4747     enum leadMask(size_t size) = (cast(size_t) 1<<(7 - size))-1; // low (7 - size) bits set
4748     enum encMask(size_t size) = ((1 << size)-1)<<(8-size); // top `size` bits of a byte set
4749 
4750     char truncate()(char ch) pure @safe
4751     {
4752         ch -= 0x80; // only 0x80 .. 0xBF end up below 0x40 after this
4753         if (ch < 0x40)
4754         {
4755             return ch;
4756         }
4757         else
4758         {
4759             badEncoding();
4760             return cast(char) 0;
4761         }
4762     }
4763 
4764     static auto encode(size_t sz)(dchar ch)
4765         if (sz > 1)
4766     {
4767         import std.utf : encodeUTF = encode;
4768         char[4] buf;
4769         encodeUTF(buf, ch);
4770         char[sz] ret;
4771         buf[0] &= leadMask!sz; // strip the length marker bits from the lead byte
4772         foreach (n; 1 .. sz)
4773             buf[n] = buf[n] & 0x3f; //keep 6 lower bits
4774         ret[] = buf[0 .. sz];
4775         return ret;
4776     }
4777 
4778     auto build(Set)(Set set)
4779     {
4780         import std.algorithm.iteration : map;
4781         auto ascii = set & unicode.ASCII;
4782         auto utf8_2 = set & CodepointSet(0x80, 0x800);
4783         auto utf8_3 = set & CodepointSet(0x800, 0x1_0000);
4784         auto utf8_4 = set & CodepointSet(0x1_0000, lastDchar+1);
4785         auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec);
4786         auto utf8_2T = utf8_2.byCodepoint.map!(x=>encode!2(x)).buildTrie!(Utf8Spec2);
4787         auto utf8_3T = utf8_3.byCodepoint.map!(x=>encode!3(x)).buildTrie!(Utf8Spec3);
4788         auto utf8_4T = utf8_4.byCodepoint.map!(x=>encode!4(x)).buildTrie!(Utf8Spec4);
4789         alias Ret = Impl!(1,2,3,4);
4790         return Ret(asciiT, utf8_2T, utf8_3T, utf8_4T);
4791     }
4792 
4793     // Bootstrap UTF-8 static matcher interface
4794     // from 3 primitives: tab!(size), lookup and Sizes
4795     mixin template DefMatcher()
4796     {
4797         import std.format : format;
4798         import std.meta : Erase, staticIndexOf;
4799         enum hasASCII = staticIndexOf!(1, Sizes) >= 0;
4800         alias UniSizes = Erase!(1, Sizes);
4801 
4802         //generate dispatch code sequence for unicode parts
4803         static auto genDispatch()
4804         {
4805             string code;
4806             foreach (size; UniSizes)
4807                 code ~= format(q{
4808                     if ((ch & ~leadMask!%d) == encMask!(%d))
4809                         return lookup!(%d, mode)(inp);
4810                     else
4811                 }, size, size, size);
4812             static if (Sizes.length == 4) //covers all code unit cases
4813                 code ~= "{ badEncoding(); return false; }";
4814             else
4815                 code ~= "return false;"; //may be just fine but not covered
4816             return code;
4817         }
4818         enum dispatch = genDispatch(); // the mixed-in code refers to locals named `ch`, `inp` and `mode`
4819 
4820         public bool match(Range)(ref Range inp) const
4821             if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
4822                 !isDynamicArray!Range)
4823         {
4824             enum mode = Mode.skipOnMatch;
4825             assert(!inp.empty);
4826             immutable ch = inp[0];
4827             static if (hasASCII)
4828             {
4829                 if (ch < 0x80)
4830                 {
4831                     immutable r = tab!1[ch];
4832                     if (r)
4833                         inp.popFront();
4834                     return r;
4835                 }
4836                 else
4837                     mixin(dispatch);
4838             }
4839             else
4840                 mixin(dispatch);
4841         }
4842 
4843         static if (Sizes.length == 4) // can skip iff can detect all encodings
4844         {
4845             public bool skip(Range)(ref Range inp) const
4846                 if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
4847                     !isDynamicArray!Range)
4848             {
4849                 enum mode = Mode.alwaysSkip;
4850                 assert(!inp.empty);
4851                 auto ch = inp[0];
4852                 static if (hasASCII)
4853                 {
4854                     if (ch < 0x80)
4855                     {
4856                         inp.popFront();
4857                         return tab!1[ch];
4858                     }
4859                     else
4860                         mixin(dispatch);
4861                 }
4862                 else
4863                     mixin(dispatch);
4864             }
4865         }
4866 
4867         public bool test(Range)(ref Range inp) const
4868             if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
4869                 !isDynamicArray!Range)
4870         {
4871             enum mode = Mode.neverSkip;
4872             assert(!inp.empty);
4873             auto ch = inp[0];
4874             static if (hasASCII)
4875             {
4876                 if (ch < 0x80)
4877                     return tab!1[ch];
4878                 else
4879                     mixin(dispatch);
4880             }
4881             else
4882                 mixin(dispatch);
4883         }
4884 
4885         bool match(C)(ref C[] str) const
4886             if (isSomeChar!C)
4887         {
4888             return fwdStr!"match"(str);
4889         }
4890 
4891         bool skip(C)(ref C[] str) const
4892             if (isSomeChar!C)
4893         {
4894             return fwdStr!"skip"(str);
4895         }
4896 
4897         bool test(C)(ref C[] str) const
4898             if (isSomeChar!C)
4899         {
4900             return fwdStr!"test"(str);
4901         }
4902 
4903         mixin ForwardStrings;
4904     }
4905 
4906     struct Impl(Sizes...)
4907     {
4908         import std.meta : allSatisfy, staticMap;
4909         static assert(allSatisfy!(validSize, Sizes),
4910             "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8");
4911     private:
4912         //pick tables for chosen sizes
4913         alias OurTabs = staticMap!(Table, Sizes);
4914         OurTabs tables;
4915         mixin DefMatcher;
4916         //static dispatch helper UTF size ==> table
4917         alias tab(int i) = tables[i - 1];
4918 
4919         package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)()
4920         {
4921             return CherryPick!(Impl, SizesToPick)(&this); // the sub-matcher keeps a pointer to this matcher
4922         }
4923 
4924         bool lookup(int size, Mode mode, Range)(ref Range inp) const
4925         {
4926             import std.range : popFrontN;
4927             if (inp.length < size)
4928             {
4929                 badEncoding();
4930                 return false;
4931             }
4932             char[size] needle = void;
4933             needle[0] = leadMask!size & inp[0];
4934             static foreach (i; 1 .. size)
4935             {
4936                 needle[i] = truncate(inp[i]);
4937             }
4938             //overlong encoding checks
4939             static if (size == 2)
4940             {
4941                 //0x80-0x7FF
4942                 //got 6 bits in needle[1], must use at least 8 bits
4943                 //must use at least 2 bits in needle[0]
4944                 if (needle[0] < 2) badEncoding();
4945             }
4946             else static if (size == 3)
4947             {
4948                 //0x800-0xFFFF
4949                 //got 6 bits in needle[2], must use at least 12bits
4950                 //must use 6 bits in needle[1] or anything in needle[0]
4951                 if (needle[0] == 0 && needle[1] < 0x20) badEncoding();
4952             }
4953             else static if (size == 4)
4954             {
4955                 //0x10000-0x10FFFF
4956                 //got 2x6=12 bits in needle[2] and needle[3], must use at least 17bits
4957                 //must use 5 bits (or above) in needle[1] or anything in needle[0]
4958                 if (needle[0] == 0 && needle[1] < 0x10) badEncoding();
4959             }
4960             static if (mode == Mode.alwaysSkip)
4961             {
4962                 inp.popFrontN(size);
4963                 return tab!size[needle];
4964             }
4965             else static if (mode == Mode.neverSkip)
4966             {
4967                 return tab!size[needle];
4968             }
4969             else
4970             {
4971                 static assert(mode == Mode.skipOnMatch);
4972                 if (tab!size[needle])
4973                 {
4974                     inp.popFrontN(size);
4975                     return true;
4976                 }
4977                 else
4978                     return false;
4979             }
4980         }
4981     }
4982 
4983     struct CherryPick(I, Sizes...)
4984     {
4985         import std.meta : allSatisfy;
4986         static assert(allSatisfy!(validSize, Sizes),
4987             "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8");
4988     private:
4989         I* m; // the matcher whose tables and `lookup` are reused
4990         @property auto tab(int i)() const { return m.tables[i - 1]; }
4991         bool lookup(int size, Mode mode, Range)(ref Range inp) const
4992         {
4993             return m.lookup!(size, mode)(inp);
4994         }
4995         mixin DefMatcher;
4996     }
4997 }
4998 
4999 template Utf16Matcher()
5000 {
5001     enum validSize(int sz) = sz >= 1 && sz <= 2;
5002 
5003     void badEncoding() pure @safe
5004     {
5005         import std.utf : UTFException;
5006         throw new UTFException("Invalid UTF-16 sequence");
5007     }
5008 
5009     // 1-stage ASCII
5010     alias AsciiSpec = AliasSeq!(bool, wchar, clamp!7);
5011     //2-stage BMP
5012     alias BmpSpec = AliasSeq!(bool, wchar, sliceBits!(7, 16), sliceBits!(0, 7));
5013     //4-stage - full Unicode
5014     //assume that 0xD800 & 0xDC00 bits are cleared
5015     //thus leaving 10 bit per wchar to worry about
5016     alias UniSpec = AliasSeq!(bool, wchar[2],
5017         assumeSize!(x=>x[0]>>4, 6), assumeSize!(x=>x[0]&0xf, 4),
5018         assumeSize!(x=>x[1]>>6, 4), assumeSize!(x=>x[1]&0x3f, 6),
5019     );
5020     alias Ascii = typeof(TrieBuilder!(AsciiSpec)(false).build());
5021     alias Bmp = typeof(TrieBuilder!(BmpSpec)(false).build());
5022     alias Uni = typeof(TrieBuilder!(UniSpec)(false).build());
5023 
5024     auto encode2(dchar ch)
5025     {
5026         ch -= 0x1_0000;
5027         assert(ch <= 0xF_FFFF);
5028         wchar[2] ret;
5029         //do not put surrogate bits, they are sliced off
5030         ret[0] = cast(wchar)(ch >> 10);
5031         ret[1] = (ch & 0xFFF);
5032         return ret;
5033     }
5034 
5035     auto build(Set)(Set set)
5036     {
5037         import std.algorithm.iteration : map;
5038         auto ascii = set & unicode.ASCII;
5039         auto bmp = (set & CodepointSet.fromIntervals(0x80, 0xFFFF+1))
5040             - CodepointSet.fromIntervals(0xD800, 0xDFFF+1);
5041         auto other = set - (bmp | ascii);
5042         auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec);
5043         auto bmpT = bmp.byCodepoint.map!(x=>cast(wchar) x).buildTrie!(BmpSpec);
5044         auto otherT = other.byCodepoint.map!(x=>encode2(x)).buildTrie!(UniSpec);
5045         alias Ret = Impl!(1,2);
5046         return Ret(asciiT, bmpT, otherT);
5047     }
5048 
5049     //bootstrap full UTF-16 matcher interface from
5050     //sizeFlags, lookupUni and ascii
5051     mixin template DefMatcher()
5052     {
5053         public bool match(Range)(ref Range inp) const
5054             if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
5055                 !isDynamicArray!Range)
5056         {
5057             enum mode = Mode.skipOnMatch;
5058             assert(!inp.empty);
5059             immutable ch = inp[0];
5060             static if (sizeFlags & 1)
5061             {
5062                 if (ch < 0x80)
5063                 {
5064                   if (ascii[ch])
5065                   {
5066                       inp.popFront();
5067                       return true;
5068                   }
5069                   else
5070                       return false;
5071                 }
5072                 return lookupUni!mode(inp);
5073             }
5074             else
5075                 return lookupUni!mode(inp);
5076         }
5077 
5078         static if (Sizes.length == 2)
5079         {
5080             public bool skip(Range)(ref Range inp) const
5081                 if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
5082                     !isDynamicArray!Range)
5083             {
5084                 enum mode = Mode.alwaysSkip;
5085                 assert(!inp.empty);
5086                 immutable ch = inp[0];
5087                 static if (sizeFlags & 1)
5088                 {
5089                     if (ch < 0x80)
5090                     {
5091                         inp.popFront();
5092                         return ascii[ch];
5093                     }
5094                     else
5095                         return lookupUni!mode(inp);
5096                 }
5097                 else
5098                     return lookupUni!mode(inp);
5099             }
5100         }
5101 
5102         public bool test(Range)(ref Range inp) const
5103             if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
5104                 !isDynamicArray!Range)
5105         {
5106             enum mode = Mode.neverSkip;
5107             assert(!inp.empty);
5108             auto ch = inp[0];
5109             static if (sizeFlags & 1)
5110                 return ch < 0x80 ? ascii[ch] : lookupUni!mode(inp);
5111             else
5112                 return lookupUni!mode(inp);
5113         }
5114 
5115         bool match(C)(ref C[] str) const
5116             if (isSomeChar!C)
5117         {
5118             return fwdStr!"match"(str);
5119         }
5120 
5121         bool skip(C)(ref C[] str) const
5122             if (isSomeChar!C)
5123         {
5124             return fwdStr!"skip"(str);
5125         }
5126 
5127         bool test(C)(ref C[] str) const
5128             if (isSomeChar!C)
5129         {
5130             return fwdStr!"test"(str);
5131         }
5132 
5133         mixin ForwardStrings; //dispatch strings to range versions
5134     }
5135 
    // UTF-16 matcher parameterized by the code unit counts (`Sizes`) it handles.
    // The public match/skip/test entry points come from `DefMatcher`;
    // this struct supplies the lookup tables and `lookupUni`.
    struct Impl(Sizes...)
        if (Sizes.length >= 1 && Sizes.length <= 2)
    {
    private:
        import std.meta : allSatisfy;
        static assert(allSatisfy!(validSize, Sizes),
            "Only lengths of 1 and 2 code units are possible in UTF-16");
        // bit-wise OR of all requested sizes: bit 1 - one code unit, bit 2 - two code units
        static if (Sizes.length > 1)
            enum sizeFlags = Sizes[0] | Sizes[1];
        else
            enum sizeFlags = Sizes[0];

        // tables for sequences of one code unit
        static if (sizeFlags & 1)
        {
            Ascii ascii;
            Bmp bmp;
        }
        // table for sequences of two code units (surrogate pairs)
        static if (sizeFlags & 2)
        {
            Uni uni;
        }
        mixin DefMatcher;

        // Returns a CherryPick wrapper that holds a pointer to this matcher.
        package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)()
        {
            return CherryPick!(Impl, SizesToPick)(&this);
        }

        // Looks up the code point at the front of `inp`.
        // `mode` selects whether the input is advanced: always,
        // only on a successful match, or never.
        // Calls badEncoding() on a lone low surrogate, on a high surrogate
        // without a following code unit, or one not followed by a low surrogate.
        bool lookupUni(Mode mode, Range)(ref Range inp) const
        {
            // wraps around for units below 0xD800, so those end up above 0x7FF
            wchar x = cast(wchar)(inp[0] - 0xD800);
            //not a high surrogate
            if (x > 0x3FF)
            {
                //low surrogate
                if (x <= 0x7FF) badEncoding();
                static if (sizeFlags & 1)
                {
                    auto ch = inp[0];
                    static if (mode == Mode.alwaysSkip)
                        inp.popFront();
                    static if (mode == Mode.skipOnMatch)
                    {
                        if (bmp[ch])
                        {
                            inp.popFront();
                            return true;
                        }
                        else
                            return false;
                    }
                    else
                        return bmp[ch];
                }
                else //skip is not available for sub-matchers, so just false
                    return false;
            }
            else
            {
                import std.range : popFrontN;
                static if (sizeFlags & 2)
                {
                    if (inp.length < 2)
                        badEncoding();
                    wchar y = cast(wchar)(inp[1] - 0xDC00);
                    //not a low surrogate
                    if (y > 0x3FF)
                        badEncoding();
                    // the key is the low 10 bits of each surrogate
                    wchar[2] needle = [inp[0] & 0x3ff, inp[1] & 0x3ff];
                    static if (mode == Mode.alwaysSkip)
                        inp.popFrontN(2);
                    static if (mode == Mode.skipOnMatch)
                    {
                        if (uni[needle])
                        {
                            inp.popFrontN(2);
                            return true;
                        }
                        else
                            return false;
                    }
                    else
                        return uni[needle];
                }
                else //ditto
                    return false;
            }
        }
    }
5225 
5226     struct CherryPick(I, Sizes...)
5227         if (Sizes.length >= 1 && Sizes.length <= 2)
5228     {
5229     private:
5230         import std.meta : allSatisfy;
5231         I* m;
5232         enum sizeFlags = I.sizeFlags;
5233 
5234         static if (sizeFlags & 1)
5235         {
5236             @property auto ascii()() const { return m.ascii; }
5237         }
5238 
5239         bool lookupUni(Mode mode, Range)(ref Range inp) const
5240         {
5241             return m.lookupUni!mode(inp);
5242         }
5243         mixin DefMatcher;
5244         static assert(allSatisfy!(validSize, Sizes),
5245             "Only lengths of 1 and 2 code units are possible in UTF-16");
5246     }
5247 }
5248 
// Builds a UTF-8 matcher for `set`.
private auto utf8Matcher(Set)(Set set)
{
    alias Builder = Utf8Matcher!();
    return Builder.build(set);
}
5253 
// Builds a UTF-16 matcher for `set`.
private auto utf16Matcher(Set)(Set set)
{
    alias Builder = Utf16Matcher!();
    return Builder.build(set);
}
5258 
/**
    Constructs a matcher object
    to classify $(CODEPOINTS) from the `set` for encoding
    that has `Char` as code unit.

    Only `char` (UTF-8) and `wchar` (UTF-16) code units are supported;
    any other `Char` is rejected at compile time.

    See $(LREF MatcherConcept) for API outline.
*/
public auto utfMatcher(Char, Set)(Set set)
if (isCodepointSet!Set)
{
    static if (is(Char : char))
        return utf8Matcher(set);
    else static if (is(Char : wchar))
        return utf16Matcher(set);
    else static if (is(Char : dchar))
        // the message is concatenated from two literals so that the
        // diagnostic does not contain a line break and source indentation
        static assert(false, "UTF-32 needs no decoding, "
            ~ "and thus is not supported by utfMatcher");
    else
        static assert(false, "Only character types 'char' and 'wchar' are allowed");
}
5279 
5280 
// Wraps the array `s` into a random access range of code units that
// keeps the current position as an index (starting at `offset`),
// so moving forward is a plain increment.
package(std) auto decoder(C)(C[] s, size_t offset=0)
if (is(C : wchar) || is(C : char))
{
    static struct Decoder
    {
    pure nothrow:
        C[] str;    // underlying code units
        size_t idx; // position of the current front in `str`

        @property bool empty()
        {
            return str.length == idx;
        }

        @property size_t length()
        {
            return str.length - idx;
        }

        alias opDollar = length;

        @property C front()
        {
            return str[idx];
        }

        @property C back()
        {
            return str[$-1];
        }

        void popFront()
        {
            ++idx;
        }

        void popFrontN(size_t n)
        {
            idx += n;
        }

        void popBack()
        {
            str = str[0..$-1];
        }

        @property auto save()
        {
            return this;
        }

        // indices are relative to the current position
        auto opIndex(size_t i)
        {
            return str[idx+i];
        }

        auto opSlice(size_t a, size_t b)
        {
            return Decoder(str[0 .. idx+b], idx+a);
        }
    }
    static assert(isRandomAccessRange!Decoder);
    static assert(is(ElementType!Decoder : C));
    return Decoder(s, offset);
}
5306 
// Walks the UTF-8 matcher for unicode.Letter and two of its sub-matchers
// over a string of mixed ASCII and non-ASCII text, checking the result of
// test/match/skip and the position of the decoder after each call.
pure @safe unittest
{
    string rs = "hi! ﾈемног砀 текста";
    auto codec = rs.decoder;
    auto utf8 =  utf8Matcher(unicode.Letter);
    auto asc = utf8.subMatcher!(1);
    auto uni = utf8.subMatcher!(2,3,4);
    // 'h'
    assert(asc.test(codec));
    assert(!uni.match(codec));
    assert(utf8.skip(codec));
    assert(codec.idx == 1);

    // 'i'
    assert(!uni.match(codec));
    assert(asc.test(codec));
    assert(utf8.skip(codec));
    assert(codec.idx == 2);
    // '!' is not a letter, match must not consume it
    assert(!asc.match(codec));

    assert(!utf8.test(codec));
    assert(!utf8.skip(codec));

    assert(!asc.test(codec));
    assert(!utf8.test(codec));
    assert(!utf8.skip(codec));
    assert(utf8.test(codec));
    // the run of 7 non-ASCII letters
    foreach (i; 0 .. 7)
    {
        assert(!asc.test(codec));
        assert(uni.test(codec));
        assert(utf8.skip(codec));
    }
    assert(!utf8.test(codec));
    assert(!utf8.skip(codec));
    //the same with match where applicable
    codec = rs.decoder;
    assert(utf8.match(codec));
    assert(codec.idx == 1);
    assert(utf8.match(codec));
    assert(codec.idx == 2);
    // a failed match leaves the position untouched
    assert(!utf8.match(codec));
    assert(codec.idx == 2);
    assert(!utf8.skip(codec));
    assert(!utf8.skip(codec));

    foreach (i; 0 .. 7)
    {
        assert(!asc.test(codec));
        assert(utf8.test(codec));
        assert(utf8.match(codec));
    }
    auto i = codec.idx;
    assert(!utf8.match(codec));
    assert(codec.idx == i);
}
5361 
// Cross-checks test/match/skip of the UTF-8 and UTF-16 matchers (and their
// sub-matchers) against each other on every third code point of unicode.L.
pure @safe unittest
{
    import std.range : stride;
    // Runs test, match and (if the matcher has it) skip on the same input,
    // restoring the position in between; returns the result of test.
    static bool testAll(Matcher, Range)(scope ref Matcher m, ref Range r)
    {
        bool t = m.test(r);
        auto save = r.idx;
        assert(t == m.match(r));
        assert(r.idx == save || t); //either no change or was match
        r.idx = save;
        static if (is(typeof(m.skip(r))))
        {
            assert(t == m.skip(r));
            assert(r.idx != save); //always changed
            r.idx = save;
        }
        return t;
    }
    auto utf16 = utfMatcher!wchar(unicode.L);
    auto bmp = utf16.subMatcher!1;
    // was subMatcher!1, a copy of the line above; the assertion below
    // is about sequences of 2 code units
    auto nonBmp = utf16.subMatcher!2;
    auto utf8 = utfMatcher!char(unicode.L);
    auto ascii = utf8.subMatcher!1;
    auto uni2 = utf8.subMatcher!2;
    auto uni3 = utf8.subMatcher!3;
    auto uni24 = utf8.subMatcher!(2,4);
    foreach (ch; unicode.L.byCodepoint.stride(3))
    {
        import std.utf : encode;
        char[4] buf;
        wchar[2] buf16;
        auto len = encode(buf, ch);
        auto len16 = encode(buf16, ch);
        auto c8 = buf[0 .. len].decoder;
        auto c16 = buf16[0 .. len16].decoder;
        assert(testAll(utf16, c16));
        assert(testAll(bmp, c16) || len16 != 1);
        assert(testAll(nonBmp, c16) || len16 != 2);

        assert(testAll(utf8, c8));

        //submatchers return false on out of their domain
        assert(testAll(ascii, c8) || len != 1);
        assert(testAll(uni2, c8) || len != 2);
        assert(testAll(uni3, c8) || len != 3);
        assert(testAll(uni24, c8) || (len != 2 && len != 4));
    }
}
5410 
// cover decode fail cases of Matcher
pure @system unittest
{
    import std.algorithm.iteration : map;
    import std.exception : collectException;
    import std.format : format;
    auto utf16 = utfMatcher!wchar(unicode.L);
    auto utf8 = utfMatcher!char(unicode.L);
    //decode failure cases UTF-8
    // NOTE(review): `\0x00` below is a NUL followed by the text "x00",
    // not a single `\x00` - presumably unintended, but every case starts
    // with a bad sequence either way; confirm before changing the literals
    alias fails8 = AliasSeq!("\xC1", "\x80\x00","\xC0\x00", "\xCF\x79",
        "\xFF\x00\0x00\0x00\x00", "\xC0\0x80\0x80\x80", "\x80\0x00\0x00\x00",
        "\xCF\x00\0x00\0x00\x00");
    foreach (msg; fails8)
    {
        // on failure the assert message lists the offending bytes in hex
        assert(collectException((){
            auto s = msg;
            utf8.test(s);
        }()), format("%( %2x %)", cast(ubyte[]) msg));
    }
    //decode failure cases UTF-16
    // a lone high surrogate and a lone low surrogate
    alias fails16 = AliasSeq!([0xD811], [0xDC02]);
    foreach (msg; fails16)
    {
        assert(collectException((){
            auto s = msg.map!(x => cast(wchar) x);
            utf16.test(s);
        }()));
    }
}
5441 
/++
    Convenience function to construct optimal configurations for
    packed Trie from any `set` of $(CODEPOINTS).

    The parameter `level` indicates the number of trie levels to use,
    allowed values are: 1, 2, 3 or 4. Levels represent different trade-offs
    speed-size wise. Any other value is rejected at compile time.

    $(P Level 1 is fastest and the most memory hungry (a bit array). )
    $(P Level 4 is the slowest and has the smallest footprint. )

    See the $(S_LINK Synopsis, Synopsis) section for example.

    Note:
    Level 4 stays very practical (being faster and more predictable)
    compared to using direct lookup on the `set` itself.


+/
public auto toTrie(size_t level, Set)(Set set)
if (isCodepointSet!Set)
{
    // each configuration splits the 21 bits of a code point between levels
    static if (level == 1)
        return codepointSetTrie!(21)(set);
    else static if (level == 2)
        return codepointSetTrie!(10, 11)(set);
    else static if (level == 3)
        return codepointSetTrie!(8, 5, 8)(set);
    else static if (level == 4)
        return codepointSetTrie!(6, 4, 4, 7)(set);
    else
        // reached for level 0 as well, hence no "levels > 4" in the message
        static assert(false,
            "Sorry, toTrie supports only levels 1, 2, 3 and 4, use codepointSetTrie directly");
}
5476 
/**
    $(P Builds a `Trie` with typically optimal speed-size trade-off
    and wraps it into a delegate of the following type:
    $(D bool delegate(dchar ch)). )

    $(P Effectively this creates a 'tester' lambda suitable
    for algorithms like std.algorithm.find that take unary predicates. )

    See the $(S_LINK Synopsis, Synopsis) section for example.
*/
public auto toDelegate(Set)(Set set)
if (isCodepointSet!Set)
{
    // 3 is very small and is almost as fast as 2-level (due to CPU caches?)
    auto trie = toTrie!3(set);
    return (dchar c) { return trie[c]; };
}
5494 
/**
    $(P Opaque wrapper around unsigned built-in integers and
    code unit (char/wchar/dchar) types.
    Parameter `sz` indicates that the value is confined
    to the range of [0, 2^^sz$(RPAREN). With this knowledge it can be
    packed more tightly when stored in certain
    data-structures like trie. )

    Note:
    $(P The $(D BitPacked!(T, sz)) is implicitly convertible to `T`
    but not vice versa. Users have to ensure the value fits in
    the range required and use the `cast`
    operator to perform the conversion.)
*/
struct BitPacked(T, size_t sz)
if (is(T : dchar) || isIntegral!T)
{
    T _value;
    enum bitSize = sz;
    alias _value this;
}
5516 
/*
    Depending on the form of the passed argument `bitSizeOf` returns
    the amount of bits required to represent a given type
    or a return type of a given functor.

    The checks are tried in order:
    a `bitSize` member usable as `size_t`, then a callable whose return type
    is a `BitPacked` instance, then the full size of the argument in bits.
*/
template bitSizeOf(Args...)
if (Args.length == 1)
{
    import std.traits : ReturnType;
    alias T = Args[0];
    // covers anything that declares bitSize, e.g. BitPacked, assumeSize, sliceBits
    static if (__traits(compiles, { size_t val = T.bitSize; })) //(is(typeof(T.bitSize) : size_t))
    {
        enum bitSizeOf = T.bitSize;
    }
    // a callable returning BitPacked: use the bit size of the returned type
    else static if (is(ReturnType!T dummy == BitPacked!(U, bits), U, size_t bits))
    {
        enum bitSizeOf = bitSizeOf!(ReturnType!T);
    }
    else
    {
        enum bitSizeOf = T.sizeof*8;
    }
}
5540 
/**
    Tests if `T` is some instantiation of $(LREF BitPacked)!(U, x)
    and thus suitable for packing.
*/
template isBitPacked(T)
{
    enum bool isBitPacked = is(T == BitPacked!(U, bits), U, size_t bits);
}
5552 
/**
    Gives the type `U` from $(LREF BitPacked)!(U, x)
    or `T` itself for every other type.
*/
template TypeOfBitPacked(T)
{
    static if (is(T == BitPacked!(Wrapped, width), Wrapped, size_t width))
    {
        alias TypeOfBitPacked = Wrapped;
    }
    else
    {
        alias TypeOfBitPacked = T;
    }
}
5564 
/*
    Wrapper, used in definition of custom data structures from `Trie` template.
    Applying it to a unary lambda function indicates that the returned value always
    fits within `bits` of bits.

    The wrapper itself does no checking: a call is passed on to `Fn` unchanged.
*/
struct assumeSize(alias Fn, size_t bits)
{
    enum bitSize = bits;

    static auto ref opCall(Arg)(auto ref Arg value)
    {
        return Fn(value);
    }
}
5578 
/*
    A helper for defining lambda function that yields a slice
    of certain bits from an unsigned integral value.
    The resulting lambda is wrapped in assumeSize and can be used directly
    with `Trie` template.

    Calling it returns bits [from, to) of the argument, shifted down to bit 0.
*/
struct sliceBits(size_t from, size_t to)
{
    //for now bypass assumeSize, DMD has trouble inlining it
    enum bitSize = to-from;

    static auto opCall(T)(T value)
    out(sliced)
    {
        assert(sliced < (1 << bitSize));
    }
    do
    {
        static assert(from < to);
        // ones in the low `bitSize` bits
        enum mask = (1 << bitSize) - 1;
        static if (from != 0)
            return (value >> from) & mask;
        else
            return value & mask;
    }
}
5603 
// bits 0 .. 7 of x
@safe pure nothrow @nogc uint low_8(uint x)
{
    return x & 0xFF;
}
// bits 8 .. 15 of x
@safe pure nothrow @nogc uint midlow_8(uint x)
{
    return (x >> 8) & 0xFF;
}
alias lo8 = assumeSize!(low_8, 8);
alias mlo8 = assumeSize!(midlow_8, 8);
5608 
// bitSizeOf picks up the `bitSize` member of each kind of wrapper
@safe pure nothrow @nogc unittest
{
    static assert(bitSizeOf!lo8 == 8);
    static assert(bitSizeOf!(sliceBits!(4, 7)) == 3);
    static assert(bitSizeOf!(BitPacked!(uint, 2)) == 2);
}
5615 
// Compile-time sequence of the values start, start+1, .. , end-1
// (empty when start >= end).
template Sequence(size_t start, size_t end)
{
    static if (start >= end)
    {
        alias Sequence = AliasSeq!();
    }
    else
    {
        alias Sequence = AliasSeq!(start, Sequence!(start+1, end));
    }
}
5623 
//---- TRIE TESTS ----
// Builds tries with 1 to 4 levels over small hand-made sets, over strings
// keyed by the first character and over all ubyte values, and compares
// the lookups with the source data.
@system unittest
{
    import std.algorithm.iteration : map;
    import std.algorithm.sorting : sort;
    import std.array : array;
    import std.conv : text, to;
    import std.range : iota;
    // prints per-level footprint, only when compiled with version std_uni_stats
    static trieStats(TRIE)(TRIE t)
    {
        version (std_uni_stats)
        {
            import std.stdio : writefln, writeln;
            writeln("---TRIE FOOTPRINT STATS---");
            static foreach (i; 0 .. t.table.dim)
            {
                writefln("lvl%s = %s bytes;  %s pages"
                         , i, t.bytes!i, t.pages!i);
            }
            writefln("TOTAL: %s bytes", t.bytes);
            version (none)
            {
                writeln("INDEX (excluding value level):");
                static foreach (i; 0 .. t.table.dim-1)
                    writeln(t.table.slice!(i)[0 .. t.table.length!i]);
            }
            writeln("---------------------------");
        }
    }
    //@@@BUG link failure, lambdas not found by linker somehow (in case of trie2)
    // alias lo8   = assumeSize!(8, function (uint x) { return x&0xFF; });
    // alias next8 = assumeSize!(7, function (uint x) { return (x&0x7F00)>>8; });
    alias Set = CodepointSet;
    auto set = Set('A','Z','a','z');
    auto trie = buildTrie!(bool, uint, 256, lo8)(set.byInterval);// simple bool array
    for (int a='a'; a<'z';a++)
        assert(trie[a]);
    for (int a='A'; a<'Z';a++)
        assert(trie[a]);
    for (int a=0; a<'A'; a++)
        assert(!trie[a]);
    for (int a ='Z'; a<'a'; a++)
        assert(!trie[a]);
    trieStats(trie);

    // 2 levels: the same pattern repeated at offsets of 256 and 512
    auto redundant2 = Set(
        1, 18, 256+2, 256+111, 512+1, 512+18, 768+2, 768+111);
    auto trie2 = buildTrie!(bool, uint, 1024, mlo8, lo8)(redundant2.byInterval);
    trieStats(trie2);
    foreach (e; redundant2.byCodepoint)
        assert(trie2[e], text(cast(uint) e, " - ", trie2[e]));
    foreach (i; 0 .. 1024)
    {
        assert(trie2[i] == (i in redundant2));
    }


    // 3 levels, indexed with sliceBits
    auto redundant3 = Set(
          2,    4,    6,    8,    16,
       2+16, 4+16, 16+6, 16+8, 16+16,
       2+32, 4+32, 32+6, 32+8,
      );

    enum max3 = 256;
    // sliceBits
    auto trie3 = buildTrie!(bool, uint, max3,
            sliceBits!(6,8), sliceBits!(4,6), sliceBits!(0,4)
        )(redundant3.byInterval);
    trieStats(trie3);
    foreach (i; 0 .. max3)
        assert(trie3[i] == (i in redundant3), text(cast(uint) i));

    // 4 levels over 16 bits; only presence of the set members is checked
    auto redundant4 = Set(
            10, 64, 64+10, 128, 128+10, 256, 256+10, 512,
            1000, 2000, 3000, 4000, 5000, 6000
        );
    enum max4 = 2^^16;
    auto trie4 = buildTrie!(bool, size_t, max4,
            sliceBits!(13, 16), sliceBits!(9, 13), sliceBits!(6, 9) , sliceBits!(0, 6)
        )(redundant4.byInterval);
    foreach (i; 0 .. max4)
    {
        if (i in redundant4)
            assert(trie4[i], text(cast(uint) i));
    }
    trieStats(trie4);

        // string keys, sorted by the same mapping the trie is built with
        alias mapToS = mapTrieIndex!(useItemAt!(0, char));
        string[] redundantS = ["tea", "start", "orange"];
        redundantS.sort!((a,b) => mapToS(a) < mapToS(b))();
        auto strie = buildTrie!(bool, string, useItemAt!(0, char))(redundantS);
        // using first char only
        assert(redundantS == ["orange", "start", "tea"]);
        assert(strie["test"], text(strie["test"]));
        assert(!strie["aea"]);
        assert(strie["s"]);

    // a bit size test
    auto a = array(map!(x => to!ubyte(x))(iota(0, 256)));
    auto bt = buildTrie!(bool, ubyte, sliceBits!(7, 8), sliceBits!(5, 7), sliceBits!(0, 5))(a);
    trieStats(bt);
    foreach (i; 0 .. 256)
        assert(bt[cast(ubyte) i]);
}
5728 
// Trie index function: maps an array to its element at position `idx`,
// declared to fit in the full bit width of `T`.
template useItemAt(size_t idx, T)
if (is(T: dchar) || isIntegral!T)
{
    size_t impl(const scope T[] items)
    {
        return items[idx];
    }

    alias useItemAt = assumeSize!(impl, 8*T.sizeof);
}
5735 
// Trie index function: maps an array to its last element,
// declared to fit in the full bit width of `T`.
template useLastItem(T)
{
    size_t impl(const scope T[] items)
    {
        return items[items.length - 1];
    }

    alias useLastItem = assumeSize!(impl, 8*T.sizeof);
}
5741 
// Sum of bitSizeOf over all of `Prefix` (0 for an empty list).
template fullBitSize(Prefix...)
{
    static if (Prefix.length == 0)
    {
        enum fullBitSize = 0;
    }
    else
    {
        enum fullBitSize = bitSizeOf!(Prefix[0])+fullBitSize!(Prefix[1..$]);
    }
}
5749 
// Index types of all trie levels except the last (value) level,
// one BitPacked type per level, listed from the top level down.
template idxTypes(Key, size_t fullBits, Prefix...)
{
    static if (Prefix.length != 1)
    {
        // Important note on bit packing
        // Each level has to hold enough of bits to address the next one
        // The bottom level is known to hold full bit width
        // thus its size in pages is full_bit_width - size_of_last_prefix
        // Recurse on this notion
        enum size_t upperBits = fullBits - bitSizeOf!(Prefix[$-1]);
        alias LevelIndex = BitPacked!(typeof(Prefix[$-2](Key.init)), upperBits);
        alias idxTypes =
            AliasSeq!(idxTypes!(Key, upperBits, Prefix[0..$-1]), LevelIndex);
    }
    else
    {// the last level is value level, so no index once reduced to 1-level
        alias idxTypes = AliasSeq!();
    }
}
5770 
5771 //============================================================================
5772 
// Three-way comparison of property names `a` and `b` in which
// whitespace, '-' and '_' are skipped and characters are passed
// through std.ascii.toLower. Returns the result of `cmp`.
@safe pure int comparePropertyName(Char1, Char2)(const(Char1)[] a, const(Char2)[] b)
if (is(Char1 : dchar) && is(Char2 : dchar))
{
    import std.algorithm.comparison : cmp;
    import std.algorithm.iteration : map, filter;
    import std.ascii : toLower;
    // characters that take part in the comparison
    static bool significant(dchar c)
    {
        return !(c.isWhite || c == '-' || c == '_');
    }
    auto lhs = a.map!toLower.filter!significant;
    auto rhs = b.map!toLower.filter!significant;
    return cmp(lhs, rhs);
}
5784 
@safe pure unittest
{
    // '-' is skipped and case is ignored, so the two names compare equal
    assert(comparePropertyName("foo-bar", "fooBar") == 0);
}
5789 
// "less than" predicate on top of comparePropertyName
bool propertyNameLess(Char1, Char2)(const(Char1)[] a, const(Char2)[] b) @safe pure
if (is(Char1 : dchar) && is(Char2 : dchar))
{
    immutable order = comparePropertyName(a, b);
    return order < 0;
}
5795 
5796 //============================================================================
5797 // Utilities for compression of Unicode code point sets
5798 //============================================================================
5799 
// Appends `val` to `arr` in a variable length encoding:
//   val < 128   - 1 byte,  0xxxxxxx
//   val < 2^^13 - 2 bytes, 100xxxxx then the low 8 bits
//   otherwise   - 3 bytes, 101xxxxx then 2 bytes (asserts val < 2^^21)
@safe void compressTo(uint val, ref ubyte[] arr) pure nothrow
{
    // not optimized as usually done 1 time (and not public interface)
    enum ubyte twoByteTag = 0b1_00 << 5;
    enum ubyte threeByteTag = 0b1_01 << 5;

    if (val < 128)
    {
        arr ~= cast(ubyte) val;
        return;
    }
    if (val < (1 << 13))
    {
        arr ~= cast(ubyte)(twoByteTag | (val >> 8));
        arr ~= cast(ubyte) val;
        return;
    }
    assert(val < (1 << 21));
    arr ~= cast(ubyte)(threeByteTag | (val >> 16));
    arr ~= cast(ubyte)(val >> 8);
    arr ~= cast(ubyte) val;
}
5818 
// Reads one value written by compressTo from `arr` starting at `idx`
// and moves `idx` past it. Throws (via enforce) if the bytes that
// should follow the first one are missing.
@safe uint decompressFrom(const(ubyte)[] arr, ref size_t idx) pure
{
    import std.exception : enforce;
    immutable lead = arr[idx++];
    if ((lead & 0x80) == 0) // no top bit -> [0 .. 127]
        return lead;
    // bit 5 of the first byte tells 1 or 2 bytes to follow
    immutable tail = (lead & 0x20) ? 2 : 1;
    enforce(idx + tail <= arr.length, "bad code point interval encoding");
    uint val = lead & 0x1F;
    foreach (b; arr[idx .. idx + tail])
        val = (val << 8) | b;
    idx += tail;
    return val;
}
5833 
5834 
// Encodes `intervals` as a byte stream: every boundary is stored as
// its difference from the previous boundary, written with compressTo.
package(std) ubyte[] compressIntervals(Range)(Range intervals)
if (isInputRange!Range && isIntegralPair!(ElementType!Range))
{
    ubyte[] encoded;
    uint prev = 0;
    // RLE encode
    foreach (interval; intervals)
    {
        compressTo(interval[0]-prev, encoded);
        prev = interval[0];
        // till the end of the domain so don't store it
        if (interval[1] == lastDchar+1)
            continue;
        compressTo(interval[1]-prev, encoded);
        prev = interval[1];
    }
    return encoded;
}
5853 
// Checks the exact bytes produced by compressIntervals, decoding of single
// values with decompressFrom and a full round trip through decompressIntervals.
@safe pure unittest
{
    import std.algorithm.comparison : equal;
    import std.typecons : tuple;

    auto run = [tuple(80, 127), tuple(128, (1 << 10)+128)];
    // differences between boundaries: 80, 47, 1 and 1024 (the latter takes 2 bytes)
    ubyte[] enc = [cast(ubyte) 80, 47, 1, (0b1_00 << 5) | (1 << 2), 0];
    assert(compressIntervals(run) == enc);
    // the last interval ends at lastDchar+1, so its end is not stored
    auto run2 = [tuple(0, (1 << 20)+512+1), tuple((1 << 20)+512+4, lastDchar+1)];
    ubyte[] enc2 = [cast(ubyte) 0, (0b1_01 << 5) | (1 << 4), 2, 1, 3]; // odd length-ed
    assert(compressIntervals(run2) == enc2);
    size_t  idx = 0;
    assert(decompressFrom(enc, idx) == 80);
    assert(decompressFrom(enc, idx) == 47);
    assert(decompressFrom(enc, idx) == 1);
    assert(decompressFrom(enc, idx) == (1 << 10));
    idx = 0;
    assert(decompressFrom(enc2, idx) == 0);
    assert(decompressFrom(enc2, idx) == (1 << 20)+512+1);
    assert(equal(decompressIntervals(compressIntervals(run)), run));
    assert(equal(decompressIntervals(compressIntervals(run2)), run2));
}
5876 
// Wraps compressed `data` into a range of `CodepointInterval`
// that decodes the intervals lazily, one per popFront.
@safe package(std) auto decompressIntervals(const(ubyte)[] data) pure
{
    auto intervals = DecompressedIntervals(data);
    return intervals;
}
5882 
// Forward range of CodepointInterval decoded on the fly from a byte
// stream of boundary differences (each read with decompressFrom).
@safe struct DecompressedIntervals
{
pure:
    const(ubyte)[] _stream;   // compressed data
    size_t _idx;              // read position in _stream, size_t.max once exhausted
    CodepointInterval _front; // the most recently decoded interval

    this(const(ubyte)[] stream)
    {
        _stream = stream;
        // decode the first interval right away
        popFront();
    }

    @property bool empty() const
    {
        return _idx == size_t.max;
    }

    @property CodepointInterval front()
    {
        assert(!empty);
        return _front;
    }

    void popFront()
    {
        // nothing left to decode: mark the range as empty
        if (_idx == _stream.length)
        {
            _idx = size_t.max;
            return;
        }
        // the start is stored relative to the end of the previous interval
        immutable uint prevEnd = _front[1];
        _front[0] = prevEnd + decompressFrom(_stream, _idx);
        if (_idx != _stream.length)
        {
            // the end is stored relative to the start
            immutable uint start = _front[0];
            _front[1] = start + decompressFrom(_stream, _idx);
        }
        else // odd length ---> till the end
            _front[1] = lastDchar+1;
    }

    @property DecompressedIntervals save() { return this; }
}
5927 
// DecompressedIntervals has to satisfy the input and forward range concepts
@safe pure nothrow @nogc unittest
{
    static assert(isInputRange!DecompressedIntervals);
    static assert(isForwardRange!DecompressedIntervals);
}
5933 
5934 //============================================================================
5935 
5936 version (std_uni_bootstrap){}
5937 else
5938 {
5939 
// Helper for looking up code point sets: binary search for `name` among
// the names of `table` (treated as sorted by propertyNameLess).
// Returns the index of the matching entry or -1 if there is none.
ptrdiff_t findUnicodeSet(alias table, C)(const scope C[] name)
{
    import std.algorithm.iteration : map;
    import std.range : assumeSorted;
    auto names = assumeSorted!((a,b) => propertyNameLess(a,b))
        (table.map!"a.name"());
    // number of names ordered before `name` == position of the candidate
    size_t pos = names.lowerBound(name).length;
    if (pos >= names.length || comparePropertyName(names[pos], name) != 0)
        return -1;
    return pos;
}
5952 
// Looks `name` up in `table` and, if found, stores the set built from
// the entry's compressed data into `dest`. Returns whether it was found;
// `dest` is left alone otherwise.
bool loadUnicodeSet(alias table, Set, C)(const scope C[] name, ref Set dest)
{
    immutable pos = findUnicodeSet!table(name);
    if (pos < 0)
        return false;
    dest = Set(asSet(table[pos].compressed));
    return true;
}
5964 
// Loads the code point set of the property `name` into `target`.
// Names are compared with comparePropertyName, i.e. ignoring case,
// whitespace, '-' and '_'.
// Cumulative general categories (L, LC, M, N, P, S, Z, C) and the special
// names "graphical", "any" and "ascii" are assembled here by hand,
// everything else is looked up in the generated table.
// Returns: true if the property is known, false otherwise.
bool loadProperty(Set=CodepointSet, C)
    (const scope C[] name, ref Set target) pure
{
    import std.internal.unicode_tables : uniProps; // generated file
    alias ucmp = comparePropertyName;
    // conjure cumulative properties by hand
    if (ucmp(name, "L") == 0 || ucmp(name, "Letter") == 0)
    {
        target = asSet(uniProps.Lu);
        target |= asSet(uniProps.Ll);
        target |= asSet(uniProps.Lt);
        target |= asSet(uniProps.Lo);
        target |= asSet(uniProps.Lm);
    }
    else if (ucmp(name,"LC") == 0 || ucmp(name,"Cased Letter")==0)
    {
        target = asSet(uniProps.Ll);
        target |= asSet(uniProps.Lu);
        target |= asSet(uniProps.Lt);// Title case
    }
    else if (ucmp(name, "M") == 0 || ucmp(name, "Mark") == 0)
    {
        target = asSet(uniProps.Mn);
        target |= asSet(uniProps.Mc);
        target |= asSet(uniProps.Me);
    }
    else if (ucmp(name, "N") == 0 || ucmp(name, "Number") == 0)
    {
        target = asSet(uniProps.Nd);
        target |= asSet(uniProps.Nl);
        target |= asSet(uniProps.No);
    }
    else if (ucmp(name, "P") == 0 || ucmp(name, "Punctuation") == 0)
    {
        target = asSet(uniProps.Pc);
        target |= asSet(uniProps.Pd);
        target |= asSet(uniProps.Ps);
        target |= asSet(uniProps.Pe);
        target |= asSet(uniProps.Pi);
        target |= asSet(uniProps.Pf);
        target |= asSet(uniProps.Po);
    }
    else if (ucmp(name, "S") == 0 || ucmp(name, "Symbol") == 0)
    {
        target = asSet(uniProps.Sm);
        target |= asSet(uniProps.Sc);
        target |= asSet(uniProps.Sk);
        target |= asSet(uniProps.So);
    }
    else if (ucmp(name, "Z") == 0 || ucmp(name, "Separator") == 0)
    {
        target = asSet(uniProps.Zs);
        target |= asSet(uniProps.Zl);
        target |= asSet(uniProps.Zp);
    }
    else if (ucmp(name, "C") == 0 || ucmp(name, "Other") == 0)
    {
        // General_Category C is Cc | Cf | Cs | Co | Cn.
        // This used to unite Co, Lo, No, So and Po instead, i.e. the
        // "other" sub-category of every group, which is a different set.
        target = asSet(uniProps.Cc);
        target |= asSet(uniProps.Cf);
        target |= asSet(uniProps.Cs);
        target |= asSet(uniProps.Co);
        target |= asSet(uniProps.Cn);
    }
    else if (ucmp(name, "graphical") == 0)
    {
        target = asSet(uniProps.Alphabetic);

        target |= asSet(uniProps.Mn);
        target |= asSet(uniProps.Mc);
        target |= asSet(uniProps.Me);

        target |= asSet(uniProps.Nd);
        target |= asSet(uniProps.Nl);
        target |= asSet(uniProps.No);

        target |= asSet(uniProps.Pc);
        target |= asSet(uniProps.Pd);
        target |= asSet(uniProps.Ps);
        target |= asSet(uniProps.Pe);
        target |= asSet(uniProps.Pi);
        target |= asSet(uniProps.Pf);
        target |= asSet(uniProps.Po);

        target |= asSet(uniProps.Zs);

        target |= asSet(uniProps.Sm);
        target |= asSet(uniProps.Sc);
        target |= asSet(uniProps.Sk);
        target |= asSet(uniProps.So);
    }
    else if (ucmp(name, "any") == 0)
        target = Set.fromIntervals(0, 0x110000);
    else if (ucmp(name, "ascii") == 0)
        target = Set.fromIntervals(0, 0x80);
    else
        // not one of the hand-made names: consult the generated table
        return loadUnicodeSet!(uniProps.tab)(name, target);
    return true;
}
6063 
// CTFE-only helper for checking property names at compile-time.
// Returns true when `name` compares equal (via comparePropertyName) to one of
// the hand-assembled property names listed below.
// NOTE(review): "C"/"Other" are accepted by the run-time property loader
// earlier in this file but are not listed here - confirm this is intentional.
@safe bool isPrettyPropertyName(C)(const scope C[] name)
{
    auto knownNames = [
        "L", "Letter",
        "LC", "Cased Letter",
        "M", "Mark",
        "N", "Number",
        "P", "Punctuation",
        "S", "Symbol",
        "Z", "Separator",
        "Graphical",
        "any",
        "ascii"
    ];
    foreach (candidate; knownNames)
    {
        if (comparePropertyName(candidate, name) == 0)
            return true;
    }
    return false;
}
6083 
// CTFE-only, not optimized: true when findUnicodeSet reports a non-negative
// result for `name` in `table`.
@safe private static bool findSetName(alias table, C)(const scope C[] name)
{
    immutable position = findUnicodeSet!table(name);
    return position >= 0;
}
6089 
// Pair of lookups over a single set table: `opCall` checks the name at
// run-time and throws if it is unknown, `opDispatch` rejects unknown names at
// compile-time. `kind` is only used to build the error messages.
template SetSearcher(alias table, string kind)
{
    /// Run-time checked search.
    static auto opCall(C)(const scope C[] name)
        if (is(C : dchar))
    {
        import std.conv : to;
        CodepointSet found;
        if (!loadUnicodeSet!table(name, found))
        {
            throw new Exception("No unicode set for "~kind~" by name "
                ~name.to!string()~" was found.");
        }
        return found;
    }
    /// Compile-time checked search.
    static @property auto opDispatch(string name)()
    {
        static if (!findSetName!table(name))
        {
            static assert(false, "No unicode set for "~kind~" by name "
                ~name~" was found.");
        }
        else
        {
            // the name was already validated above, so the result flag is not needed
            CodepointSet found;
            loadUnicodeSet!table(name, found);
            return found;
        }
    }
}
6117 
6118 // Characters that need escaping in strings posed as regular expressions
6119 package(std) alias Escapables = AliasSeq!('[', ']', '\\', '^', '$', '.', '|', '?', ',', '-',
6120     ';', ':', '#', '&', '%', '/', '<', '>', '`',  '*', '+', '(', ')', '{', '}',  '~');
6121 
// Evaluates the mixed-in expression `expr`. During CTFE it is evaluated on
// every call; at run-time it is evaluated on the first call only and the
// stored result is returned afterwards.
package(std) CodepointSet memoizeExpr(string expr)()
{
    if (__ctfe)
        return mixin(expr);

    alias Cached = typeof(mixin(expr));
    static Cached cache;
    static bool ready;
    if (ready)
        return cache;

    cache = mixin(expr);
    ready = true;
    return cache;
}
6136 
// Set used for the \w character class: Alphabetic plus the Mn, Mc, Me, Nd and
// Pc sets, obtained through memoizeExpr.
package(std) @property CodepointSet wordCharacter() @safe
{
    enum wordClassExpr = "unicode.Alphabetic | unicode.Mn | unicode.Mc
        | unicode.Me | unicode.Nd | unicode.Pc";
    return memoizeExpr!wordClassExpr();
}
6143 
// Basic array-backed stack, kept generic in case it is needed somewhere other
// than the parser.
package(std) struct Stack(T)
{
@safe:
    T[] data;

    // true when nothing is stored
    @property bool empty()
    {
        return data.length == 0;
    }

    // number of stored elements
    @property size_t length()
    {
        return data.length;
    }

    // appends `val` as the new top element
    void push(T val)
    {
        data ~= val;
    }

    // removes the top element and returns it; the stack must not be empty
    @trusted T pop()
    {
        assert(!empty);
        immutable lastIdx = data.length - 1;
        auto popped = data[lastIdx];
        data = data[0 .. lastIdx];
        // outside CTFE, mark the shrunk slice as safe to append to in place
        if (!__ctfe)
            cast(void) data.assumeSafeAppend();
        return popped;
    }

    // reference to the top element; the stack must not be empty
    @property ref T top()
    {
        assert(!empty);
        return data[data.length - 1];
    }
}
6171 
// Parses exactly `maxDigit` hexadecimal digits from the front of `str` and
// returns the code point they denote.
//
// Params:
//     str = input range; every digit that was accepted is popped from it
//     maxDigit = number of hex digits to read
// Returns: the parsed value, guaranteed to be <= 0x10FFFF.
// Throws: Exception with the message
//     "incomplete escape sequence" if `str` runs out before `maxDigit` digits,
//     "invalid escape sequence" if a non-hex character is met,
//     "invalid codepoint" if the value exceeds 0x10FFFF.
package(std) dchar parseUniHex(Range)(ref Range str, size_t maxDigit)
{
    import std.exception : enforce;
    //std.conv.parse is both @system and bogus
    uint val;
    for (size_t k = 0; k < maxDigit; k++)
    {
        enforce(!str.empty, "incomplete escape sequence");
        //only ASCII hex digits are accepted
        immutable current = str.front;
        uint digit;
        if ('0' <= current && current <= '9')
            digit = current - '0';
        else if ('a' <= current && current <= 'f')
            digit = current - 'a' + 10;
        else if ('A' <= current && current <= 'F')
            digit = current - 'A' + 10;
        else
            throw new Exception("invalid escape sequence");
        // With more than 8 digits the accumulator could wrap around and end up
        // as a valid-looking value; reject instead. This can never trigger for
        // maxDigit <= 8, so behaviour there is unchanged.
        enforce(val <= uint.max / 16, "invalid codepoint");
        val = val * 16 + digit;
        str.popFront();
    }
    enforce(val <= 0x10FFFF, "invalid codepoint");
    return val;
}
6197 
@safe unittest
{
    import std.algorithm.searching : canFind;
    import std.exception : collectException;

    // any non-hex character is reported as an invalid escape sequence
    foreach (text; [ "000j", "000z", "FffG", "0Z"])
    {
        auto failure = collectException(parseUniHex(text, text.length));
        assert(failure.msg.canFind("invalid escape sequence"));
    }

    // well-formed input yields the expected numeric value
    string[] hex = [ "01", "ff", "00af", "10FFFF" ];
    int[] expected = [ 1, 0xFF, 0xAF, 0x10FFFF ];
    foreach (idx; 0 .. hex.length)
    {
        string text = hex[idx];
        assert(parseUniHex(text, text.length) == expected[idx]);
    }

    // a value above 0x10FFFF is rejected
    string over = "0011FFFF";
    auto tooBig = collectException(parseUniHex(over, over.length));
    assert(tooBig.msg.canFind("invalid codepoint"));
}
6214 
// Returns `set` extended with every result of simpleCaseFoldings for each
// code point of `set` that also belongs to unicode.LC.
auto caseEnclose(CodepointSet set)
{
    // computed up front, so the loop below iterates a separate set while `set` grows
    auto casedLetters = set & unicode.LC;
    foreach (dchar letter; casedLetters.byCodepoint)
        foreach (folded; simpleCaseFoldings(letter))
            set |= folded;
    return set;
}
6225 
/+
    Fetch the codepoint set corresponding to a name (InBlock or binary property).
    When `casefold` is set the result goes through caseEnclose; when `negated`
    is set the (possibly case-enclosed) set is inverted before being returned.
+/
CodepointSet getUnicodeSet(const scope char[] name, bool negated,  bool casefold) @safe
{
    CodepointSet result = unicode(name);
    //FIXME: caseEnclose for new uni as Set | CaseEnclose(SET && LC)
    if (casefold)
        result = caseEnclose(result);
    return negated ? result.inverted : result;
}
6239 
/*
    Parser for '[...]' character-class expressions, producing a CodepointSet.

    `range` is the input being consumed and `casefold_` requests that every
    literal character and character range is added together with its simple
    case foldings.

    parseSet is the entry point; parseCharTerm parses one run of characters,
    escapes and ranges and reports the set operator that follows it.
    Syntax errors are reported by throwing through `enforce`.
*/
struct UnicodeSetParser(Range)
{
    import std.exception : enforce;
    import std.typecons : tuple, Tuple;
    Range range;
    bool casefold_;

    // thin forwarding range interface over `range`
    @property bool empty(){ return range.empty; }
    @property dchar front(){ return range.front; }
    void popFront(){ range.popFront(); }

    //CodepointSet operations relatively in order of priority
    enum Operator:uint {
        Open = 0, Negate,  Difference, SymDifference, Intersection, Union, None
    }

    //parse unit of CodepointSet spec, most notably escape sequences and char ranges
    //also fetches next set operation
    //
    //Returns the parsed set together with the operator that terminated the
    //term (Operator.None when the term ended on ']').
    //Throws if the input ends inside the term or an escape/range is malformed.
    Tuple!(CodepointSet,Operator) parseCharTerm()
    {
        import std.range : drop;
        enum privateUseStart = '\U000F0000', privateUseEnd ='\U000FFFFD';
        enum State{ Start, Char, Escape, CharDash, CharDashEscape,
            PotentialTwinSymbolOperator }
        Operator op = Operator.None;
        dchar last;
        CodepointSet set;
        State state = State.Start;

        // report exhausted input instead of reading `front` of an empty range
        enforce(!empty, "unexpected end of CodepointSet");

        // adds a single character, honouring the casefold flag
        void addWithFlags(ref CodepointSet set, uint ch)
        {
            if (casefold_)
            {
                auto range = simpleCaseFoldings(ch);
                foreach (v; range)
                    set |= v;
            }
            else
                set |= ch;
        }

        // adds the inclusive range [lo, hi], honouring the casefold flag;
        // shared by escaped and unescaped range ends so both fold identically
        void addRangeWithFlags(ref CodepointSet dest, uint lo, uint hi)
        {
            enforce(lo <= hi, "inverted range");
            if (casefold_)
            {
                for (uint ch = lo; ch <= hi; ch++)
                    addWithFlags(dest, ch);
            }
            else
                dest.add(lo, hi + 1);
        }

        // maps a doubled operator symbol (||, --, ~~, &&) to its operator
        static Operator twinSymbolOperator(dchar symbol)
        {
            switch (symbol)
            {
            case '|':
                return Operator.Union;
            case '-':
                return Operator.Difference;
            case '~':
                return Operator.SymDifference;
            case '&':
                return Operator.Intersection;
            default:
                assert(false);
            }
        }

        L_CharTermLoop:
        for (;;)
        {
            final switch (state)
            {
            case State.Start:
                switch (front)
                {
                case '|':
                case '-':
                case '~':
                case '&':
                    state = State.PotentialTwinSymbolOperator;
                    last = front;
                    break;
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    break L_CharTermLoop;
                case '\\':
                    state = State.Escape;
                    break;
                default:
                    state = State.Char;
                    last = front;
                }
                break;
            case State.Char:
                // xxx last front xxx
                switch (front)
                {
                case '|':
                case '~':
                case '&':
                    // then last is treated as normal char and added as implicit union
                    state = State.PotentialTwinSymbolOperator;
                    addWithFlags(set, last);
                    last = front;
                    break;
                case '-': // still need more info
                    state = State.CharDash;
                    break;
                case '\\':
                    // go through addWithFlags so that a character followed by
                    // an escape is case-folded like any other character
                    addWithFlags(set, last);
                    state = State.Escape;
                    break;
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    addWithFlags(set, last);
                    break L_CharTermLoop;
                default:
                    state = State.Char;
                    addWithFlags(set, last);
                    last = front;
                }
                break;
            case State.PotentialTwinSymbolOperator:
                // xxx last front xxxx
                // where last = [|-&~]
                if (front == last)
                {
                    op = twinSymbolOperator(last);
                    popFront();//skip second twin char
                    break L_CharTermLoop;
                }
                goto case State.Char;
            case State.Escape:
                // xxx \ front xxx
                switch (front)
                {
                case 'f':
                    last = '\f';
                    state = State.Char;
                    break;
                case 'n':
                    last = '\n';
                    state = State.Char;
                    break;
                case 'r':
                    last = '\r';
                    state = State.Char;
                    break;
                case 't':
                    last = '\t';
                    state = State.Char;
                    break;
                case 'v':
                    last = '\v';
                    state = State.Char;
                    break;
                case 'c':
                    last = unicode.parseControlCode(this);
                    state = State.Char;
                    break;
                // one case label per escapable character, all sharing the body below
                foreach (val; Escapables)
                {
                case val:
                }
                    last = front;
                    state = State.Char;
                    break;
                case 'p':
                    set.add(unicode.parsePropertySpec(this, false, casefold_));
                    state = State.Start;
                    continue L_CharTermLoop; //next char already fetched
                case 'P':
                    set.add(unicode.parsePropertySpec(this, true, casefold_));
                    state = State.Start;
                    continue L_CharTermLoop; //next char already fetched
                case 'x':
                    popFront();
                    last = parseUniHex(this, 2);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'u':
                    popFront();
                    last = parseUniHex(this, 4);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'U':
                    popFront();
                    last = parseUniHex(this, 8);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'd':
                    set.add(unicode.Nd);
                    state = State.Start;
                    break;
                case 'D':
                    set.add(unicode.Nd.inverted);
                    state = State.Start;
                    break;
                case 's':
                    set.add(unicode.White_Space);
                    state = State.Start;
                    break;
                case 'S':
                    set.add(unicode.White_Space.inverted);
                    state = State.Start;
                    break;
                case 'w':
                    set.add(wordCharacter);
                    state = State.Start;
                    break;
                case 'W':
                    set.add(wordCharacter.inverted);
                    state = State.Start;
                    break;
                default:
                    if (front >= privateUseStart && front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                    enforce(false, "invalid escape sequence");
                }
                break;
            case State.CharDash:
                // xxx last - front xxx
                switch (front)
                {
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    //means dash is a single char not an interval specifier
                    addWithFlags(set, last);
                    addWithFlags(set, '-');
                    break L_CharTermLoop;
                case '-'://set Difference again
                    addWithFlags(set, last);
                    op = Operator.Difference;
                    popFront();//skip '-'
                    break L_CharTermLoop;
                case '\\':
                    state = State.CharDashEscape;
                    break;
                default:
                    addRangeWithFlags(set, last, front);
                    state = State.Start;
                }
                break;
            case State.CharDashEscape:
            //xxx last - \ front xxx
                uint end;
                switch (front)
                {
                case 'f':
                    end = '\f';
                    break;
                case 'n':
                    end = '\n';
                    break;
                case 'r':
                    end = '\r';
                    break;
                case 't':
                    end = '\t';
                    break;
                case 'v':
                    end = '\v';
                    break;
                foreach (val; Escapables)
                {
                case val:
                }
                    end = front;
                    break;
                case 'c':
                    end = unicode.parseControlCode(this);
                    break;
                case 'x':
                    popFront();
                    end = parseUniHex(this, 2);
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                case 'u':
                    popFront();
                    end = parseUniHex(this, 4);
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                case 'U':
                    popFront();
                    end = parseUniHex(this, 8);
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                default:
                    if (front >= privateUseStart && front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                    enforce(false, "invalid escape sequence");
                }
                // Lookahead to check if it's a \T
                // where T is sub-pattern terminator in multi-pattern scheme
                auto lookahead = range.save.drop(1);
                if (end == '\\' && !lookahead.empty)
                {
                    if (lookahead.front >= privateUseStart && lookahead.front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                }
                addRangeWithFlags(set, last, end);
                state = State.Start;
                break;
            }
            popFront();
            enforce(!empty, "unexpected end of CodepointSet");
        }
        return tuple(set, op);
    }

    alias ValStack = Stack!(CodepointSet);
    alias OpStack = Stack!(Operator);

    // Parses a complete '[...]' expression (nested sets and the set operators
    // '||', '--', '~~', '&&' included) starting at the current position of
    // `range` and returns the resulting set.
    // Throws on malformed or prematurely ending input.
    CodepointSet parseSet()
    {
        ValStack vstack;
        OpStack opstack;
        import std.functional : unaryFun;
        enforce(!empty, "unexpected end of input");
        enforce(front == '[', "expected '[' at the start of unicode set");
        // applies `op` to the operand(s) on top of `stack`;
        // returns false for operators that cannot be applied (Open, None)
        static bool apply(Operator op, ref ValStack stack)
        {
            switch (op)
            {
            case Operator.Negate:
                enforce(!stack.empty, "no operand for '^'");
                stack.top = stack.top.inverted;
                break;
            case Operator.Union:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '||'");
                stack.top.add(s);
                break;
            case Operator.Difference:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '--'");
                stack.top.sub(s);
                break;
            case Operator.SymDifference:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '~~'");
                stack.top ~= s;
                break;
            case Operator.Intersection:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '&&'");
                stack.top.intersect(s);
                break;
            default:
                return false;
            }
            return true;
        }
        // pops and applies operators while `cond` holds for the top one;
        // returns false on a syntax error or when the operator stack runs dry
        static bool unrollWhile(alias cond)(ref ValStack vstack, ref OpStack opstack)
        {
            while (cond(opstack.top))
            {
                if (!apply(opstack.pop(),vstack))
                    return false;//syntax error
                if (opstack.empty)
                    return false;
            }
            return true;
        }

        L_CharsetLoop:
        do
        {
            // the loop is re-entered while a '[' is still open; if the input
            // is exhausted at that point the set is unterminated
            enforce(!empty, "unexpected end of character class");
            switch (front)
            {
            case '[':
                opstack.push(Operator.Open);
                popFront();
                enforce(!empty, "unexpected end of character class");
                if (front == '^')
                {
                    opstack.push(Operator.Negate);
                    popFront();
                    enforce(!empty, "unexpected end of character class");
                }
                else if (front == ']') // []...] is special cased
                {
                    popFront();
                    enforce(!empty, "wrong character set");
                    auto pair = parseCharTerm();
                    pair[0].add(']', ']'+1);
                    if (pair[1] != Operator.None)
                    {
                        if (opstack.top == Operator.Union)
                            unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                        opstack.push(pair[1]);
                    }
                    vstack.push(pair[0]);
                }
                break;
            case ']':
                enforce(unrollWhile!(unaryFun!"a != a.Open")(vstack, opstack),
                    "character class syntax error");
                enforce(!opstack.empty, "unmatched ']'");
                opstack.pop();
                popFront();
                if (opstack.empty)
                    break L_CharsetLoop;
                // an enclosing '[' is still open, so more input is required
                enforce(!empty, "unexpected end of character class");
                auto pair  = parseCharTerm();
                if (!pair[0].empty)//not only operator e.g. -- or ~~
                {
                    vstack.top.add(pair[0]);//apply union
                }
                if (pair[1] != Operator.None)
                {
                    if (opstack.top == Operator.Union)
                        unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                    opstack.push(pair[1]);
                }
                break;
            //
            default://yet another pair of term(op)?
                auto pair = parseCharTerm();
                if (pair[1] != Operator.None)
                {
                    if (opstack.top == Operator.Union)
                        unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                    opstack.push(pair[1]);
                }
                vstack.push(pair[0]);
            }

        }while (!empty || !opstack.empty);
        while (!opstack.empty)
            apply(opstack.pop(),vstack);
        assert(vstack.length == 1);
        return vstack.top;
    }
}
6686 
6687 /**
6688     A single entry point to lookup Unicode $(CODEPOINT) sets by name or alias of
6689     a block, script or general category.
6690 
6691     It uses well defined standard rules of property name lookup.
6692     This includes fuzzy matching of names, so that
6693     'White_Space', 'white-SpAce' and 'whitespace' are all considered equal
6694     and yield the same set of white space $(CHARACTERS).
6695 */
6696 @safe public struct unicode
6697 {
6698     import std.exception : enforce;
6699     /**
6700         Performs the lookup of set of $(CODEPOINTS)
6701         with compile-time correctness checking.
6702         This short-cut version combines 3 searches:
6703         across blocks, scripts, and common binary properties.
6704 
6705         Note that since scripts and blocks overlap the
6706         usual trick to disambiguate is used - to get a block use
6707         `unicode.InBlockName`, to search a script
6708         use `unicode.ScriptName`.
6709 
6710         See_Also: $(LREF block), $(LREF script)
6711         and (not included in this search) $(LREF hangulSyllableType).
6712     */
6713 
6714     static @property auto opDispatch(string name)() pure
6715     {
6716         static if (findAny(name))
6717             return loadAny(name);
6718         else
6719             static assert(false, "No unicode set by name "~name~" was found.");
6720     }
6721 
6722     ///
6723     @safe unittest
6724     {
6725         import std.exception : collectException;
6726         auto ascii = unicode.ASCII;
6727         assert(ascii['A']);
6728         assert(ascii['~']);
6729         assert(!ascii['\u00e0']);
6730         // matching is case-insensitive
6731         assert(ascii == unicode.ascII);
6732         assert(!ascii['à']);
6733         // underscores, '-' and whitespace in names are ignored too
6734         auto latin = unicode.in_latin1_Supplement;
6735         assert(latin['à']);
6736         assert(!latin['$']);
6737         // BTW Latin 1 Supplement is a block, hence "In" prefix
6738         assert(latin == unicode("In Latin 1 Supplement"));
6739         // run-time look up throws if no such set is found
6740         assert(collectException(unicode("InCyrilliac")));
6741     }
6742 
6743     /**
6744         The same lookup across blocks, scripts, or binary properties,
6745         but performed at run-time.
6746         This version is provided for cases where `name`
6747         is not known beforehand; otherwise compile-time
6748         checked $(LREF opDispatch) is typically a better choice.
6749 
6750         See the $(S_LINK Unicode properties, table of properties) for available
6751         sets.
6752     */
6753     static auto opCall(C)(const scope C[] name)
6754         if (is(C : dchar))
6755     {
6756         return loadAny(name);
6757     }
6758 
6759     /**
6760         Narrows down the search for sets of $(CODEPOINTS) to all Unicode blocks.
6761 
6762         Note:
6763         Here block names are unambiguous as no scripts are searched
6764         and thus to search use simply `unicode.block.BlockName` notation.
6765 
6766         See $(S_LINK Unicode properties, table of properties) for available sets.
6767         See_Also: $(S_LINK Unicode properties, table of properties).
6768     */
6769     struct block
6770     {
6771         import std.internal.unicode_tables : blocks; // generated file
6772         mixin SetSearcher!(blocks.tab, "block");
6773     }
6774 
6775     ///
6776     @safe unittest
6777     {
6778         // use .block for explicitness
6779         assert(unicode.block.Greek_and_Coptic == unicode.InGreek_and_Coptic);
6780     }
6781 
6782     /**
6783         Narrows down the search for sets of $(CODEPOINTS) to all Unicode scripts.
6784 
6785         See the $(S_LINK Unicode properties, table of properties) for available
6786         sets.
6787     */
6788     struct script
6789     {
6790         import std.internal.unicode_tables : scripts; // generated file
6791         mixin SetSearcher!(scripts.tab, "script");
6792     }
6793 
6794     ///
6795     @safe unittest
6796     {
6797         auto arabicScript = unicode.script.arabic;
6798         auto arabicBlock = unicode.block.arabic;
6799         // there is an intersection between script and block
6800         assert(arabicBlock['؁']);
6801         assert(arabicScript['؁']);
6802         // but they are different
6803         assert(arabicBlock != arabicScript);
6804         assert(arabicBlock == unicode.inArabic);
6805         assert(arabicScript == unicode.arabic);
6806     }
6807 
6808     /**
6809         Fetch a set of $(CODEPOINTS) that have the given hangul syllable type.
6810 
6811         Other non-binary properties (once supported) follow the same
6812         notation - `unicode.propertyName.propertyValue` for compile-time
6813         checked access and `unicode.propertyName(propertyValue)`
6814         for run-time checked one.
6815 
6816         See the $(S_LINK Unicode properties, table of properties) for available
6817         sets.
6818     */
6819     struct hangulSyllableType
6820     {
6821         import std.internal.unicode_tables : hangul; // generated file
6822         mixin SetSearcher!(hangul.tab, "hangul syllable type");
6823     }
6824 
6825     ///
6826     @safe unittest
6827     {
6828         // L here is syllable type not Letter as in unicode.L short-cut
6829         auto leadingVowel = unicode.hangulSyllableType("L");
6830         // check that some leading vowels are present
6831         foreach (vowel; '\u1110'..'\u115F')
6832             assert(leadingVowel[vowel]);
6833         assert(leadingVowel == unicode.hangulSyllableType.L);
6834     }
6835 
6836     //parse control code of form \cXXX, c assumed to be the current symbol
6837     static package(std) dchar parseControlCode(Parser)(ref Parser p)
6838     {
6839         with(p)
6840         {
6841             popFront();
6842             enforce(!empty, "Unfinished escape sequence");
6843             enforce(('a' <= front && front <= 'z')
6844                 || ('A' <= front && front <= 'Z'),
6845             "Only letters are allowed after \\c");
6846             return front & 0x1f;
6847         }
6848     }
6849 
6850     //parse and return a CodepointSet for \p{...Property...} and \P{...Property..},
6851     //\ - assumed to be processed, p - is current
6852     static package(std) CodepointSet parsePropertySpec(Range)(ref Range p,
6853         bool negated, bool casefold)
6854     {
6855         static import std.ascii;
6856         with(p)
6857         {
6858             enum MAX_PROPERTY = 128;
6859             char[MAX_PROPERTY] result;
6860             uint k = 0;
6861             popFront();
6862             enforce(!empty, "eof parsing unicode property spec");
6863             if (front == '{')
6864             {
6865                 popFront();
6866                 while (k < MAX_PROPERTY && !empty && front !='}'
6867                     && front !=':')
6868                 {
6869                     if (front != '-' && front != ' ' && front != '_')
6870                         result[k++] = cast(char) std.ascii.toLower(front);
6871                     popFront();
6872                 }
6873                 enforce(k != MAX_PROPERTY, "invalid property name");
6874                 enforce(front == '}', "} expected ");
6875             }
6876             else
6877             {//single char properties e.g.: \pL, \pN ...
6878                 enforce(front < 0x80, "invalid property name");
6879                 result[k++] = cast(char) front;
6880             }
6881             auto s = getUnicodeSet(result[0 .. k], negated, casefold);
6882             enforce(!s.empty, "unrecognized unicode property spec");
6883             popFront();
6884             return s;
6885         }
6886     }
6887 
6888     /**
6889         Parse unicode codepoint set from given `range` using standard regex
6890         syntax '[...]'. The range is advanced skiping over regex set definition.
6891         `casefold` parameter determines if the set should be casefolded - that is
6892         include both lower and upper case versions for any letters in the set.
6893     */
6894     static CodepointSet parseSet(Range)(ref Range range, bool casefold=false)
6895     if (isInputRange!Range && is(ElementType!Range : dchar))
6896     {
6897         auto usParser = UnicodeSetParser!Range(range, casefold);
6898         auto set = usParser.parseSet();
6899         range = usParser.range;
6900         return set;
6901     }
6902 
6903     ///
6904     @safe unittest
6905     {
6906         import std.uni : unicode;
6907         string pat = "[a-zA-Z0-9]hello";
6908         auto set = unicode.parseSet(pat);
6909         // check some of the codepoints
6910         assert(set['a'] && set['A'] && set['9']);
6911         assert(pat == "hello");
6912     }
6913 
6914 private:
6915     alias ucmp = comparePropertyName;
6916 
6917     static bool findAny(string name)
6918     {
6919         import std.internal.unicode_tables : blocks, scripts, uniProps; // generated file
6920         return isPrettyPropertyName(name)
6921             || findSetName!(uniProps.tab)(name) || findSetName!(scripts.tab)(name)
6922             || (ucmp(name[0 .. 2],"In") == 0 && findSetName!(blocks.tab)(name[2..$]));
6923     }
6924 
6925     static auto loadAny(Set=CodepointSet, C)(const scope C[] name) pure
6926     {
6927         import std.conv : to;
6928         import std.internal.unicode_tables : blocks, scripts; // generated file
6929         Set set;
6930         immutable loaded = loadProperty(name, set) || loadUnicodeSet!(scripts.tab)(name, set)
6931             || (name.length > 2 && ucmp(name[0 .. 2],"In") == 0
6932                 && loadUnicodeSet!(blocks.tab)(name[2..$], set));
6933         if (loaded)
6934             return set;
6935         throw new Exception("No unicode set by name "~name.to!string()~" was found.");
6936     }
6937 
6938     // FIXME: re-disable once the compiler is fixed
6939     // Disabled to prevent the mistake of creating instances of this pseudo-struct.
6940     //@disable ~this();
6941 }
6942 
@safe unittest
{
    import std.internal.unicode_tables : blocks, uniProps; // generated file
    // block lookup through the "In" prefix
    auto hebrew = asSet(blocks.Hebrew);
    assert(unicode("InHebrew") == hebrew);
    // "separator" is the union of the three separator categories
    auto separators = asSet(uniProps.Zs) | asSet(uniProps.Zl) | asSet(uniProps.Zp);
    assert(unicode("separator") == separators);
    // a dash inside the name does not matter
    auto kharoshthi = asSet(blocks.Kharoshthi);
    assert(unicode("In-Kharoshthi") == kharoshthi);
}
6950 
// Marks an empty slot of a case trie (the value gen_uni uses internally)
enum ushort EMPTY_CASE_TRIE = ushort.max;

// Case labels for the control code points with '\r' left out;
// intended to be mixed into a switch statement
enum string controlSwitch = `
    case '\u0000':..case '\u0008':case '\u000E':..case '\u001F':case '\u007F':..
    case '\u0084':case '\u0086':..case '\u009F': case '\u0009':..case '\u000C': case '\u0085':
`;
6958 // TODO: redo the most of hangul stuff algorithmically in case of Graphemes too
6959 // kill unrolled switches
6960 
// True if `ch` lies in the regional indicator range U+1F1E6 .. U+1F1FF (inclusive).
private static bool isRegionalIndicator(dchar ch) @safe pure @nogc nothrow
{
    enum dchar first = '\U0001F1E6';
    enum dchar last = '\U0001F1FF';
    return first <= ch && ch <= last;
}
6965 
/*
    Shared implementation of grapheme cluster decoding.

    `genericDecodeGrapheme!true` collects the consumed code points into a
    `Grapheme` and returns it; `genericDecodeGrapheme!false` only advances
    `range` past one cluster and returns nothing.

    `range` must not be empty.
*/
template genericDecodeGrapheme(bool getValue)
{
    alias graphemeExtend = graphemeExtendTrie;
    alias spacingMark = mcTrie;
    static if (getValue)
        alias Value = Grapheme;
    else
        alias Value = void;

    Value genericDecodeGrapheme(Input)(ref Input range)
    {
        import std.internal.unicode_tables : isHangL, isHangT, isHangV; // generated file
        // Describes what the cluster consists of so far
        enum GraphemeState {
            Start,  // nothing consumed yet
            CR,     // a carriage return
            RI,     // regional indicators
            L,      // hangul leading jamo
            V,      // hangul vowel jamo or an LV syllable
            LVT     // hangul trailing jamo or an LVT syllable
        }
        static if (getValue)
            Grapheme grapheme;
        auto state = GraphemeState.Start;
        // Consumes the current code point `ch`, recording it if a value is requested
        enum eat = q{
            static if (getValue)
                grapheme ~= ch;
            range.popFront();
        };

        dchar ch;
        assert(!range.empty, "Attempting to decode grapheme from an empty " ~ Input.stringof);
        while (!range.empty)
        {
            ch = range.front;
            final switch (state) with(GraphemeState)
            {
            case Start:
                // the first code point always belongs to the cluster
                mixin(eat);
                if (ch == '\r')
                    state = CR;
                else if (isRegionalIndicator(ch))
                    state = RI;
                else if (isHangL(ch))
                    state = L;
                else if (hangLV[ch] || isHangV(ch))
                    state = V;
                else if (hangLVT[ch])
                    state = LVT;
                else if (isHangT(ch))
                    state = LVT;
                else
                {
                    switch (ch)
                    {
                    // a control code point is a cluster of its own
                    mixin(controlSwitch);
                        goto L_End;
                    default:
                        goto L_End_Extend;
                    }
                }
            break;
            case CR:
                // CR joins only with a directly following LF. Like the other
                // controls, CR and CRLF never take trailing extend or spacing
                // marks, so skip the extend loop.
                if (ch == '\n')
                    mixin(eat);
                goto L_End;
            case RI:
                if (isRegionalIndicator(ch))
                    mixin(eat);
                else
                    goto L_End_Extend;
            break;
            case L:
                if (isHangL(ch))
                    mixin(eat);
                else if (isHangV(ch) || hangLV[ch])
                {
                    state = V;
                    mixin(eat);
                }
                else if (hangLVT[ch])
                {
                    state = LVT;
                    mixin(eat);
                }
                else
                    goto L_End_Extend;
            break;
            case V:
                if (isHangV(ch))
                    mixin(eat);
                else if (isHangT(ch))
                {
                    state = LVT;
                    mixin(eat);
                }
                else
                    goto L_End_Extend;
            break;
            case LVT:
                if (isHangT(ch))
                {
                    mixin(eat);
                }
                else
                    goto L_End_Extend;
            break;
            }
        }
    L_End_Extend:
        while (!range.empty)
        {
            ch = range.front;
            // extend & spacing marks
            if (!graphemeExtend[ch] && !spacingMark[ch])
                break;
            mixin(eat);
        }
    L_End:
        static if (getValue)
            return grapheme;
    }

}
7089 
7090 public: // Public API continues
7091 
/++
    Computes the length of the grapheme cluster that starts at `index`.
    The resulting length and `index` are both measured
    in $(S_LINK Code unit, code units).

    Params:
        C = character type of `input`; it has to be implicitly convertible to `dchar`
        input = array of code units that contains the cluster
        index = starting index into `input[]`

    Returns:
        length of the grapheme cluster in code units
+/
size_t graphemeStride(C)(const scope C[] input, size_t index) @safe pure
if (is(C : dchar))
{
    auto rest = input[index .. $];
    immutable before = rest.length;
    // advances `rest` past exactly one cluster
    genericDecodeGrapheme!false(rest);
    return before - rest.length;
}
7113 
///
@safe unittest
{
    assert(graphemeStride("  ", 1) == 1);
    // 'A' followed by a combining ring above
    string town = "A\u030Arhus";
    immutable stride = graphemeStride(town, 0);
    assert(stride == 3); // 'A' is 1 UTF-8 code unit, \u030A is 2
    assert(town[0 .. stride] == "A\u030A");
    assert(town[stride .. $] == "rhus");
}
7125 
@safe unittest
{
    // graphemeStride has to be usable from CTFE.
    enum plain = graphemeStride("A", 0);
    static assert(plain == 1);

    enum accented = graphemeStride("A\u0301", 0);
    static assert(accented == 3); // \u0301 takes 2 UTF-8 code units
}
7135 
/++
    Reads one complete grapheme cluster from `inp`, an
    $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of dchar.

    See $(LREF Grapheme) below for examples.

    Note:
    `inp` is modified by this function and therefore
    has to be an L-value.
+/
Grapheme decodeGrapheme(Input)(ref Input inp)
if (isInputRange!Input && is(immutable ElementType!Input == immutable dchar))
{
    // the value-returning flavour of the shared decoder
    alias decode = genericDecodeGrapheme!true;
    return decode(inp);
}
7151 
@system unittest
{
    import std.algorithm.comparison : equal;

    // a lone space, then a space carrying a combining diaeresis
    string input = " \u0020\u0308 ";
    Grapheme cluster = decodeGrapheme(input);
    assert(cluster.length == 1);
    assert(cluster[0] == ' ');
    cluster = decodeGrapheme(input);
    assert(cluster.length == 2);
    assert(equal(cluster[0 .. 2], " \u0308"));

    // combining marks at the start, then a hangul leading jamo
    input = "\u0300\u0308\u1100";
    assert(equal(decodeGrapheme(input)[], "\u0300\u0308"));
    assert(equal(decodeGrapheme(input)[], "\u1100"));

    // hangul trailing jamo with a mark, then a hangul syllable
    input = "\u11A8\u0308\uAC01";
    assert(equal(decodeGrapheme(input)[], "\u11A8\u0308"));
    assert(equal(decodeGrapheme(input)[], "\uAC01"));
}
7169 
/++
    $(P Iterate a string one $(LREF Grapheme) at a time.)

    $(P Handy for string manipulation that has to respect
    grapheme boundaries.)

    See_Also:
        $(LREF byCodePoint)
+/
auto byGrapheme(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar))
{
    // TODO: Bidirectional access
    static struct Result(R)
    {
        private R _range;
        private Grapheme _front;

        // an empty cached grapheme marks the end of the range
        @property bool empty()
        {
            return _front.length == 0;
        }

        @property Grapheme front()
        {
            return _front;
        }

        // decodes the next cluster into the cache, or clears the cache
        // once the underlying range is exhausted
        void popFront()
        {
            if (_range.empty)
                _front = Grapheme.init;
            else
                _front = _range.decodeGrapheme();
        }

        static if (isForwardRange!R)
        {
            @property Result save()
            {
                return Result(_range.save, _front);
            }
        }
    }

    auto graphemes = Result!(Range)(range);
    // prime the cache with the first cluster
    graphemes.popFront();
    return graphemes;
}
7216 
///
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range.primitives : walkLength;
    import std.range : take, drop;
    auto word = "noe\u0308l"; // noël spelled as e + combining diaeresis
    assert(word.walkLength == 5); // 5 code points

    auto clusters = word.byGrapheme;
    assert(clusters.walkLength == 4); // 4 graphemes

    assert(clusters.take(3).equal("noe\u0308".byGrapheme));
    assert(clusters.drop(3).equal("l".byGrapheme));
}
7232 
// A minimal input range over a string that offers no `save`;
// used to test with input ranges that are not forward ranges
version (StdUnittest)
private static struct InputRangeString
{
    private string s;

    @property bool empty() { return s.empty; }
    @property dchar front() { return s.front; }
    void popFront() { s.popFront(); }
}
7243 
@system unittest
{
    import std.algorithm.comparison : equal;
    import std.array : array;
    import std.range : retro;
    import std.range.primitives : walkLength;
    // an empty string has no graphemes
    assert("".byGrapheme.walkLength == 0);

    auto backwards = "le\u0308on";
    assert(backwards.walkLength == 5);

    auto backwardsClusters = backwards.byGrapheme;
    assert(backwardsClusters.walkLength == 4);

    // the same word in all three UTF encodings
    static foreach (sample; AliasSeq!("noe\u0308l"c, "noe\u0308l"w, "noe\u0308l"d))
    {{
        assert(sample.walkLength == 5);
        static assert(isForwardRange!(typeof(sample)));

        auto clusters = sample.byGrapheme;
        static assert(isForwardRange!(typeof(clusters)));
        assert(clusters.walkLength == 4);
        // reversing by grapheme keeps the diaeresis attached to its base
        assert(equal(retro(array(clusters)), backwardsClusters));
    }}

    // a pure input range yields a pure input range
    auto inputOnly = InputRangeString("noe\u0308l").byGrapheme;
    static assert(!isForwardRange!(typeof(inputOnly)));
    assert(inputOnly.walkLength == 4);
}
7273 
/++
    $(P Lazily flatten a range of $(LREF Grapheme)s into a range of code points.)

    $(P Handy for turning the result back into a string after working
    on graphemes.)

    $(P If passed in a range of code points, returns a range with equivalent capabilities.)
+/
auto byCodePoint(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable Grapheme))
{
    // TODO: Propagate bidirectional access
    static struct Result
    {
        private Range _range;
        // index of the current code point within `_range.front`
        private size_t i = 0;

        @property bool empty()
        {
            return _range.empty;
        }

        @property dchar front()
        {
            return _range.front[i];
        }

        void popFront()
        {
            // stay within the current grapheme while code points remain
            if (++i < _range.front.length)
                return;

            // otherwise move on to the start of the next grapheme
            _range.popFront();
            i = 0;
        }

        static if (isForwardRange!Range)
        {
            @property Result save()
            {
                return Result(_range.save, i);
            }
        }
    }

    return Result(range);
}
7323 
/// Ditto
auto byCodePoint(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar))
{
    import std.range.primitives : isBidirectionalRange, popBack;
    import std.traits : isNarrowString;
    static if (!isNarrowString!Range)
    {
        // already a range of code points: hand it back unchanged
        return range;
    }
    else
    {
        // Wrapper that forwards the bidirectional range primitives
        // to the narrow string it holds
        static struct Result
        {
            private Range _range;

            @property bool empty() { return _range.empty; }

            @property dchar front() { return _range.front; }
            void popFront() { _range.popFront; }

            @property dchar back() { return _range.back; }
            void popBack() { _range.popBack; }

            @property auto save() { return Result(_range.save); }
        }
        static assert(isBidirectionalRange!(Result));
        return Result(range);
    }
}
7348 
///
@safe unittest
{
    import std.array : array;
    import std.conv : text;
    import std.range : retro;

    string word = "noe\u0308l"; // noël

    // reverse it grapheme by grapheme, then flatten the result into a string
    auto reversedClusters = word.byGrapheme.array.retro;
    string reversed = reversedClusters.byCodePoint.text;

    assert(reversed == "le\u0308on"); // lëon
}
7367 
@system unittest
{
    import std.algorithm.comparison : equal;
    import std.range.primitives : walkLength;
    import std.range : retro;
    assert("".byGrapheme.byCodePoint.equal(""));

    string word = "noe\u0308l";
    // a narrow string has no meaningful length in code points
    static assert(!__traits(compiles, "noe\u0308l".byCodePoint.length));

    // an input-only range stays input-only through both adapters
    auto clusters = InputRangeString(word).byGrapheme;
    static assert(!isForwardRange!(typeof(clusters)));

    auto flattened = clusters.byCodePoint;
    static assert(!isForwardRange!(typeof(flattened)));

    assert(flattened.walkLength == word.walkLength);

    // a plain string yields a forward (and bidirectional) range
    auto codePoints = word.byCodePoint;
    static assert(isForwardRange!(typeof(codePoints)));
    assert(equal(codePoints, word));
    assert(equal(retro(codePoints.save), retro(word.save)));
    // Check that we still have length for dstring
    assert("абвгд"d.byCodePoint.length == 5);
}
7393 
/++
    $(P A structure designed to effectively pack $(CHARACTERS)
    of a $(CLUSTER).
    )

    $(P `Grapheme` has value semantics so 2 copies of a `Grapheme`
    always refer to distinct objects. In most actual scenarios a `Grapheme`
    fits on the stack and avoids memory allocation overhead for all but quite
    long clusters.
    )

    See_Also: $(LREF decodeGrapheme), $(LREF graphemeStride)
+/
@safe struct Grapheme
{
    import std.exception : enforce;
    import std.traits : isDynamicArray;

public:
    /// Constructs a grapheme from the given $(CHARACTERS).
    this(C)(const scope C[] chars...)
        if (is(C : dchar))
    {
        this ~= chars;
    }

    ///ditto
    this(Input)(Input seq)
        if (!isDynamicArray!Input
            && isInputRange!Input && is(ElementType!Input : dchar))
    {
        this ~= seq;
    }

    /// Gets a $(CODEPOINT) at the given index in this cluster.
    dchar opIndex(size_t index) const @nogc nothrow pure @trusted
    {
        assert(index < length);
        return read24(isBig ? ptr_ : small_.ptr, index);
    }

    /++
        Writes a $(CODEPOINT) `ch` at given index in this cluster.

        Warning:
        Use of this facility may invalidate grapheme cluster,
        see also $(LREF Grapheme.valid).
    +/
    void opIndexAssign(dchar ch, size_t index) @nogc nothrow pure @trusted
    {
        assert(index < length);
        write24(isBig ? ptr_ : small_.ptr, ch, index);
    }

    ///
    @safe unittest
    {
        auto g = Grapheme("A\u0302");
        assert(g[0] == 'A');
        assert(g.valid);
        g[1] = '~'; // ASCII tilde is not a combining mark
        assert(g[1] == '~');
        assert(!g.valid);
    }

    /++
        Random-access range over Grapheme's $(CHARACTERS).

        Warning: Invalidates when this Grapheme leaves the scope,
        attempts to use it then would lead to memory corruption.
    +/
    SliceOverIndexed!Grapheme opSlice(size_t a, size_t b) @nogc nothrow pure return
    {
        return sliceOverIndexed(a, b, &this);
    }

    /// ditto
    SliceOverIndexed!Grapheme opSlice() @nogc nothrow pure return
    {
        return sliceOverIndexed(0, length, &this);
    }

    /// Grapheme cluster length in $(CODEPOINTS).
    @property size_t length() const @nogc nothrow pure
    {
        return isBig ? len_ : smallLength;
    }

    /++
        Append $(CHARACTER) `ch` to this grapheme.
        Warning:
        Use of this facility may invalidate grapheme cluster,
        see also `valid`.

        See_Also: $(LREF Grapheme.valid)
    +/
    ref opOpAssign(string op)(dchar ch) @trusted
    {
        static if (op == "~")
        {
            import std.internal.memory : enforceRealloc;
            if (!isBig)
            {
                if (smallLength == small_cap)
                    convertToBig();// & fallthrough to "big" branch
                else
                {
                    // still room in the in-place buffer
                    write24(small_.ptr, ch, smallLength);
                    slen_++;
                    return this;
                }
            }

            assert(isBig);
            if (len_ == cap_)
            {
                // heap buffer is full: grow it by `grow` code points
                import core.checkedint : addu, mulu;
                bool overflow;
                cap_ = addu(cap_, grow, overflow);
                // 3 bytes per code point, one spare element beyond cap_
                auto nelems = mulu(3, addu(cap_, 1, overflow), overflow);
                if (overflow) assert(0);
                ptr_ = cast(ubyte*) enforceRealloc(ptr_, nelems);
            }
            write24(ptr_, ch, len_++);
            return this;
        }
        else
            static assert(false, "No operation "~op~" defined for Grapheme");
    }

    ///
    @system unittest
    {
        import std.algorithm.comparison : equal;
        auto g = Grapheme("A");
        assert(g.valid);
        g ~= '\u0301';
        assert(g[].equal("A\u0301"));
        assert(g.valid);
        g ~= "B";
        // not a valid grapheme cluster anymore
        assert(!g.valid);
        // still could be useful though
        assert(g[].equal("A\u0301B"));
    }

    /// Append all $(CHARACTERS) from the input range `inp` to this Grapheme.
    ref opOpAssign(string op, Input)(scope Input inp)
        if (isInputRange!Input && is(ElementType!Input : dchar))
    {
        static if (op == "~")
        {
            foreach (dchar ch; inp)
                this ~= ch;
            return this;
        }
        else
            static assert(false, "No operation "~op~" defined for Grapheme");
    }

    /++
        True if this object contains valid extended grapheme cluster.
        Decoding primitives of this module always return a valid `Grapheme`.

        Appending to and direct manipulation of grapheme's $(CHARACTERS) may
        render it no longer valid. Certain applications may choose to use
        Grapheme as a "small string" of any $(CODEPOINTS) and ignore this property
        entirely.
    +/
    @property bool valid()() /*const*/
    {
        // valid means: decoding one cluster consumes all of the content
        auto r = this[];
        genericDecodeGrapheme!false(r);
        return r.length == 0;
    }

    // Postblit: a copy in the heap-backed form gets its own buffer
    this(this) @nogc nothrow pure @trusted
    {
        import std.internal.memory : enforceMalloc;
        if (isBig)
        {// dup it
            import core.checkedint : addu, mulu;
            bool overflow;
            auto raw_cap = mulu(3, addu(cap_, 1, overflow), overflow);
            if (overflow) assert(0);

            auto p = cast(ubyte*) enforceMalloc(raw_cap);
            p[0 .. raw_cap] = ptr_[0 .. raw_cap];
            ptr_ = p;
        }
    }

    // Releases the heap buffer, if there is one
    ~this() @nogc nothrow pure @trusted
    {
        import core.memory : pureFree;
        if (isBig)
        {
            pureFree(ptr_);
        }
    }


private:
    // size of the in-place buffer: the whole struct minus the length byte
    enum small_bytes = ((ubyte*).sizeof+3*size_t.sizeof-1);
    // "out of the blue" grow rate, needs testing
    // (though graphemes are typically small < 9)
    enum grow = 20;
    // number of 3-byte code points that fit into the in-place buffer
    enum small_cap = small_bytes/3;
    // slen_ keeps the "big" marker in its top bit and the small length below it
    enum small_flag = 0x80, small_mask = 0x7F;
    // 16 bytes in 32bits, should be enough for the majority of cases
    union
    {
        // heap-backed ("big") form
        struct
        {
            ubyte* ptr_;
            size_t cap_;
            size_t len_;
            size_t padding_;
        }
        // in-place ("small") form
        struct
        {
            ubyte[small_bytes] small_;
            ubyte slen_;
        }
    }

    // Moves the content from the in-place buffer to a freshly allocated
    // heap buffer with room for `grow` code points and marks this object as big
    void convertToBig() @nogc nothrow pure @trusted
    {
        import std.internal.memory : enforceMalloc;
        static assert(grow.max / 3 - 1 >= grow);
        enum nbytes = 3 * (grow + 1);
        // capture the length first: the fields written below overlap small_
        immutable size_t k = smallLength;
        ubyte* p = cast(ubyte*) enforceMalloc(nbytes);
        for (size_t i = 0; i < k; i++)
            write24(p, read24(small_.ptr, i), i);
        // now we can overwrite small array data
        ptr_ = p;
        len_ = k;
        assert(grow > len_);
        cap_ = grow;
        setBig();
    }

    void setBig() @nogc nothrow pure { slen_ |= small_flag; }

    // Length in the small form (slen_ without the "big" marker bit)
    @property size_t smallLength() const @nogc nothrow pure
    {
        return slen_ & small_mask;
    }
    // Non-zero if the content lives in the heap buffer
    @property ubyte isBig() const @nogc nothrow pure
    {
        return slen_ & small_flag;
    }
}
7648 
7649 static assert(Grapheme.sizeof == size_t.sizeof*4);
7650 
7651 
@system pure /*nothrow @nogc*/ unittest // TODO: string .front is GC and throw
{
    import std.algorithm.comparison : equal;
    // each letter is a grapheme of its own
    Grapheme[3] expected = [Grapheme("Ю"), Grapheme("У"), Grapheme("З")];
    assert(equal(byGrapheme("ЮУЗ"), expected[]));
}
7658 
///
@system unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.iteration : filter;
    import std.range : isRandomAccessRange;

    string word = "ku\u0308hn";

    // decodeGrapheme takes its argument by ref and advances it
    auto head = decodeGrapheme(word);

    assert(head.length == 1);
    assert(head[0] == 'k');

    // the following grapheme consists of 2 characters
    auto umlaut = decodeGrapheme(word);
    // a slice of a grapheme is a random-access range of dchar
    assert(umlaut[].equal("u\u0308"));
    assert(umlaut.length == 2);
    static assert(isRandomAccessRange!(typeof(umlaut[])));

    // so the usual range algorithms apply
    assert(umlaut[].filter!isMark().equal("\u0308"));

    auto cluster = Grapheme("A");
    assert(cluster.valid);
    cluster ~= '\u0301';
    assert(cluster[].equal("A\u0301"));
    assert(cluster.valid);
    cluster ~= "B";
    // appending a second base character breaks the cluster
    assert(!cluster.valid);
    // the content remains accessible nevertheless
    assert(cluster[].equal("A\u0301B"));
}
7695 
@safe unittest
{
    auto cluster = Grapheme("A\u0302");
    assert(cluster[0] == 'A');
    assert(cluster.valid);
    // overwrite the combining mark with ASCII tilde, which is not a mark
    cluster[1] = '~';
    assert(cluster[1] == '~');
    assert(!cluster.valid);
}
7705 
@system unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.iteration : map;
    import std.conv : text;
    import std.range : iota;

    // these are not valid clusters, the test is only about storage
    auto g  = Grapheme('a', 'b', 'c', 'd', 'e');
    assert(g[0] == 'a');
    assert(g[1] == 'b');
    assert(g[2] == 'c');
    assert(g[3] == 'd');
    assert(g[4] == 'e');
    // overwriting one element leaves its neighbours alone
    g[3] = 'Й';
    assert(g[2] == 'c');
    assert(g[3] == 'Й', text(g[3], " vs ", 'Й'));
    assert(g[4] == 'e');
    assert(!g.valid);

    // appending keeps the existing content
    g ~= 'ц';
    g ~= '~';
    assert(g[0] == 'a');
    assert(g[1] == 'b');
    assert(g[2] == 'c');
    assert(g[3] == 'Й');
    assert(g[4] == 'e');
    assert(g[5] == 'ц');
    assert(g[6] == '~');
    assert(!g.valid);

    // a copy is independent of the original
    Grapheme dup = g;
    dup[0] = 'X';
    dup[1] = '-';
    assert(g[0] == 'a');
    assert(dup[0] == 'X');
    assert(g[1] == 'b');
    assert(dup[1] == '-');
    assert(equal(g[2 .. g.length], dup[2 .. dup.length]));

    // assignment of longer content, followed by further appends
    dup = Grapheme("АБВГДЕЁЖЗИКЛМ");
    assert(equal(dup[0 .. 8], "АБВГДЕЁЖ"), text(dup[0 .. 8]));
    dup ~= "xyz";
    assert(equal(dup[13 .. 15], "xy"), text(dup[13 .. 15]));
    assert(!dup.valid);

    // build up the whole upper case alphabet one letter at a time
    Grapheme alphabet;
    foreach (dchar letter; iota(cast(int)'A', cast(int)'Z'+1).map!"cast(dchar)a"())
        alphabet ~= letter;
    assert(equal(alphabet[], iota(cast(int)'A', cast(int)'Z'+1)));
}
7754 
7755 /++
7756     $(P Does basic case-insensitive comparison of `r1` and `r2`.
7757     This function uses simpler comparison rule thus achieving better performance
7758     than $(LREF icmp). However keep in mind the warning below.)
7759 
7760     Params:
7761         r1 = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of characters
7762         r2 = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of characters
7763 
7764     Returns:
7765         An `int` that is 0 if the strings match,
7766         &lt;0 if `r1` is lexicographically "less" than `r2`,
7767         &gt;0 if `r1` is lexicographically "greater" than `r2`
7768 
7769     Warning:
7770     This function only handles 1:1 $(CODEPOINT) mapping
7771     and thus is not sufficient for certain alphabets
7772     like German, Greek and few others.
7773 
7774     See_Also:
7775         $(LREF icmp)
7776         $(REF cmp, std,algorithm,comparison)
7777 +/
int sicmp(S1, S2)(scope S1 r1, scope S2 r2)
if (isInputRange!S1 && isSomeChar!(ElementEncodingType!S1)
    && isInputRange!S2 && isSomeChar!(ElementEncodingType!S2))
{
    import std.internal.unicode_tables : sTable = simpleCaseTable; // generated file
    import std.range.primitives : isInfinite;
    import std.utf : decodeFront;
    import std.traits : isDynamicArray;
    import std.typecons : Yes;
    static import std.ascii;

    // The fast path is only enabled when both ranges can be indexed and
    // re-sliced with a `size_t` offset. The probe below is compiled but never
    // executed; it mirrors the identical probe used by `icmp`.
    static if ((isDynamicArray!S1 || isRandomAccessRange!S1)
        && (isDynamicArray!S2 || isRandomAccessRange!S2)
        && !(isInfinite!S1 && isInfinite!S2)
        && __traits(compiles,
            {
                size_t s = size_t.max / 2;
                r1 = r1[s .. $];
                r2 = r2[s .. $];
            }))
    {{
        // ASCII optimization for dynamic arrays & similar.
        size_t i = 0;
        // Only the finite side bounds the scan when one range is infinite.
        static if (isInfinite!S1)
            immutable end = r2.length;
        else static if (isInfinite!S2)
            immutable end = r1.length;
        else
            immutable end = r1.length > r2.length ? r2.length : r1.length;
        for (; i < end; ++i)
        {
            auto lhs = r1[i];
            auto rhs = r2[i];
            // Any non-ASCII code unit ends the fast path; both ranges are
            // still in step because every unit so far was a whole code point.
            if ((lhs | rhs) >= 0x80) goto NonAsciiPath;
            if (lhs == rhs) continue;
            auto lowDiff = std.ascii.toLower(lhs) - std.ascii.toLower(rhs);
            if (lowDiff) return lowDiff;
        }
        // The common prefix matched: the shorter range orders first.
        static if (isInfinite!S1)
            return 1;
        else static if (isInfinite!S2)
            return -1;
        else
            return (r1.length > r2.length) - (r2.length > r1.length);

    NonAsciiPath:
        // Drop the already-compared ASCII prefix before decoding.
        r1 = r1[i .. $];
        r2 = r2[i .. $];
        // Fall through to standard case.
    }}

    while (!r1.empty)
    {
        immutable lhs = decodeFront!(Yes.useReplacementDchar)(r1);
        if (r2.empty)
            return 1; // r2 is a proper prefix of r1
        immutable rhs = decodeFront!(Yes.useReplacementDchar)(r2);
        int diff = lhs - rhs;
        if (!diff)
            continue;
        if ((lhs | rhs) < 0x80)
        {
            // both are ASCII: plain lower-casing is enough
            immutable d = std.ascii.toLower(lhs) - std.ascii.toLower(rhs);
            if (!d) continue;
            return d;
        }
        size_t idx = simpleCaseTrie[lhs];
        size_t idx2 = simpleCaseTrie[rhs];
        // simpleCaseTrie is packed index table
        if (idx != EMPTY_CASE_TRIE)
        {
            if (idx2 != EMPTY_CASE_TRIE)
            {// both cased chars
                // adjust idx --> start of bucket
                idx = idx - sTable[idx].n;
                idx2 = idx2 - sTable[idx2].n;
                if (idx == idx2)// one bucket, equivalent chars
                    continue;
                else//  not the same bucket
                    diff = sTable[idx].ch - sTable[idx2].ch;
            }
            else
                // only lhs is cased: compare its bucket representative with rhs
                diff = sTable[idx - sTable[idx].n].ch - rhs;
        }
        else if (idx2 != EMPTY_CASE_TRIE)
        {
            // only rhs is cased: compare lhs with its bucket representative
            diff = lhs - sTable[idx2 - sTable[idx2].n].ch;
        }
        // one of chars is not cased at all
        return diff;
    }
    // r1 is exhausted: equal if r2 is too, otherwise r1 is the shorter one
    return int(r2.empty) - 1;
}
7871 
7872 ///
@safe @nogc pure nothrow unittest
{
    // Cyrillic has only 1:1 case mappings, so simple folding is enough
    assert(sicmp("Август", "авгусТ") == 0);
    // Greek also works as long as there is no 1:M mapping in sight
    assert(sicmp("ΌΎ", "όύ") == 0);

    // Greek small letter iota with dialytika and tonos,
    // precomposed versus spelled out as base letter + two marks
    auto precomposed = "ΐ";
    auto spelledOut = "\u03B9\u0308\u0301";

    // things like this won't get matched as equal by sicmp
    assert(sicmp(precomposed, spelledOut) != 0);

    // while icmp has no problem with that
    assert(icmp(precomposed, spelledOut) == 0);
    assert(icmp("ΌΎ", "όύ") == 0);
}
7886 
7887 // overloads for the most common cases to reduce compile time
@safe @nogc pure nothrow
{
    // Non-template entry points: each one forwards to the matching
    // instantiation of the generic `sicmp` for plain UTF-8/16/32 slices.
    int sicmp(scope const(char)[] str1, scope const(char)[] str2)
    {
        alias Str = const(char)[];
        return sicmp!(Str, Str)(str1, str2);
    }

    int sicmp(scope const(wchar)[] str1, scope const(wchar)[] str2)
    {
        alias Str = const(wchar)[];
        return sicmp!(Str, Str)(str1, str2);
    }

    int sicmp(scope const(dchar)[] str1, scope const(dchar)[] str2)
    {
        alias Str = const(dchar)[];
        return sicmp!(Str, Str)(str1, str2);
    }
}
7899 
/*
    Helper for `icmp`: tries to match `lhs` against `rhs` (optionally followed
    by the start of `rtail`) using the full case-folding table.

    Returns 0 on a match; `rtail` is advanced only when a multi-code-point
    entry matched. Otherwise returns `lhs` itself when it has no table entry,
    or the first entry of its bucket so that callers can compute a diff.
*/
private int fullCasedCmp(Range)(dchar lhs, dchar rhs, ref Range rtail)
{
    import std.algorithm.searching : skipOver;
    import std.internal.unicode_tables : fullCaseTable; // generated file

    // fullCaseTrie is a packed index table into fullCaseTable
    immutable size_t hit = fullCaseTrie[lhs];
    if (hit == EMPTY_CASE_TRIE)
        return lhs;

    // the bucket spans [first, pastLast)
    immutable first = hit - fullCaseTable[hit].n;
    immutable pastLast = fullCaseTable[hit].size + first;
    assert(fullCaseTable[first].entry_len == 1);

    foreach (size_t pos; first .. pastLast)
    {
        auto len = fullCaseTable[pos].entry_len;
        if (len == 1)
        {
            // single code point entry
            if (fullCaseTable[pos].seq[0] == rhs)
                return 0;
            continue;
        }
        // a long chunk, like 'ss' for German
        dstring chunk = fullCaseTable[pos].seq[0 .. len];
        // skipOver advances rtail only if the whole remainder matches
        if (rhs == chunk[0] && rtail.skipOver(chunk[1 .. $]))
            return 0;
    }
    // no match: hand back the remapped character for accurate diffs
    return fullCaseTable[first].seq[0];
}
7936 
/++
    Does case insensitive comparison of `r1` and `r2`.
    Follows the rules of full case-folding mapping.
    This includes matching as equal German ß with "ss" and
    other 1:M $(CODEPOINT) mappings unlike $(LREF sicmp).
    The cost of `icmp` being pedantically correct is
    slightly worse performance.

    Params:
        r1 = a forward range of characters
        r2 = a forward range of characters

    Returns:
        An `int` that is 0 if the strings match,
        &lt;0 if `r1` is lexicographically "less" than `r2`,
        &gt;0 if `r1` is lexicographically "greater" than `r2`

    See_Also:
        $(LREF sicmp)
        $(REF cmp, std,algorithm,comparison)
+/
int icmp(S1, S2)(S1 r1, S2 r2)
if (isForwardRange!S1 && isSomeChar!(ElementEncodingType!S1)
    && isForwardRange!S2 && isSomeChar!(ElementEncodingType!S2))
{
    import std.range.primitives : isInfinite;
    import std.traits : isDynamicArray;
    import std.utf : byDchar;
    static import std.ascii;

    // The fast path is only enabled when both ranges can be indexed and
    // re-sliced with a `size_t` offset; the probe is compiled, never run.
    static if ((isDynamicArray!S1 || isRandomAccessRange!S1)
        && (isDynamicArray!S2 || isRandomAccessRange!S2)
        && !(isInfinite!S1 && isInfinite!S2)
        && __traits(compiles,
            {
                size_t s = size_t.max / 2;
                r1 = r1[s .. $];
                r2 = r2[s .. $];
            }))
    {{
        // ASCII optimization for dynamic arrays & similar.
        size_t i = 0;
        // Only the finite side bounds the scan when one range is infinite.
        static if (isInfinite!S1)
            immutable end = r2.length;
        else static if (isInfinite!S2)
            immutable end = r1.length;
        else
            immutable end = r1.length > r2.length ? r2.length : r1.length;
        for (; i < end; ++i)
        {
            auto lhs = r1[i];
            auto rhs = r2[i];
            // Any non-ASCII code unit hands over to the full folding loop.
            if ((lhs | rhs) >= 0x80) goto NonAsciiPath;
            if (lhs == rhs) continue;
            auto lowDiff = std.ascii.toLower(lhs) - std.ascii.toLower(rhs);
            if (lowDiff) return lowDiff;
        }
        // The common prefix matched: the shorter range orders first.
        static if (isInfinite!S1)
            return 1;
        else static if (isInfinite!S2)
            return -1;
        else
            return (r1.length > r2.length) - (r2.length > r1.length);

    NonAsciiPath:
        // Drop the already-compared ASCII prefix before decoding.
        r1 = r1[i .. $];
        r2 = r2[i .. $];
        // Fall through to standard case.
    }}

    auto str1 = r1.byDchar;
    auto str2 = r2.byDchar;

    for (;;)
    {
        if (str1.empty)
            return str2.empty ? 0 : -1;
        immutable lhs = str1.front;
        if (str2.empty)
            return 1;
        immutable rhs = str2.front;
        // both fronts are consumed before folding so that the helper
        // only ever sees the tails
        str1.popFront();
        str2.popFront();
        if (!(lhs - rhs))
            continue;
        // first try to match lhs to <rhs,right-tail> sequence
        immutable cmpLR = fullCasedCmp(lhs, rhs, str2);
        if (!cmpLR)
            continue;
        // then rhs to <lhs,left-tail> sequence
        immutable cmpRL = fullCasedCmp(rhs, lhs, str1);
        if (!cmpRL)
            continue;
        // cmpXX contain remapped codepoints
        // to obtain stable ordering of icmp
        return cmpLR - cmpRL;
    }
}
8035 
8036 ///
@safe @nogc pure nothrow unittest
{
    // German sharp s folds to "ss" under full case folding
    auto sharpS = "Rußland";
    auto doubleS = "Russland";
    assert(icmp(sharpS, doubleS) == 0);

    // Greek letters with iota subscript match their expanded spellings
    auto left = "ᾩ -> \u1F70\u03B9";
    auto right = "\u1F61\u03B9 -> ᾲ";
    assert(icmp(left, right) == 0);
}
8042 
8043 /**
8044  * By using $(REF byUTF, std,utf) and its aliases, GC allocations via auto-decoding
8045  * and thrown exceptions can be avoided, making `icmp` `@safe @nogc nothrow pure`.
8046  */
@safe @nogc nothrow pure unittest
{
    import std.utf : byDchar;

    // wrap both sides in byDchar so no auto-decoding takes place
    auto sharpS = "Rußland".byDchar;
    auto doubleS = "Russland".byDchar;
    assert(icmp(sharpS, doubleS) == 0);

    auto left = "ᾩ -> \u1F70\u03B9".byDchar;
    auto right = "\u1F61\u03B9 -> ᾲ".byDchar;
    assert(icmp(left, right) == 0);
}
8054 
8055 // test different character types
@safe unittest
{
    // the same two words in UTF-8, UTF-16 and UTF-32
    string sharp8 = "Rußland";
    wstring sharp16 = "Rußland"w;
    dstring sharp32 = "Rußland"d;
    string plain8 = "Russland";
    wstring plain16 = "Russland"w;
    dstring plain32 = "Russland"d;

    // mixed encodings must compare the same way
    assert(icmp(sharp8, plain8) == 0);
    assert(icmp(sharp16, plain8) == 0);
    assert(icmp(sharp8, plain16) == 0);
    assert(icmp(sharp16, plain16) == 0);
    assert(icmp(sharp32, plain16) == 0);
    assert(icmp(sharp16, plain32) == 0);
}
8065 
8066 // overloads for the most common cases to reduce compile time
@safe @nogc pure nothrow
{
    // Non-template entry points: each one forwards to the matching
    // instantiation of the generic `icmp` for plain UTF-8/16/32 slices.
    int icmp(const(char)[] str1, const(char)[] str2)
    {
        alias Str = const(char)[];
        return icmp!(Str, Str)(str1, str2);
    }

    int icmp(const(wchar)[] str1, const(wchar)[] str2)
    {
        alias Str = const(wchar)[];
        return icmp!(Str, Str)(str1, str2);
    }

    int icmp(const(dchar)[] str1, const(dchar)[] str2)
    {
        alias Str = const(dchar)[];
        return icmp!(Str, Str)(str1, str2);
    }
}
8076 
@safe unittest
{
    import std.algorithm.sorting : sort;
    import std.conv : to;
    import std.exception : assertCTFEable;
    assertCTFEable!(
    {
    // everything in this loop must hold for both comparison functions
    static foreach (compare; AliasSeq!(icmp, sicmp))
    {{
        // every combination of UTF-8/16/32 on either side
        static foreach (Lhs; AliasSeq!(string, wstring, dstring))
        static foreach (Rhs; AliasSeq!(string, wstring, dstring))
        {
            // empty strings and prefixes
            assert(compare("".to!Lhs(), "".to!Rhs()) == 0);
            assert(compare("A".to!Lhs(), "".to!Rhs()) > 0);
            assert(compare("".to!Lhs(), "0".to!Rhs()) < 0);
            assert(compare("abc".to!Lhs(), "abc".to!Rhs()) == 0);
            assert(compare("abcd".to!Lhs(), "abc".to!Rhs()) > 0);
            assert(compare("abc".to!Lhs(), "abcd".to!Rhs()) < 0);
            // ASCII and Cyrillic case differences
            assert(compare("Abc".to!Lhs(), "aBc".to!Rhs()) == 0);
            assert(compare("авГуст".to!Lhs(), "АВгУСТ".to!Rhs()) == 0);
            // Check example:
            assert(compare("Август".to!Lhs(), "авгусТ".to!Rhs()) == 0);
            assert(compare("ΌΎ".to!Lhs(), "όύ".to!Rhs()) == 0);
        }
        // check that the order is properly agnostic to the case
        auto fruits = [ "Apple", "ORANGE",  "orAcle", "amp", "banana"];
        sort!((x, y) => compare(x, y) < 0)(fruits);
        assert(fruits == ["amp", "Apple",  "banana", "orAcle", "ORANGE"]);
    }}
    // full folding: "ß" matches "ss", then 'b' > 'a' decides
    assert(icmp("ßb", "ssa") > 0);
    // Check example:
    assert(icmp("Russland", "Rußland") == 0);
    assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0);
    // 1:M mapping is matched by icmp but not by sicmp
    assert(icmp("ΐ"w, "\u03B9\u0308\u0301") == 0);
    assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0);
    // https://issues.dlang.org/show_bug.cgi?id=11057
    assert( icmp("K", "L") < 0 );
    });
}
8116 
8117 // https://issues.dlang.org/show_bug.cgi?id=17372
@safe pure unittest
{
    import std.algorithm.iteration : joiner, map;
    import std.algorithm.sorting : sort;
    import std.array : array;

    // icmp has to accept `joiner` results, which are not arrays
    auto lines = [["foo", "bar"], ["baz"]]
        .map!(words => words.joiner(" "))
        .array;
    auto sorted = lines.sort!((lhs, rhs) => icmp(lhs, rhs) < 0);
}
8125 
8126 // This is package(std) for the moment to be used as a support tool for std.regex
8127 // It needs a better API
8128 /*
8129     Return a range of all $(CODEPOINTS) that casefold to
8130     and from this `ch`.
8131 */
package(std) auto simpleCaseFoldings(dchar ch) @safe
{
    import std.internal.unicode_tables : simpleCaseTable; // generated file
    alias sTable = simpleCaseTable;

    // Input range over either a single code point ("small" mode) or a
    // contiguous run of entries in the simple case folding table.
    static struct Range
    {
    @safe pure nothrow:
        uint idx; // uint.max selects small mode, otherwise current table index
        union
        {
            dchar c; // small mode: the code point, 0 means empty range
            uint len; // table mode: number of entries left
        }

        @property bool isSmall() const
        {
            return idx == uint.max;
        }

        // small mode: wraps `ch` alone
        this(dchar ch)
        {
            idx = uint.max;
            c = ch;
        }

        // table mode: `size` entries beginning at `start`
        this(uint start, uint size)
        {
            idx = start;
            len = size;
        }

        @property dchar front() const
        {
            assert(!empty);
            return isSmall ? c : sTable[idx].ch;
        }

        @property bool empty() const
        {
            return isSmall ? c == 0 : len == 0;
        }

        @property size_t length() const
        {
            if (!isSmall)
                return len;
            return c == 0 ? 0 : 1;
        }

        void popFront()
        {
            if (isSmall)
            {
                c = 0;
                return;
            }
            idx++;
            len--;
        }
    }

    immutable slot = simpleCaseTrie[ch];
    // no table entry: the character folds only to itself
    if (slot == EMPTY_CASE_TRIE)
        return Range(ch);
    auto entry = sTable[slot];
    // entry.n is this entry's offset from the start of its bucket
    immutable bucketStart = slot - entry.n;
    return Range(bucketStart, entry.size);
}
8206 
@system unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.searching : canFind;
    import std.array : array;
    import std.exception : assertCTFEable;
    assertCTFEable!((){
        // a plain upper/lower pair
        auto cyrillic = simpleCaseFoldings('Э').array;
        assert(cyrillic.length == 2);
        assert(cyrillic.canFind('э') && cyrillic.canFind('Э'));

        // an uncased character yields just itself
        auto tilde = simpleCaseFoldings('~');
        assert(tilde.equal("~"));

        //A with ring above - casefolds to the same bucket as Angstrom sign
        auto ring = simpleCaseFoldings('Å');
        assert(ring.length == 3);
        assert(ring.canFind('å') && ring.canFind('Å') && ring.canFind('\u212B'));
    });
}
8225 
/++
    $(P Looks up and returns the $(S_LINK Combining class, combining class)
    of `ch`.)
+/
ubyte combiningClass(dchar ch) @safe pure nothrow @nogc
{
    // a single trie lookup
    immutable ubyte ccc = combiningClassTrie[ch];
    return ccc;
}
8233 
8234 ///
@safe unittest
{
    // combining tilde
    immutable tilde = combiningClass('\u0303');
    assert(tilde == 230);

    // combining ring below
    immutable ringBelow = combiningClass('\u0325');
    assert(ringBelow == 220);

    // the simple consequence is that "tilde" should be
    // placed after a "ring below" in a sequence
}
8247 
@safe pure nothrow @nogc unittest
{
    // every ASCII code point has class 0
    foreach (cp; 0 .. 0x80)
    {
        assert(combiningClass(cp) == 0);
    }

    // spot checks for several non-zero classes
    assert(combiningClass('\u05BD') == 22);
    assert(combiningClass('\u0300') == 230);
    assert(combiningClass('\u0317') == 220);
    assert(combiningClass('\u1939') == 222);
}
8257 
/// Kind of Unicode character decomposition.
enum UnicodeDecomposition
{
    /// Canonical decomposition. The result is a canonically equivalent sequence.
    Canonical,

    /**
         Compatibility decomposition. The result is a compatibility
         equivalent sequence.
         Note: Compatibility decomposition is a $(B lossy) conversion,
         typically suitable only for fuzzy matching and internal processing.
    */
    Compatibility
}
8269 
/**
    Module-level shorthands for the $(LREF UnicodeDecomposition) members,
    meant to be passed as the template argument of $(LREF decompose).
*/
enum
{
    Canonical = UnicodeDecomposition.Canonical,
    Compatibility = UnicodeDecomposition.Compatibility
}
8278 
/++
    Attempts canonical composition of two $(CHARACTERS).
    Returns the composed $(CHARACTER) if the pair composes and
    dchar.init otherwise.

    `first` is assumed to precede `second` in the original text,
    which usually means that `first` is a starter.

    Note: Hangul syllables are not covered by this function.
    See `composeJamo` below.
+/
public dchar compose(dchar first, dchar second) pure nothrow @safe
{
    import std.algorithm.iteration : map;
    import std.internal.unicode_comp : compositionTable, composeCntShift, composeIdxMask;
    import std.range : assumeSorted;

    immutable jump = compositionJumpTrie[first];
    // ushort.max marks "nothing composes with `first`"
    if (jump == ushort.max)
        return dchar.init;

    // the packed value holds the offset and length of the candidate run
    immutable offset = jump & composeIdxMask;
    immutable count = jump >> composeCntShift;
    auto candidates = compositionTable[offset .. offset + count];

    // binary search by the second character of each candidate pair
    // TODO: optimize this micro binary search (no more than 4-5 steps)
    immutable pos = candidates.map!"a.rhs"().assumeSorted().lowerBound(second).length;
    if (pos != count)
    {
        immutable hit = candidates[pos];
        if (hit.rhs == second)
            return hit.composed;
    }
    return dchar.init;
}
8309 
8310 ///
@safe unittest
{
    enum dchar diaeresis = '\u0308';
    enum dchar acute = '\u0301';

    assert(compose('A', diaeresis) == '\u00C4');
    assert(compose('A', 'B') == dchar.init);
    assert(compose('C', acute) == '\u0106');
    // note that the starter is the first one
    // thus the following doesn't compose
    assert(compose(diaeresis, 'A') == dchar.init);
}
8320 
/++
    Returns the full $(S_LINK Canonical decomposition, Canonical)
    (the default) or $(S_LINK Compatibility decomposition, Compatibility)
    decomposition of $(CHARACTER) `ch`.
    When `ch` has no decomposition, the result is a $(LREF Grapheme)
    holding `ch` itself.

    Note:
    This function also decomposes hangul syllables
    as prescribed by the standard.

    See_Also: $(LREF decomposeHangul) for a restricted version
    that takes into account only hangul syllables but
    no other decompositions.
+/
public Grapheme decompose(UnicodeDecomposition decompType=Canonical)(dchar ch) @safe
{
    import std.algorithm.searching : until;
    import std.internal.unicode_decomp : decompCompatTable, decompCanonTable;

    // choose the table and index trie that belong to the requested kind
    static if (decompType == Canonical)
    {
        alias mapping = canonMappingTrie;
        alias table = decompCanonTable;
    }
    else static if (decompType == Compatibility)
    {
        alias mapping = compatMappingTrie;
        alias table = decompCompatTable;
    }

    immutable offset = mapping[ch];
    if (offset)
    {
        // table entries are zero-terminated sequences of code points
        return Grapheme(table[offset .. $].until(0));
    }
    // not found, check hangul arithmetic decomposition
    return decomposeHangul(ch);
}
8356 
8357 ///
@system unittest
{
    import std.algorithm.comparison : equal;

    enum dchar diaeresis = '\u0308';
    enum dchar acute = '\u0301';

    assert(compose('A', diaeresis) == '\u00C4');
    assert(compose('A', 'B') == dchar.init);
    assert(compose('C', acute) == '\u0106');
    // note that the starter is the first one
    // thus the following doesn't compose
    assert(compose(diaeresis, 'A') == dchar.init);

    // canonical: base letter + combining circumflex
    assert(decompose('Ĉ')[].equal("C\u0302"));
    // nothing to decompose
    assert(decompose('D')[].equal("D"));
    // hangul syllable, decomposed arithmetically
    assert(decompose('\uD4DC')[].equal("\u1111\u1171\u11B7"));
    // compatibility: superscript one becomes a plain digit
    assert(decompose!Compatibility('¹')[].equal("1"));
}
8374 
//----------------------------------------------------------------------------
// Hangul specific composition/decomposition

// first code point of each block: syllables (S), leading consonants (L),
// vowels (V) and trailing consonants (T)
enum jamoSBase = 0xAC00;
enum jamoLBase = 0x1100;
enum jamoVBase = 0x1161;
enum jamoTBase = 0x11A7;

// sizes of the L, V and T ranges
enum jamoLCount = 19;
enum jamoVCount = 21;
enum jamoTCount = 28;

// derived counts, used to split and build syllable indices
enum jamoNCount = jamoVCount * jamoTCount;
enum jamoSCount = jamoLCount * jamoNCount;
8384 
// Returns true if `ch` lies in the Hangul leading consonant (L) jamo range.
bool isJamoL(dchar ch) pure nothrow @nogc @safe
{
    enum pastLast = jamoLBase + jamoLCount;
    // upper bound goes first: it rejects ~ 1M code points above the range
    return ch < pastLast && ch >= jamoLBase;
}
8391 
// Tests if `ch` is a Hangul trailing consonant jamo.
bool isJamoT(dchar ch) pure nothrow @nogc @safe
{
    // first cmp rejects ~ 1M code points above trailing jamo range
    // Note: ch == jamoTBase doesn't indicate trailing jamo (TIndex must be > 0)
    return ch < jamoTBase+jamoTCount && ch > jamoTBase;
}
8399 
// Tests if `ch` is a Hangul vowel jamo.
bool isJamoV(dchar ch) pure nothrow @nogc @safe
{
    // first cmp rejects ~ 1M code points above vowel range
    return  ch < jamoVBase+jamoVCount && ch >= jamoVBase;
}
8406 
// Returns the offset of `ch` from jamoSBase when it falls inside the
// jamoSCount-sized syllable range, and -1 otherwise.
int hangulSyllableIndex(dchar ch) pure nothrow @nogc @safe
{
    immutable offset = cast(int) ch - jamoSBase;
    if (offset < 0 || offset >= jamoSCount)
        return -1;
    return offset;
}
8412 
// internal helper: composes <L,V> and <L,V,T> jamo runs in place, writing the
// syllable into the first slot and dchar.init into the slots it consumed
void hangulRecompose(dchar[] seq) pure nothrow @nogc @safe
{
    size_t pos = 0;
    while (pos + 1 < seq.length)
    {
        // a syllable can only start with a leading consonant + vowel pair
        if (!isJamoL(seq[pos]) || !isJamoV(seq[pos+1]))
        {
            pos++;
            continue;
        }

        immutable int indexL = seq[pos] - jamoLBase;
        immutable int indexV = seq[pos+1] - jamoVBase;
        immutable int indexLV = indexL * jamoNCount + indexV * jamoTCount;
        immutable hasTrailing = pos + 2 < seq.length && isJamoT(seq[pos+2]);

        if (hasTrailing)
        {
            // LVT syllable: three slots collapse into one
            seq[pos] = jamoSBase + indexLV + seq[pos+2] - jamoTBase;
            seq[pos+1] = dchar.init;
            seq[pos+2] = dchar.init;
            pos += 3;
        }
        else
        {
            // LV syllable: two slots collapse into one
            seq[pos] = jamoSBase + indexLV;
            seq[pos+1] = dchar.init;
            pos += 2;
        }
    }
}
8441 
8442 //----------------------------------------------------------------------------
8443 public:
8444 
/**
    Decomposes a Hangul syllable into its jamos. If `ch` is not a composed
    syllable, the returned $(LREF Grapheme) contains only `ch` as is.
*/
Grapheme decomposeHangul(dchar ch) @safe
{
    immutable idxS = cast(int) ch - jamoSBase;
    immutable isSyllable = idxS >= 0 && idxS < jamoSCount;
    if (!isSyllable)
        return Grapheme(ch);

    // split the syllable index into its L, V and T components
    immutable idxL = idxS / jamoNCount;
    immutable idxV = (idxS % jamoNCount) / jamoTCount;
    immutable idxT = idxS % jamoTCount;

    immutable partL = jamoLBase + idxL;
    immutable partV = jamoVBase + idxV;
    if (idxT <= 0)
    {
        // no trailing consonant: <L, V> decomposition
        return Grapheme(partL, partV);
    }
    // there is a trailing consonant (T): <L, V, T> decomposition
    return Grapheme(partL, partV, jamoTBase + idxT);
}
8464 
8465 ///
@system unittest
{
    import std.algorithm.comparison : equal;

    // an LVT syllable splits into three jamos
    auto jamos = decomposeHangul('\uD4DB');
    assert(jamos[].equal("\u1111\u1171\u11B6"));
}
8471 
/++
    Tries to build a hangul syllable from a leading consonant (`lead`),
    a `vowel` and an optional `trailing` consonant jamo.

    On success returns the composed LV or LVT hangul syllable.

    If `lead` or `vowel` is not a valid hangul jamo
    of the respective $(CHARACTER) class, returns dchar.init.
+/
dchar composeJamo(dchar lead, dchar vowel, dchar trailing=dchar.init) pure nothrow @nogc @safe
{
    // both mandatory parts must come from their own jamo range
    if (!isJamoL(lead) || !isJamoV(vowel))
        return dchar.init;

    immutable indexL = lead - jamoLBase;
    immutable indexV = vowel - jamoVBase;
    immutable indexLV = indexL * jamoNCount + indexV * jamoTCount;
    immutable dchar syllable = jamoSBase + indexLV;

    // anything that is not a trailing consonant is ignored: LV syllable
    if (!isJamoT(trailing))
        return syllable;
    return syllable + (trailing - jamoTBase);
}
8493 
8494 ///
@safe unittest
{
    enum dchar lead = '\u1111';
    enum dchar vowel = '\u1171';

    assert(composeJamo(lead, vowel, '\u11B6') == '\uD4DB');
    // leaving out the trailing consonant, or passing any code point
    // that is not a trailing consonant, composes an LV-syllable
    assert(composeJamo(lead, vowel) == '\uD4CC');
    assert(composeJamo(lead, vowel, ' ') == '\uD4CC');
    // an invalid vowel or lead gives dchar.init
    assert(composeJamo(lead, 'A') == dchar.init);
    assert(composeJamo('A', vowel) == dchar.init);
}
8505 
@system unittest
{
    import std.algorithm.comparison : equal;
    import std.conv : text;

    // decomposes `ch` with the given kind and compares against `expected`
    static void testDecomp(UnicodeDecomposition T)(dchar ch, string expected)
    {
        Grapheme got = decompose!T(ch);
        assert(equal(got[], expected), text(got[], " vs ", expected));
    }
    testDecomp!Canonical('\u1FF4', "\u03C9\u0301\u0345");
    testDecomp!Canonical('\uF907', "\u9F9C");
    testDecomp!Compatibility('\u33FF', "\u0067\u0061\u006C");
    testDecomp!Compatibility('\uA7F9', "\u0153");

    // check examples
    enum dchar lead = '\u1111';
    enum dchar vowel = '\u1171';
    assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6"));
    assert(composeJamo(lead, vowel, '\u11B6') == '\uD4DB');
    assert(composeJamo(lead, vowel) == '\uD4CC'); // leave out the trailing consonant
    assert(composeJamo(lead, vowel, ' ') == '\uD4CC');
    assert(composeJamo(lead, 'A') == dchar.init);
    assert(composeJamo('A', vowel) == dchar.init);
}
8529 
/**
    The Unicode normalization forms. A member of this enumeration is
    passed as a template parameter to functions like $(LREF normalize).
*/
enum NormalizationForm
{
    NFC,
    NFD,
    NFKC,
    NFKD
}
8540 
8541 
enum
{
    /**
        Module-level shorthands for the normalization form values.
    */
    NFC = NormalizationForm.NFC,
    ///ditto
    NFD = NormalizationForm.NFD,
    ///ditto
    NFKC = NormalizationForm.NFKC,
    ///ditto
    NFKD = NormalizationForm.NFKD
}
8554 
/++
    Returns the `input` string normalized to the chosen form.
    Form C is the default.

    For more information on normalization forms see
    the $(S_LINK Normalization, normalization section).

    Note:
    If the string is already normalized, it is returned
    unmodified and no memory allocation happens.
+/
inout(C)[] normalize(NormalizationForm norm=NFC, C)(inout(C)[] input)
{
    import std.algorithm.mutation : SwapStrategy;
    import std.algorithm.sorting : sort;
    import std.array : appender;
    import std.range : zip;

    // NFC & NFD decompose canonically, NFKC & NFKD use compatibility mappings
    static if (norm == NFD || norm == NFC)
        enum decompKind = Canonical;
    else
        enum decompKind = Compatibility;
    // the composed forms need a recomposition pass after reordering
    enum recomposes = norm == NFC || norm == NFKC;

    auto anchors = splitNormalized!norm(input);
    // both anchors at the end: nothing needs work, hand back the same slice
    if (anchors[0] == input.length && anchors[1] == input.length)
        return input;

    // scratch buffers, reused for every chunk that needs normalizing
    dchar[] codepoints;
    codepoints.reserve(31);
    ubyte[] classes;
    classes.reserve(31);
    auto result = appender!(C[])();
    do
    {
        // copy the part before the first anchor as is
        result.put(input[0 .. anchors[0]]);

        // fully decompose the chunk between the two anchors
        foreach (dchar ch; input[anchors[0] .. anchors[1]])
            foreach (dchar c; decompose!decompKind(ch)[])
                codepoints ~= c;

        classes.length = codepoints.length;
        size_t runStart = 0;
        ubyte prevClass = 0;

        // stable-sort every run of non-zero combining classes
        foreach (idx, dchar ch; codepoints)
        {
            immutable cls = combiningClass(ch);
            classes[idx] = cls;
            if (cls == 0 && prevClass != 0)
            {
                // found a stable code point after unstable ones
                sort!("a[0] < b[0]", SwapStrategy.stable)
                    (zip(classes[runStart .. idx], codepoints[runStart .. idx]));
                runStart = codepoints.length;
            }
            else if (cls != 0 && prevClass == 0)
            {
                // found first unstable code point after stable ones
                runStart = idx;
            }
            prevClass = cls;
        }
        // sort the run that reaches the end of the buffer, if any
        sort!("a[0] < b[0]", SwapStrategy.stable)
            (zip(classes[runStart .. $], codepoints[runStart .. $]));

        static if (recomposes)
        {
            import std.algorithm.mutation : remove;
            import std.algorithm.searching : countUntil;

            auto first = countUntil(classes, 0);
            if (first >= 0) // no starters?? no recomposition
            {
                for (;;)
                {
                    immutable second = recompose(first, codepoints, classes);
                    if (second == codepoints.length)
                        break;
                    first = second;
                }
                // 2nd pass for hangul syllables
                hangulRecompose(codepoints);
            }
            // drop the dchar.init holes left behind by composition
            auto clean = remove!("a == dchar.init", SwapStrategy.stable)(codepoints);
            result.put(codepoints[0 .. clean.length]);
        }
        else
            result.put(codepoints);

        // reset variables
        codepoints.length = 0;
        () @trusted {
            codepoints.assumeSafeAppend();
            classes.length = 0;
            classes.assumeSafeAppend();
        } ();
        input = input[anchors[1] .. $];
        // and move on
        anchors = splitNormalized!norm(input);
    } while (anchors[0] != input.length);
    // copy whatever is left after the last processed chunk
    result.put(input[0 .. anchors[0]]);
    return () @trusted inout { return cast(inout(C)[]) result.data; } ();
}
8658 
8659 ///
@safe unittest
{
    // any encoding works
    wstring greet = "Hello world";
    auto same = normalize(greet);
    assert(same is greet); // the same exact slice

    // An example of a character with all 4 forms being different:
    // Greek upsilon with acute and hook symbol (code point 0x03D3)
    auto upsilon = "ϓ";
    assert(normalize!NFC(upsilon) == "\u03D3");
    assert(normalize!NFD(upsilon) == "\u03D2\u0301");
    assert(normalize!NFKC(upsilon) == "\u038E");
    assert(normalize!NFKD(upsilon) == "\u03A5\u0301");
}
8673 
@safe unittest
{
    import std.conv : text;

    // compatibility ideograph in the middle of ASCII text
    assert(normalize!NFD("abc\uF904def") == "abc\u6ED1def", text(normalize!NFD("abc\uF904def")));
    // superscript digits become plain digits under NFKD
    assert(normalize!NFKD("2¹⁰") == "210", normalize!NFKD("2¹⁰"));
    // precomposed letter splits into base + combining diaeresis
    assert(normalize!NFD("Äffin") == "A\u0308ffin");

    // check example

    // any encoding works
    wstring greet = "Hello world";
    auto same = normalize(greet);
    assert(same is greet); // the same exact slice

    // An example of a character with all 4 forms being different:
    // Greek upsilon with acute and hook symbol (code point 0x03D3)
    auto upsilon = "ϓ";
    assert(normalize!NFC(upsilon) == "\u03D3");
    assert(normalize!NFD(upsilon) == "\u03D2\u0301");
    assert(normalize!NFKC(upsilon) == "\u038E");
    assert(normalize!NFKD(upsilon) == "\u03A5\u0301");
}
8695 
// Canonically recomposes, in place, the code points that follow the starter
// at input[start]. `ccc` holds the combining class of every element of `input`.
// Code points merged into the starter are overwritten with dchar.init.
// Returns the index at which the scan stopped: either input.length or the
// position of the first later starter that was not merged.
private size_t recompose(size_t start, dchar[] input, ubyte[] ccc) pure nothrow @safe
{
    assert(input.length == ccc.length);
    // CCC of the last code point that was left unmerged between the starter
    // and the current position; -1 lies outside 0 .. 255, so nothing is
    // blocked at the beginning
    int blockingCC = -1;
    // input[start] is the starter itself, so the scan begins right after it
    size_t i = start + 1;
    for (; i != input.length; i++)
    {
        immutable curCC = ccc[i];
        // In any character sequence beginning with a starter S
        // a character C is blocked from S if and only if there
        // is some character B between S and C, and either B
        // is a starter or it has the same or higher combining class as C.
        // Here S is input[start], C is input[i] and, because the classes
        // are sorted, blockingCC is the maximum CCC between S and C.
        bool merged = false;
        if (curCC > blockingCC)
        {
            immutable comp = compose(input[start], input[i]);
            if (comp != dchar.init)
            {
                input[start] = comp;
                input[i] = dchar.init; // sentinel: this slot is now vacant
                // a merged code point must not block the ones after it,
                // hence blockingCC stays as it was
                merged = true;
            }
        }
        if (!merged)
        {
            blockingCC = curCC;
            // an unmerged starter ends the run
            if (curCC == 0)
                break;
        }
    }
    return i;
}
8749 
// Scans `input` for the first spot that breaks normalization form `norm`.
// Returns a tuple of 2 indexes that delimit: the leading text that passed
// the check, the piece that needs normalization, and the rest of the input
// (as computed by seekStable). When nothing needs work both indexes are
// input.length.
private auto splitNormalized(NormalizationForm norm, C)(const(C)[] input)
{
    import std.typecons : tuple;
    ubyte prevCC = 0;

    foreach (pos, dchar ch; input)
    {
        static if (norm == NFC)
        {
            // shortcut for NFC: code points below U+0300 are accepted
            // without consulting any table
            if (ch < 0x0300)
            {
                prevCC = 0;
                continue;
            }
        }
        immutable ubyte curCC = combiningClass(ch);
        // non-starters must appear in non-decreasing order of their class
        immutable outOfOrder = curCC != 0 && prevCC > curCC;
        if (outOfOrder || notAllowedIn!norm(ch))
            return seekStable!norm(pos, input);
        prevCC = curCC;
    }
    return tuple(input.length, input.length);
}
8780 
// Given the index `idx` of a code point that breaks normalization form
// `norm`, finds the surrounding region that has to be re-normalized.
// A code point is "stable" when its combining class is 0 and it is allowed
// in `norm`. Returns tuple(region_start, region_end) where:
//   region_start - offset of the last stable code point before `idx`
//                  (0 if there is none)
//   region_end   - offset of the first stable code point at or after `idx`
//                  (input.length if there is none)
private auto seekStable(NormalizationForm norm, C)(size_t idx, const scope C[] input)
{
    import std.range.primitives : popBack;
    import std.typecons : tuple;
    import std.utf : codeLength;

    auto br = input[0 .. idx];
    size_t region_start = 0;// default
    // Walk backwards one code point at a time. The range has to shrink from
    // the back: popping from the front would leave `br.back` unchanged, so
    // only the code point just before `idx` would ever be examined.
    while (!br.empty)
    {
        immutable dchar ch = br.back;
        if (combiningClass(ch) == 0 && allowedIn!norm(ch))
        {
            // br still ends with `ch`, so this is the offset where it starts
            region_start = br.length - codeLength!C(ch);
            break;
        }
        br.popBack();
    }
    ///@@@BUG@@@ can't use find: " find is a nested function and can't be used..."
    size_t region_end=input.length;// end is $ by default
    foreach (i, dchar ch; input[idx..$])
    {
        if (combiningClass(ch) == 0 && allowedIn!norm(ch))
        {
            region_end = i+idx;
            break;
        }
    }
    // writeln("Region to normalize: ", input[region_start .. region_end]);
    return tuple(region_start, region_end);
}
8813 
/**
    Tests if dchar `ch` is always allowed (Quick_Check=YES) in normalization
    form `norm`.

    Params:
        norm = the normalization form to test against
        ch = the code point to test

    Returns:
        `true` unless the quick check data for `norm` flags `ch`.
*/
public bool allowedIn(NormalizationForm norm)(dchar ch)
{
    immutable rejected = notAllowedIn!norm(ch);
    return !rejected;
}
8822 
///
@safe unittest
{
    // Cyrillic letters are always allowed, whatever the form
    dchar ya = 'я';
    assert(allowedIn!NFC(ya));
    assert(allowedIn!NFD(ya));
    assert(allowedIn!NFKC(ya));
    assert(allowedIn!NFKD(ya));
    // and so is ASCII
    assert(allowedIn!NFC('Z'));
}
8833 
// Not a user friendly name but a more direct one: returns the raw entry of
// the quick check trie that belongs to `norm`, i.e. `true` when `ch` is
// flagged as not (always) allowed in that normalization form.
private bool notAllowedIn(NormalizationForm norm)(dchar ch)
{
    static if (norm == NFC)
        alias qcTrie = nfcQCTrie;
    else static if (norm == NFD)
        alias qcTrie = nfdQCTrie;
    else static if (norm == NFKC)
        alias qcTrie = nfkcQCTrie;
    else static if (norm == NFKD)
        alias qcTrie = nfkdQCTrie;
    else
        // the condition must be false for the assert to fire; a string in
        // the condition slot would never produce the intended error
        static assert(0, "Unknown normalization form");
    return qcTrie[ch];
}
8849 
@safe unittest
{
    // a Cyrillic letter passes the quick check of every form
    foreach (form; AliasSeq!(NFC, NFD, NFKC, NFKD))
        assert(allowedIn!form('я'));
    assert(allowedIn!NFC('Z'));
}
8858 
8859 }
8860 
8861 version (std_uni_bootstrap)
8862 {
8863     // old version used for bootstrapping of gen_uni.d that generates
8864     // up to date optimal versions of all of isXXX functions
8865     @safe pure nothrow @nogc public bool isWhite(dchar c)
8866     {
8867         import std.ascii : isWhite;
8868         return isWhite(c) ||
8869                c == lineSep || c == paraSep ||
8870                c == '\u0085' || c == '\u00A0' || c == '\u1680' || c == '\u180E' ||
8871                (c >= '\u2000' && c <= '\u200A') ||
8872                c == '\u202F' || c == '\u205F' || c == '\u3000';
8873     }
8874 }
8875 else
8876 {
8877 
// trusted -> avoid bounds check
@trusted pure nothrow @nogc private
{
    import std.internal.unicode_tables; // : toLowerTable, toTitleTable, toUpperTable; // generated file

    // The generated tries and tables are wrapped in plain functions so that
    // the template instances stay hidden, see
    // https://issues.dlang.org/show_bug.cgi?id=13232

    // --- lower case ---
    ushort toLowerIndex(dchar c)
    {
        return toLowerIndexTrie[c];
    }

    ushort toLowerSimpleIndex(dchar c)
    {
        return toLowerSimpleIndexTrie[c];
    }

    dchar toLowerTab(size_t idx)
    {
        return toLowerTable[idx];
    }

    // --- title case ---
    ushort toTitleIndex(dchar c)
    {
        return toTitleIndexTrie[c];
    }

    ushort toTitleSimpleIndex(dchar c)
    {
        return toTitleSimpleIndexTrie[c];
    }

    dchar toTitleTab(size_t idx)
    {
        return toTitleTable[idx];
    }

    // --- upper case ---
    ushort toUpperIndex(dchar c)
    {
        return toUpperIndexTrie[c];
    }

    ushort toUpperSimpleIndex(dchar c)
    {
        return toUpperSimpleIndexTrie[c];
    }

    dchar toUpperTab(size_t idx)
    {
        return toUpperTable[idx];
    }
}
8897 
8898 public:
8899 
/++
    Whether or not `c` is a Unicode whitespace $(CHARACTER).
    (general Unicode category: Part of C0(tab, vertical tab, form feed,
    carriage return, and linefeed characters), Zs, Zl, Zp, and NEL(U+0085))

    Params:
        c = the code point to classify

    Returns:
        `true` if `c` is classified as white space by the generated tables.
+/
@safe pure nothrow @nogc
public bool isWhite(dchar c)
{
    // the lookup itself lives in the generated tables module
    import std.internal.unicode_tables : isWhiteGen;
    immutable bool white = isWhiteGen(c);
    return white;
}
8911 
/++
    Return whether `c` is a Unicode lowercase $(CHARACTER).

    Params:
        c = the code point to classify
+/
@safe pure nothrow @nogc
bool isLower(dchar c)
{
    import std.ascii : isLower, isASCII;
    // ASCII is answered by std.ascii, everything else by the trie
    return isASCII(c) ? isLower(c) : lowerCaseTrie[c];
}
8923 
@safe unittest
{
    static import std.ascii;
    // within ASCII the answer must agree with std.ascii
    foreach (v; 0 .. 0x80)
        assert(std.ascii.isLower(v) == .isLower(v));

    // Cyrillic, Greek (heta, beta) and extended Greek lower case letters
    dchar[] lowerLetters = ['я', 'й', '\u0371', '\u03B2', '\u1F00'];
    foreach (letter; lowerLetters)
        assert(.isLower(letter));

    // Cyrillic, Greek (heta, capital mu) and extended Greek capitals
    dchar[] upperLetters = ['Ж', '\u0370', '\u039C', '\u1F18'];
    foreach (letter; upperLetters)
        assert(!.isLower(letter));

    // the whole lowerCase set is lower case and never upper case
    foreach (v; unicode.lowerCase.byCodepoint)
        assert(.isLower(v) && !isUpper(v));
}
8943 
8944 
/++
    Return whether `c` is a Unicode uppercase $(CHARACTER).

    Params:
        c = the code point to classify
+/
@safe pure nothrow @nogc
bool isUpper(dchar c)
{
    import std.ascii : isUpper, isASCII;
    // ASCII is answered by std.ascii, everything else by the trie
    return isASCII(c) ? isUpper(c) : upperCaseTrie[c];
}
8956 
@safe unittest
{
    // A static import keeps the unqualified `isUpper` below bound to the
    // Unicode-aware function of this module.
    static import std.ascii;
    // within ASCII the answer must agree with std.ascii
    // (this loop used to re-test isLower, leaving isUpper unchecked on ASCII)
    foreach (v; 0 .. 0x80)
        assert(std.ascii.isUpper(v) == isUpper(v));
    assert(!isUpper('й'));
    assert(isUpper('Ж'));
    // Greek HETA
    assert(isUpper('\u0370'));
    assert(!isUpper('\u0371'));
    assert(isUpper('\u039C')); // capital MU
    assert(!isUpper('\u03B2')); // beta
    // from extended Greek
    assert(!isUpper('\u1F00'));
    assert(isUpper('\u1F18'));
    // the whole upperCase set is upper case and never lower case
    foreach (v; unicode.upperCase.byCodepoint)
        assert(isUpper(v) && !.isLower(v));
}
8975 
8976 
//TODO: Hidden for now, needs better API.
//Other transforms could use better API as well, but this one is a new primitive.
// Maps `c` through the simple (1:1) title case table; code points that have
// no entry are returned unchanged.
@safe pure nothrow @nogc
private dchar toTitlecase(dchar c)
{
    // fast path: below U+00AA only ASCII 'a' .. 'z' are remapped
    if (c < 0xAA)
    {
        if (c >= 'a' && c <= 'z')
            return cast(dchar)(c - 32);
        return c;
    }
    immutable size_t slot = toTitleSimpleIndex(c);
    // ushort.max marks "no mapping"
    return slot == ushort.max ? c : toTitleTab(slot);
}
8998 
8999 private alias UpperTriple = AliasSeq!(toUpperIndex, MAX_SIMPLE_UPPER, toUpperTab);
9000 private alias LowerTriple = AliasSeq!(toLowerIndex, MAX_SIMPLE_LOWER, toLowerTab);
9001 
// Generic toUpper/toLower on a whole string or random access range of
// characters. When no code point has a case mapping the input is returned
// as is (a string) or copied into an array (other ranges); otherwise a
// newly built array is returned.
//   indexFn      - maps a code point to its slot in the case table,
//                  ushort.max when there is no mapping
//   maxIdx       - slots below this are 1:1 mappings, the rest are 1:m
//   tableFn      - reads a case table entry
//   asciiConvert - conversion applied to ASCII code points
private ElementEncodingType!S[] toCase(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, S)(S s)
if (isSomeString!S || (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    import std.array : appender, array;
    import std.ascii : isASCII;
    import std.utf : byDchar, codeLength;

    alias C = ElementEncodingType!S;

    // Phase 1: measure (in code units) the prefix that has no mapping
    size_t prefixLen = 0;
    bool mustConvert = false;
    foreach (dchar probe; s.byDchar)
    {
        if (indexFn(probe) != ushort.max)
        {
            mustConvert = true;
            break;
        }
        prefixLen += codeLength!C(probe);
    }

    if (!mustConvert)
    {
        static if (isSomeString!S)
            return s;
        else
            return s.array;
    }

    // Phase 2: copy the untouched prefix, then convert the remainder
    auto result = appender!(C[])();
    result.reserve(s.length);
    result.put(s[0 .. prefixLen]);
    foreach (dchar c; s[prefixLen .. $].byDchar)
    {
        if (c.isASCII)
        {
            result.put(asciiConvert(c));
            continue;
        }
        immutable ushort slot = indexFn(c);
        if (slot == ushort.max)
            result.put(c);
        else if (slot < maxIdx)
            result.put(tableFn(slot));
        else
        {
            // 1:m mapping: the first entry packs the sequence length into
            // the top 8 bits and the first code point into the low 24 bits
            immutable packed = tableFn(slot);
            immutable uint seqLen = packed >> 24;
            result.put(cast(dchar)(packed & 0xFF_FFFF));
            foreach (j; slot+1 .. slot+seqLen)
                result.put(tableFn(j));
        }
    }
    return result.data;
}
9057 
// https://issues.dlang.org/show_bug.cgi?id=12428
// toUpper must not modify the string it is given
@safe unittest
{
    import std.array : replicate;
    // a short slice of a much larger buffer
    auto sample = replicate("abcdefghij", 300)[0 .. 10];

    toUpper(sample);

    assert(sample == "abcdefghij");
}
9069 
// https://issues.dlang.org/show_bug.cgi?id=18993
// evaluated at compile time: both inputs must lower-case to equal lengths
@safe unittest
{
    enum fromUpper = `몬스터/A`.toLower;
    enum fromLower = `몬스터/a`.toLower;
    static assert(fromUpper.length == fromLower.length);
}
9075 
9076 
// Generic lazy toUpper/toLower on a whole range: wraps a range of dchar and
// returns a range that yields the case-converted code points.
// The template arguments have the same meaning as for toCase.
private auto toCaser(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, Range)(Range str)
    // Accept range of dchar's
if (isInputRange!Range &&
    isSomeChar!(ElementEncodingType!Range) &&
    ElementEncodingType!Range.sizeof == dchar.sizeof)
{
    static struct ToCaserImpl
    {
        @property bool empty()
        {
            return pending == 0 && source.empty;
        }

        @property auto front()
        {
            import std.ascii : isASCII;

            // something is already staged: serve it
            if (pending != 0)
                return staged[pending - 1];

            dchar c = source.front;
            if (c.isASCII)
            {
                staged[0] = asciiConvert(c);
                pending = 1;
            }
            else
            {
                const idx = indexFn(c);
                if (idx == ushort.max || idx < maxIdx)
                {
                    // no mapping at all, or a 1:1 mapping
                    staged[0] = (idx == ushort.max) ? c : tableFn(idx);
                    pending = 1;
                }
                else
                {
                    // 1:m mapping: the first entry packs the sequence
                    // length (top 8 bits) with the first code point
                    immutable packed = tableFn(idx);
                    uint count = packed >> 24;
                    if (count == 0)
                        count = 1;
                    assert(count <= staged.length);
                    // staged is consumed from the highest index down, so
                    // the sequence is stored in reverse
                    staged[count - 1] = cast(dchar)(packed & 0xFF_FFFF);
                    foreach (k; 1 .. count)
                        staged[count - k - 1] = tableFn(idx + k);
                    pending = count;
                }
            }
            return staged[pending - 1];
        }

        void popFront()
        {
            // make sure the current code point has been converted
            if (pending == 0)
                front;
            assert(pending);
            --pending;
            // advance the source only once the whole mapping is consumed
            if (pending == 0)
                source.popFront();
        }

        static if (isForwardRange!Range)
        {
            @property auto save()
            {
                auto copy = this;
                copy.source = source.save;
                return copy;
            }
        }

      private:
        Range source;              // the wrapped range of dchar
        uint pending;              // number of staged code points left to yield
        dchar[3] staged = void;    // conversion of source.front, in reverse
    }

    return ToCaserImpl(str);
}
9161 
/*********************
 * Lazily convert an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 * or a string to upper or lower case.
 *
 * Does not allocate memory.
 * Characters in UTF-8 or UTF-16 format that cannot be decoded
 * are treated as $(REF replacementDchar, std,utf).
 *
 * Params:
 *      str = string or range of characters
 *
 * Returns:
 *      an input range of `dchar`s
 *
 * See_Also:
 *      $(LREF toUpper), $(LREF toLower)
 */

auto asLowerCase(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    static if (ElementEncodingType!Range.sizeof >= dchar.sizeof)
    {
        // already a range of dchar: convert directly
        static import std.ascii;
        return toCaser!(LowerTriple, std.ascii.toLower)(str);
    }
    else
    {
        import std.utf : byDchar;

        // narrow encoding: decode first, then convert
        return asLowerCase(str.byDchar);
    }
}
9197 
/// ditto
auto asUpperCase(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    static if (ElementEncodingType!Range.sizeof >= dchar.sizeof)
    {
        // already a range of dchar: convert directly
        static import std.ascii;
        return toCaser!(UpperTriple, std.ascii.toUpper)(str);
    }
    else
    {
        import std.utf : byDchar;

        // narrow encoding: decode first, then convert
        return asUpperCase(str.byDchar);
    }
}
9216 
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    auto shouted = "hEllo".asUpperCase;
    assert(equal(shouted, "HELLO"));
}
9224 
// explicitly undocumented
// overload for types that implicitly convert to a string
auto asLowerCase(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;
    alias Str = StringTypeOf!Range;
    return asLowerCase!Str(str);
}
9232 
// explicitly undocumented
// overload for types that implicitly convert to a string
auto asUpperCase(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;
    alias Str = StringTypeOf!Range;
    return asUpperCase!Str(str);
}
9240 
@safe unittest
{
    // a non-copyable type that converts to string through alias this
    static struct TestAliasedString
    {
        string get() @safe @nogc pure nothrow { return _s; }
        alias get this;
        @disable this(this);
        string _s;
    }

    // true when `func` gives the same answer for the wrapper and for the
    // plain string
    static bool testAliasedString(alias func, Args...)(string s, Args args)
    {
        import std.algorithm.comparison : equal;
        auto viaWrapper = func(TestAliasedString(s), args);
        auto viaString = func(s, args);
        // ranges are compared by content rather than by object identity
        static if (is(typeof(equal(viaWrapper, viaString))))
            return equal(viaWrapper, viaString);
        else
            return viaWrapper == viaString;
    }

    assert(testAliasedString!asLowerCase("hEllo"));
    assert(testAliasedString!asUpperCase("hEllo"));
    assert(testAliasedString!asCapitalized("hEllo"));
}
9270 
@safe unittest
{
    import std.array : array;

    // a saved copy must iterate independently of the original
    auto a = "HELLo".asLowerCase;
    auto savea = a.save;
    auto s = a.array;
    assert(s == "hello");
    s = savea.array;
    assert(s == "hello");

    string[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
    string[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];

    // the lazy ranges must agree with the eager toUpper/toLower
    foreach (i, slwr; lower)
    {
        import std.utf : byChar;

        auto sx = slwr.asUpperCase.byChar.array;
        assert(sx == toUpper(slwr));
        auto sy = upper[i].asLowerCase.byChar.array;
        assert(sy == toLower(upper[i]));
    }

    // Not necessary to call r.front
    for (auto r = lower[3].asUpperCase; !r.empty; r.popFront())
    {
    }

    import std.algorithm.comparison : equal;

    // UTF-16 and UTF-32 input; the results used to be computed and then
    // discarded, so these checks could never fail
    assert("HELLo"w.asLowerCase.equal("hello"d));
    assert("HELLo"w.asUpperCase.equal("HELLO"d));
    assert("HELLo"d.asLowerCase.equal("hello"d));
    assert("HELLo"d.asUpperCase.equal("HELLO"d));

    import std.utf : byChar;
    assert(toLower("\u1Fe2") == asLowerCase("\u1Fe2").byChar.array);
}
9310 
// Generic lazy capitalizer on a whole range: the first code point goes
// through the upper case tables given as template arguments, everything
// after it is delegated to asLowerCase. Returns a range of dchar.
private auto toCapitalizer(alias indexFnUpper, uint maxIdxUpper, alias tableFnUpper,
                           Range)(Range str)
    // Accept range of dchar's
if (isInputRange!Range &&
    isSomeChar!(ElementEncodingType!Range) &&
    ElementEncodingType!Range.sizeof == dchar.sizeof)
{
    static struct ToCapitalizerImpl
    {
        @property bool empty()
        {
            if (pastFirst)
                return rest.empty;
            return pending == 0 && source.empty;
        }

        @property auto front()
        {
            if (pastFirst)
                return rest.front;

            // the first code point is already converted: serve it
            if (pending != 0)
                return staged[pending - 1];

            immutable dchar c = source.front;
            const idx = indexFnUpper(c);
            if (idx == ushort.max || idx < maxIdxUpper)
            {
                // no mapping at all, or a 1:1 mapping
                staged[0] = (idx == ushort.max) ? c : tableFnUpper(idx);
                pending = 1;
            }
            else
            {
                // 1:m mapping: the first entry packs the sequence
                // length (top 8 bits) with the first code point
                immutable packed = tableFnUpper(idx);
                uint count = packed >> 24;
                if (count == 0)
                    count = 1;
                assert(count <= staged.length);
                // staged is consumed from the highest index down, so
                // the sequence is stored in reverse
                staged[count - 1] = cast(dchar)(packed & 0xFF_FFFF);
                foreach (k; 1 .. count)
                    staged[count - k - 1] = tableFnUpper(idx + k);
                pending = count;
            }
            return staged[pending - 1];
        }

        void popFront()
        {
            if (pastFirst)
            {
                rest.popFront();
                return;
            }
            // make sure the first code point has been converted
            if (pending == 0)
                front;
            assert(pending);
            --pending;
            if (pending == 0)
            {
                // the capital is fully consumed: hand the remainder of the
                // source over to the lower casing range
                source.popFront();
                rest = source.asLowerCase();
                pastFirst = true;
            }
        }

        static if (isForwardRange!Range)
        {
            @property auto save()
            {
                auto copy = this;
                copy.source = source.save;
                copy.rest = rest.save;
                return copy;
            }
        }

      private:
        Range source;
        typeof(source.asLowerCase) rest; // range representing the lower case rest of string
        bool pastFirst = false; // false for first character, true for rest of string
        dchar[3] staged = void; // conversion of the first code point, in reverse
        uint pending = 0;       // number of staged code points left to yield
    }

    return ToCapitalizerImpl(str);
}
9401 
/*********************
 * Lazily capitalize an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 * or string, meaning convert the first
 * character to upper case and subsequent characters to lower case.
 *
 * Does not allocate memory.
 * Characters in UTF-8 or UTF-16 format that cannot be decoded
 * are treated as $(REF replacementDchar, std,utf).
 *
 * Params:
 *      str = string or range of characters
 *
 * Returns:
 *      an InputRange of dchars
 *
 * See_Also:
 *      $(LREF toUpper), $(LREF toLower)
 *      $(LREF asUpperCase), $(LREF asLowerCase)
 */

auto asCapitalized(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    static if (ElementEncodingType!Range.sizeof >= dchar.sizeof)
    {
        // already a range of dchar: capitalize directly
        return toCapitalizer!UpperTriple(str);
    }
    else
    {
        import std.utf : byDchar;

        // narrow encoding: decode first, then capitalize
        return toCapitalizer!UpperTriple(str.byDchar);
    }
}
9438 
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    auto capitalized = "hEllo".asCapitalized;
    assert(equal(capitalized, "Hello"));
}
9446 
// overload for types that implicitly convert to a string
auto asCapitalized(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;
    alias Str = StringTypeOf!Range;
    return asCapitalized!Str(str);
}
9453 
// reading the first element must work in @safe pure nothrow @nogc code
@safe pure nothrow @nogc unittest
{
    auto capitalized = asCapitalized("hEllo");
    assert(capitalized.front == 'H');
}
9459 
@safe unittest
{
    import std.array : array;

    // a saved copy must iterate independently of the original
    auto a = "hELLo".asCapitalized;
    auto savea = a.save;
    auto s = a.array;
    assert(s == "Hello");
    s = savea.array;
    assert(s == "Hello");

    // pairs of [input, expected capitalization]
    string[2][] cases =
    [
        ["", ""],
        ["h", "H"],
        ["H", "H"],
        ["3", "3"],
        ["123", "123"],
        ["h123A", "H123a"],
        ["феж", "Феж"],
        ["\u1Fe2", "\u03a5\u0308\u0300"],
    ];

    foreach (i; 0 .. cases.length)
    {
        import std.utf : byChar;

        auto r = cases[i][0].asCapitalized.byChar.array;
        auto result = cases[i][1];
        assert(r == result);
    }

    // Don't call r.front
    for (auto r = "\u1Fe2".asCapitalized; !r.empty; r.popFront())
    {
    }

    import std.algorithm.comparison : equal;

    // UTF-16 and UTF-32 input; the results used to be computed and then
    // discarded, so these checks could never fail
    assert("HELLo"w.asCapitalized.equal("Hello"d));
    assert("hElLO"w.asCapitalized.equal("Hello"d));
    assert("hello"d.asCapitalized.equal("Hello"d));
    assert("HELLO"d.asCapitalized.equal("Hello"d));

    import std.utf : byChar;
    assert(asCapitalized("\u0130").byChar.array == asUpperCase("\u0130").byChar.array);
}
9507 
// TODO: helper, I wish std.utf was more flexible (and straight)
// Writes `c` as UTF-8 into `buf` starting at index `idx` and returns the
// index just past the last code unit written.
// Values above 0x10FFFF hit assert(0).
private size_t encodeTo(scope char[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc
{
    // ASCII takes a single code unit
    if (c <= 0x7F)
    {
        buf[idx] = cast(char) c;
        return idx + 1;
    }

    uint trail;  // number of continuation bytes
    ubyte lead;  // marker bits of the leading byte
    if (c <= 0x7FF)
    {
        trail = 1;
        lead = 0xC0;
    }
    else if (c <= 0xFFFF)
    {
        trail = 2;
        lead = 0xE0;
    }
    else if (c <= 0x10FFFF)
    {
        trail = 3;
        lead = 0xF0;
    }
    else
        assert(0);

    // the leading byte carries the topmost bits, every continuation byte
    // carries the next 6 bits, most significant group first
    buf[idx] = cast(char)(lead | (c >> (6 * trail)));
    foreach (k; 1 .. trail + 1)
        buf[idx + k] = cast(char)(0x80 | ((c >> (6 * (trail - k))) & 0x3F));
    return idx + trail + 1;
}
9541 
@safe unittest
{
    char[] buffer = "abcd".dup;
    // an ASCII code point overwrites one code unit
    size_t next = encodeTo(buffer, 0, 'X');
    assert(buffer == "Xbcd");

    // U+00A9 overwrites the next two code units
    next = encodeTo(buffer, next, cast(dchar)'\u00A9');
    assert(buffer == "X\xC2\xA9d");
}
9552 
// TODO: helper, I wish std.utf was more flexible (and straight)
// Writes `c` as UTF-16 into `buf` starting at index `idx` and returns the
// index just past the last code unit written.
// Throws UTFException for a code point in the surrogate range
// 0xD800 .. 0xDFFF; values above 0x10FFFF hit assert(0).
private size_t encodeTo(scope wchar[] buf, size_t idx, dchar c) @trusted pure
{
    import std.utf : UTFException;

    if (c > 0x10FFFF)
        assert(0);

    if (c > 0xFFFF)
    {
        // outside of the BMP: split the 20-bit offset into a surrogate pair
        immutable uint offset = c - 0x10000;
        buf[idx] = cast(wchar)(((offset >> 10) & 0x3FF) + 0xD800);
        buf[idx+1] = cast(wchar)((offset & 0x3FF) + 0xDC00);
        return idx + 2;
    }

    if (c >= 0xD800 && c <= 0xDFFF)
        throw (new UTFException("Encoding an isolated surrogate code point in UTF-16")).setSequence(c);
    buf[idx] = cast(wchar) c;
    return idx + 1;
}
9574 
// UTF-32 variant: stores `c` at buf[idx] and returns the index that follows it.
private size_t encodeTo(scope dchar[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc
{
    buf[idx] = c;
    return idx + 1;
}
9581 
// Case-converts `s` inside its own buffer for as long as the converted text
// fits in the space already consumed; on return `s` is shrunk to the
// converted length. As soon as a converted code point does not fit, or a
// 1:m mapping shows up, the rest of the work is handed to toCaseInPlaceAlloc.
private void toCaseInPlace(alias indexFn, uint maxIdx, alias tableFn, C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar)  || is(C == dchar))
{
    import std.utf : decode, codeLength;
    alias slowToCase = toCaseInPlaceAlloc!(indexFn, maxIdx, tableFn);

    size_t readPos = 0;      // next code unit to decode
    size_t writePos = 0;     // next code unit to write
    size_t pendingFrom = 0;  // start of the run of unchanged text not yet moved

    // in-buffer move of code units [from, to) down to index `dest`,
    // returns the index that follows the moved text;
    // the trick is that it may not need to copy at all
    static size_t moveTo(C[] str, size_t dest, size_t from, size_t to)
    {
        // Interestingly we may just bump the index for a while,
        // then have to copy if a re-cased char was smaller than the original;
        // later we may regain pace with a char that got bigger.
        // In the end it sometimes flip-flops between the 2 cases below
        if (dest != from)
        {
            // got to copy
            foreach (C unit; str[from .. to])
                str[dest++] = unit;
            return dest;
        }
        return to;
    }

    while (readPos != s.length)
    {
        immutable size_t charStart = readPos;
        immutable ch = decode(s, readPos);
        // TODO: special case for ASCII
        immutable caseIndex = indexFn(ch);
        if (caseIndex == ushort.max) // unchanged, skip over
            continue;

        // bring the unchanged text seen since the last conversion into place
        writePos = moveTo(s, writePos, pendingFrom, charStart);
        pendingFrom = readPos;

        // 1:m codepoint mapping, slow codepath
        if (caseIndex >= maxIdx)
            return slowToCase(s, charStart, writePos);

        // 1:1 codepoint mapping
        immutable cased = tableFn(caseIndex);
        // no place to fit the cased char: switch to the slow codepath,
        // where we allocate
        if (codeLength!C(cased) + writePos > readPos)
            return slowToCase(s, charStart, writePos);
        writePos = encodeTo(s, writePos, cased);
        assert(writePos <= readPos);
    }
    // move the trailing run of unchanged text, if any
    if (pendingFrom != s.length)
        writePos = moveTo(s, writePos, pendingFrom, s.length);
    s = s[0 .. writePos];
}
9647 
// helper to precalculate the size (in code units of C) that `str` will
// have once it is case-converted
private template toCaseLength(alias indexFn, uint maxIdx, alias tableFn)
{
    size_t toCaseLength(C)(const scope C[] str)
    {
        import std.utf : decode, codeLength;
        size_t total = 0;       // converted length accumulated so far
        size_t accounted = 0;   // input before this index is already counted
        size_t pos = 0;
        while (pos != str.length)
        {
            immutable begin = pos;
            immutable ch = decode(str, pos);
            immutable ushort caseIndex = indexFn(ch);
            // unchanged code points are counted in bulk later on
            if (caseIndex == ushort.max)
                continue;

            // the unchanged run that precedes this code point
            total += begin - accounted;
            accounted = pos;

            if (caseIndex < maxIdx)
            {
                // 1:1 mapping
                total += codeLength!C(tableFn(caseIndex));
            }
            else
            {
                // 1:m mapping: the first entry packs the sequence length
                // (top 8 bits) with the first code point
                immutable val = tableFn(caseIndex);
                immutable len = val >> 24;
                immutable dchar first = val & 0xFF_FFFF;
                total += codeLength!C(first);
                foreach (j; caseIndex+1 .. caseIndex+len)
                    total += codeLength!C(tableFn(j));
            }
        }
        // the trailing unchanged run (adds nothing when there is none)
        return total + (str.length - accounted);
    }
}
9688 
@safe unittest
{
    alias lowerLen = toCaseLength!(LowerTriple);

    // pure ASCII: one code unit per character
    assert(lowerLen("abcd") == 4);
    // five 2-byte Cyrillic letters followed by three ASCII digits
    assert(lowerLen("аБВгд456") == 13);
}
9695 
// Slower code path: allocates a buffer of the exact final size up front and
// then copies the case-converted stuff into the new string.
//
// The first `destIdx` code units of `s` are carried over unchanged and
// conversion resumes at `curIdx`. On return `s` refers to the new buffer.
private template toCaseInPlaceAlloc(alias indexFn, uint maxIdx, alias tableFn)
{
    void toCaseInPlaceAlloc(C)(ref C[] s, size_t curIdx,
        size_t destIdx) @trusted pure
        if (is(C == char) || is(C == wchar) || is(C == dchar))
    {
        import std.utf : decode;
        alias convertedLength = toCaseLength!(indexFn, maxIdx, tableFn);

        auto result = new C[destIdx + convertedLength(s[curIdx .. $])];
        result[0 .. destIdx] = s[0 .. destIdx];

        size_t runStart = curIdx; // first source code unit not yet copied over
        while (curIdx != s.length)
        {
            immutable cpStart = curIdx; // start of current codepoint
            immutable cp = decode(s, curIdx);
            immutable idx = indexFn(cp);
            if (idx == ushort.max)
                continue; // unchanged, stays part of the pending run

            // copy the unchanged run preceding this code point
            immutable runLen = cpStart - runStart;
            result[destIdx .. destIdx + runLen] = s[runStart .. cpStart];
            destIdx += runLen;
            runStart = curIdx;

            if (idx < maxIdx)
            {
                // 1:1 codepoint mapping
                destIdx = encodeTo(result, destIdx, tableFn(idx));
            }
            else
            {
                // 1:m codepoint mapping: first entry packs length + codepoint
                auto packed = tableFn(idx);
                immutable uint count = packed >> 24;
                destIdx = encodeTo(result, destIdx, cast(dchar)(packed & 0xFF_FFFF));
                foreach (k; idx + 1 .. idx + count)
                    destIdx = encodeTo(result, destIdx, tableFn(k));
            }
        }

        // trailing unchanged run, if any
        if (runStart != s.length)
        {
            immutable tailLen = s.length - runStart;
            result[destIdx .. destIdx + tailLen] = s[runStart .. $];
            destIdx += tailLen;
        }
        assert(result.length == destIdx);
        s = result;
    }
}
9752 
/++
    Converts `s` to lowercase in place, applying the Unicode lowercase mapping.

    For a few characters the string length may increase after the
    transformation; in such a case the function reallocates exactly once.
    If `s` does not contain any uppercase characters, `s` is left unaltered.
+/
void toLowerInPlace(C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    s.toCaseInPlace!(LowerTriple)();
}

// non-template overloads for the most common cases to reduce compile time
@safe pure /*TODO nothrow*/
{
    void toLowerInPlace(ref char[] s)
    {
        s.toLowerInPlace!char();
    }

    void toLowerInPlace(ref wchar[] s)
    {
        s.toLowerInPlace!wchar();
    }

    void toLowerInPlace(ref dchar[] s)
    {
        s.toLowerInPlace!dchar();
    }
}
9774 
/++
    Converts `s` to uppercase in place, applying the Unicode uppercase mapping.

    For a few characters the string length may increase after the
    transformation; in such a case the function reallocates exactly once.
    If `s` does not contain any lowercase characters, `s` is left unaltered.
+/
void toUpperInPlace(C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    s.toCaseInPlace!(UpperTriple)();
}

// non-template overloads for the most common cases to reduce compile time/code size
@safe pure /*TODO nothrow*/
{
    void toUpperInPlace(ref char[] s)
    {
        s.toUpperInPlace!char();
    }

    void toUpperInPlace(ref wchar[] s)
    {
        s.toUpperInPlace!wchar();
    }

    void toUpperInPlace(ref dchar[] s)
    {
        s.toUpperInPlace!dchar();
    }
}
9796 
/++
    Returns the lowercase equivalent of `c` if `c` is a Unicode uppercase
    $(CHARACTER); otherwise returns `c` itself.

    Warning: certain alphabets like German and Greek have no 1:1
    upper-lower mapping. Use the overload of toLower which takes a full
    string instead.
+/
@safe pure nothrow @nogc
dchar toLower(dchar c)
{
    if (c >= 0xAA)
    {
        // table lookup; ushort.max marks a code point without a simple mapping
        immutable size_t idx = toLowerSimpleIndex(c);
        if (idx == ushort.max)
            return c;
        return toLowerTab(idx);
    }

    // fast path below U+00AA: only 'A' .. 'Z' are changed
    if ('A' <= c && c <= 'Z')
        return c + 32;
    return c;
}
9823 
/++
    Creates a new array which is identical to `s` except that all of its
    characters are converted to lowercase (by performing Unicode lowercase
    mapping). If none of the characters of `s` were affected, then `s` itself
    is returned, provided `s` is a `string`-like type.

    Params:
        s = A $(REF_ALTTEXT random access range, isRandomAccessRange, std,range,primitives)
        of characters
    Returns:
        An array with the same element type as `s`.
+/
ElementEncodingType!S[] toLower(S)(S s)
if (isSomeString!S || (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    static import std.ascii;

    static if (!isSomeString!S)
    {
        // generic range: attributes are inferred from the range primitives
        return toCase!(LowerTriple, std.ascii.toLower)(s);
    }
    else
    {
        // string-like input: the conversion is invoked through a trusted lambda
        return (() @trusted => toCase!(LowerTriple, std.ascii.toLower)(s))();
    }
}
9846 
// non-template overloads for the most common cases to reduce compile time
@safe pure /*TODO nothrow*/
{
    string toLower(string s)
    {
        return s.toLower!string();
    }

    wstring toLower(wstring s)
    {
        return s.toLower!wstring();
    }

    dstring toLower(dstring s)
    {
        return s.toLower!dstring();
    }

    @safe unittest
    {
        // https://issues.dlang.org/show_bug.cgi?id=16663
        // compile-only check: a struct that aliases itself to `string`
        // must be accepted by toLower

        static struct String
        {
            string data;
            alias data this;
        }

        void instantiate()
        {
            auto lowered = toLower(String(""));
        }
    }
}
9873 
9874 
@safe unittest
{
    static import std.ascii;
    import std.format : format;

    // the ASCII range must agree with std.ascii
    foreach (code; 0 .. 0x80)
        assert(toLower(code) == std.ascii.toLower(code));

    assert(toLower('Я') == 'я');
    assert(toLower('Δ') == 'δ');

    // every uppercase code point either stays put or maps to a lowercase one
    foreach (upper; unicode.upperCase.byCodepoint)
    {
        dchar lowered = upper.toLower();
        assert(lowered == upper || isLower(lowered), format("%s -> %s", upper, lowered));
    }

    assert(toLower("АЯ") == "ая");

    // capital sharp s lowers to sharp s; sharp s uppercases to two letters
    assert("\u1E9E".toLower == "\u00df");
    assert("\u00df".toUpper == "SS");
}
9893 
// https://issues.dlang.org/show_bug.cgi?id=9629
// converting a slice in place must write through to the parent buffer
@safe unittest
{
    wchar[] buffer = "hello þ world"w.dup;
    auto thorn = buffer[6 .. 7];
    thorn.toUpperInPlace();
    assert(buffer == "hello Þ world");
}
9902 
9903 
@safe unittest
{
    import std.algorithm.comparison : cmp;

    // plain ASCII
    string src = "FoL";
    string lowered = toLower(src);
    assert(cmp(lowered, "fol") == 0, lowered);
    assert(lowered != src);

    char[] buf = src.dup;
    toLowerInPlace(buf);
    assert(buf == lowered);

    // Latin Extended-A mixed with ASCII
    src = "A\u0100B\u0101d";
    lowered = toLower(src);
    buf = src.dup;
    assert(cmp(lowered, "a\u0101b\u0101d") == 0);
    assert(lowered !is src);
    toLowerInPlace(buf);
    assert(buf == lowered);

    // Cyrillic omega mixed with ASCII
    src = "A\u0460B\u0461d";
    lowered = toLower(src);
    buf = src.dup;
    assert(cmp(lowered, "a\u0461b\u0461d") == 0);
    assert(lowered !is src);
    toLowerInPlace(buf);
    assert(buf == lowered);

    // 1:m mapping: one code point lowers to two
    src = "\u0130";
    lowered = toLower(src);
    buf = src.dup;
    assert(lowered == "i\u0307");
    assert(lowered !is src);
    toLowerInPlace(buf);
    assert(buf == lowered);

    // Test on wchar and dchar strings.
    assert(toLower("Some String"w) == "some string"w);
    assert(toLower("Some String"d) == "some string"d);

    // https://issues.dlang.org/show_bug.cgi?id=12455
    dchar ch = 'İ'; // '\U0130' LATIN CAPITAL LETTER I WITH DOT ABOVE
    assert(isUpper(ch));
    assert(toLower(ch) == 'i');
    // extends on https://issues.dlang.org/show_bug.cgi?id=12455 report
    // check simple-case toUpper too
    ch = '\u1f87';
    assert(isLower(ch));
    assert(toUpper(ch) == '\u1F8F');
}
9954 
// toLower on random access ranges of code units
@safe pure unittest
{
    import std.algorithm.comparison : cmp, equal;
    import std.utf : byCodeUnit;

    auto ascii = "FoL".byCodeUnit;
    assert(cmp(ascii.toLower, "fol") == 0);

    auto mixed = "A\u0460B\u0461d".byCodeUnit;
    assert(cmp(mixed.toLower, "a\u0461b\u0461d") == 0);
}
9964 
/++
    Returns the uppercase equivalent of `c` if `c` is a Unicode lowercase
    $(CHARACTER); otherwise returns `c` itself.

    Warning:
    Certain alphabets like German and Greek have no 1:1
    upper-lower mapping. Use the overload of toUpper which takes a full
    string instead.

    toUpper can be used as an argument to $(REF map, std,algorithm,iteration)
    to produce an algorithm that can convert a range of characters to upper case
    without allocating memory.
    A string can then be produced by using $(REF copy, std,algorithm,mutation)
    to send it to an $(REF appender, std,array).
+/
@safe pure nothrow @nogc
dchar toUpper(dchar c)
{
    if (c >= 0xAA)
    {
        // table lookup; ushort.max marks a code point without a simple mapping
        immutable size_t idx = toUpperSimpleIndex(c);
        if (idx == ushort.max)
            return c;
        return toUpperTab(idx);
    }

    // fast path below U+00AA: only 'a' .. 'z' are changed
    if ('a' <= c && c <= 'z')
        return c - 32;
    return c;
}
9998 
///
@safe unittest
{
    import std.algorithm.iteration : map;
    import std.algorithm.mutation : copy;
    import std.array : appender;

    // map toUpper over the characters and collect the result in an appender
    auto abuf = appender!(char[])();
    "hello".map!toUpper.copy(abuf);
    assert(abuf.data == "HELLO");
}
10010 
@safe unittest
{
    static import std.ascii;
    import std.format : format;

    // the ASCII range must agree with std.ascii
    foreach (code; 0 .. 0x80)
        assert(toUpper(code) == std.ascii.toUpper(code));

    assert(toUpper('я') == 'Я');
    assert(toUpper('δ') == 'Δ');

    // every lowercase code point stays put or maps to an upper/titlecase one
    auto titlecase = unicode.Titlecase_Letter;
    foreach (lower; unicode.lowerCase.byCodepoint)
    {
        dchar raised = lower.toUpper();
        assert(raised == lower || isUpper(raised) || titlecase[raised],
            format("%x -> %x", lower, raised));
    }
}
10027 
/++
    Allocates a new array which is identical to `s` except that all of its
    characters are converted to uppercase (by performing Unicode uppercase
    mapping). If none of the characters of `s` were affected, then `s` itself
    is returned, provided `s` is a `string`-like type.

    Params:
        s = A $(REF_ALTTEXT random access range, isRandomAccessRange, std,range,primitives)
        of characters
    Returns:
        A new array with the same element type as `s`.
+/
ElementEncodingType!S[] toUpper(S)(S s)
if (isSomeString!S || (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    static import std.ascii;

    static if (!isSomeString!S)
    {
        // generic range: attributes are inferred from the range primitives
        return toCase!(UpperTriple, std.ascii.toUpper)(s);
    }
    else
    {
        // string-like input: the conversion is invoked through a trusted lambda
        return (() @trusted => toCase!(UpperTriple, std.ascii.toUpper)(s))();
    }
}
10050 
// non-template overloads for the most common cases to reduce compile time
@safe pure /*TODO nothrow*/
{
    string toUpper(string s)
    {
        return s.toUpper!string();
    }

    wstring toUpper(wstring s)
    {
        return s.toUpper!wstring();
    }

    dstring toUpper(dstring s)
    {
        return s.toUpper!dstring();
    }

    @safe unittest
    {
        // https://issues.dlang.org/show_bug.cgi?id=16663
        // compile-only check: a struct that aliases itself to `string`
        // must be accepted by toUpper

        static struct String
        {
            string data;
            alias data this;
        }

        void instantiate()
        {
            auto raised = toUpper(String(""));
        }
    }
}
10077 
@safe unittest
{
    import std.algorithm.comparison : cmp;

    string src;
    string raised;
    char[] buf;

    // plain ASCII
    src = "FoL";
    raised = toUpper(src);
    buf = src.dup;
    toUpperInPlace(buf);
    assert(buf == raised, buf);
    assert(cmp(raised, "FOL") == 0);
    assert(raised !is src);

    // Latin Extended-A mixed with ASCII
    src = "a\u0100B\u0101d";
    raised = toUpper(src);
    buf = src.dup;
    toUpperInPlace(buf);
    assert(buf == raised);
    assert(cmp(raised, "A\u0100B\u0100D") == 0);
    assert(raised !is src);

    // Cyrillic omega mixed with ASCII
    src = "a\u0460B\u0461d";
    raised = toUpper(src);
    buf = src.dup;
    toUpperInPlace(buf);
    assert(buf == raised);
    assert(cmp(raised, "A\u0460B\u0460D") == 0);
    assert(raised !is src);
}
10106 
// Cross-checks the allocating (toLower/toUpper) and in-place
// (toLowerInPlace/toUpperInPlace) conversions against known results
// for all three string types.
@system unittest
{
    // Converts `s` both ways with both APIs and compares against the
    // expected uppercase (`trueUp`) and lowercase (`trueLow`) forms.
    static void doTest(C)(const(C)[] s, const(C)[] trueUp, const(C)[] trueLow)
    {
        import std.format : format;
        string diff = "src: %( %x %)\nres: %( %x %)\ntru: %( %x %)";
        auto low = s.toLower() , up = s.toUpper();
        auto lowInp = s.dup, upInp = s.dup;
        lowInp.toLowerInPlace();
        upInp.toUpperInPlace();
        assert(low == trueLow, format(diff, low, trueLow));
        assert(up == trueUp,  format(diff, up, trueUp));
        assert(lowInp == trueLow,
            format(diff, cast(ubyte[]) s, cast(ubyte[]) lowInp, cast(ubyte[]) trueLow));
        assert(upInp == trueUp,
            format(diff, cast(ubyte[]) s, cast(ubyte[]) upInp, cast(ubyte[]) trueUp));
    }
    static foreach (S; AliasSeq!(dstring, wstring, string))
    {{

        S easy = "123";
        S good = "abCФеж";
        S awful = "\u0131\u023f\u2126";
        S wicked = "\u0130\u1FE2";
        auto options = [easy, good, awful, wicked];
        // expected results, index-aligned with `options`
        S[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
        S[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];

        // for these inputs the in-place conversions must not reallocate:
        // the slice has to stay identical (`is`) to the original one
        foreach (val; [easy, good])
        {
            auto e = val.dup;
            auto g = e;
            e.toUpperInPlace();
            assert(e is g);
            e.toLowerInPlace();
            assert(e is g);
        }
        foreach (i, v; options)
        {
            doTest(v, upper[i], lower[i]);
        }

        // a few combinatorial runs
        // (every i <= j <= k triple, concatenated in both orders)
        foreach (i; 0 .. options.length)
        foreach (j; i .. options.length)
        foreach (k; j .. options.length)
        {
            auto sample = options[i] ~ options[j] ~ options[k];
            auto sample2 = options[k] ~ options[j] ~ options[i];
            doTest(sample, upper[i] ~ upper[j] ~ upper[k],
                lower[i] ~ lower[j] ~ lower[k]);
            doTest(sample2, upper[k] ~ upper[j] ~ upper[i],
                lower[k] ~ lower[j] ~ lower[i]);
        }
    }}
}
10163 
// toUpper on random access ranges of code units
@safe pure unittest
{
    import std.algorithm.comparison : cmp;
    import std.utf : byCodeUnit;

    auto ascii = "FoL".byCodeUnit;
    assert(cmp(ascii.toUpper, "FOL") == 0);

    auto mixed = "a\u0460B\u0461d".byCodeUnit;
    assert(cmp(mixed.toUpper, "A\u0460B\u0460D") == 0);
}
10174 
/++
    Returns whether `c` is a Unicode alphabetic $(CHARACTER)
    (Unicode property: Alphabetic).
+/
@safe pure nothrow @nogc
bool isAlpha(dchar c)
{
    // everything from U+00AA upwards is answered by the trie
    if (c >= 0xAA)
        return alphaTrie[c];

    // optimization: below U+00AA only the ASCII letters qualify
    return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z');
}
10199 
@safe unittest
{
    auto alphabetic = unicode("Alphabetic");

    // every member of the set is reported as alphabetic
    foreach (cp; alphabetic.byCodepoint)
        assert(isAlpha(cp));

    // and set membership agrees with isAlpha on the low code points
    foreach (cp; 0 .. 0x4000)
        assert(isAlpha(cp) == (cp in alphabetic));
}
10208 
10209 
/++
    Returns whether `c` is a Unicode mark
    (general Unicode category: Mn, Me, Mc).
+/
@safe pure nothrow @nogc
bool isMark(dchar c)
{
    immutable bool inSet = markTrie[c];
    return inSet;
}
10219 
@safe unittest
{
    auto marks = unicode("Mark");

    // every member of the set is reported as a mark
    foreach (cp; marks.byCodepoint)
        assert(isMark(cp));

    // and set membership agrees with isMark on the low code points
    foreach (cp; 0 .. 0x4000)
        assert(isMark(cp) == (cp in marks));
}
10228 
/++
    Returns whether `c` is a Unicode numerical $(CHARACTER)
    (general Unicode category: Nd, Nl, No).
+/
@safe pure nothrow @nogc
bool isNumber(dchar c)
{
    // non-ASCII code points are answered by the trie
    if (c > 0x7F)
        return numberTrie[c];

    // optimization for ascii case: only the decimal digits qualify
    return '0' <= c && c <= '9';
}
10246 
@safe unittest
{
    auto numbers = unicode("N");

    // every member of the set is reported as a number
    foreach (cp; numbers.byCodepoint)
        assert(isNumber(cp));

    // and set membership agrees with isNumber on the low code points
    foreach (cp; 0 .. 0x4000)
        assert(isNumber(cp) == (cp in numbers));
}
10255 
/++
    Returns whether `c` is a Unicode alphabetic $(CHARACTER) or number
    (Alphabetic, Nd, Nl, No).

    Params:
        c = any Unicode character
    Returns:
        `true` if the character is Alphabetic or belongs to one of the
        Nd, Nl, or No Unicode categories
+/
@safe pure nothrow @nogc
bool isAlphaNum(dchar c)
{
    static import std.ascii;

    // ASCII is delegated to std.ascii as an optimization;
    // everything else combines the two Unicode predicates
    return std.ascii.isASCII(c)
        ? std.ascii.isAlphaNum(c)
        : (isAlpha(c) || isNumber(c));
}
10281 
@safe unittest
{
    auto numbers = unicode("N");
    auto alphabetic = unicode("Alphabetic");

    // members of either set are alphanumeric
    foreach (cp; numbers.byCodepoint)
        assert(isAlphaNum(cp));
    foreach (cp; alphabetic.byCodepoint)
        assert(isAlphaNum(cp));

    // and the union of both sets agrees with isAlphaNum on the low code points
    foreach (cp; 0 .. 0x4000)
        assert(isAlphaNum(cp) == ((cp in numbers) || (cp in alphabetic)));
}
10298 
/++
    Returns whether `c` is a Unicode punctuation $(CHARACTER)
    (general Unicode category: Pd, Ps, Pe, Pc, Po, Pi, Pf).
+/
@safe pure nothrow @nogc
bool isPunctuation(dchar c)
{
    static import std.ascii;

    // non-ASCII code points are answered by the trie
    if (c > 0x7F)
        return punctuationTrie[c];

    // optimization for ascii case
    return std.ascii.isPunctuation(c);
}
10318 
@safe unittest
{
    // a handful of ASCII and Latin-1 punctuation marks
    foreach (dchar mark; ['\u0021', '\u0028', '\u0029', '\u002D',
                          '\u005F', '\u00AB', '\u00BB'])
        assert(isPunctuation(mark));

    // every member of the P set is reported as punctuation
    foreach (cp; unicode("P").byCodepoint)
        assert(isPunctuation(cp));
}
10331 
/++
    Returns whether `c` is a Unicode symbol $(CHARACTER)
    (general Unicode category: Sm, Sc, Sk, So).
+/
@safe pure nothrow @nogc
bool isSymbol(dchar c)
{
    immutable bool inSet = symbolTrie[c];
    return inSet;
}
10341 
@safe unittest
{
    import std.format : format;

    // a handful of ASCII and Latin-1 symbols
    foreach (dchar sym; ['\u0024', '\u002B', '\u005E', '\u00A6'])
        assert(isSymbol(sym));

    // every member of the S set is reported as a symbol
    foreach (cp; unicode("S").byCodepoint)
        assert(isSymbol(cp), format("%04x", cp));
}
10352 
/++
    Returns whether `c` is a Unicode space $(CHARACTER)
    (general Unicode category: Zs).
    Note: This doesn't include '\n', '\r', '\t' and other non-space $(CHARACTER).
    For commonly used less strict semantics see $(LREF isWhite).
+/
@safe pure nothrow @nogc
bool isSpace(dchar c)
{
    import std.internal.unicode_tables : isSpaceGen; // generated file

    immutable bool result = isSpaceGen(c);
    return result;
}
10365 
@safe unittest
{
    assert(isSpace('\u0020'));

    auto spaces = unicode.Zs;

    // every member of Zs is reported as a space
    foreach (cp; spaces.byCodepoint)
        assert(isSpace(cp));

    // and set membership agrees with isSpace on the low code points
    foreach (cp; 0 .. 0x1000)
        assert(spaces[cp] == isSpace(cp));
}
10375 
10376 
/++
    Returns whether `c` is a Unicode graphical $(CHARACTER)
    (general Unicode category: L, M, N, P, S, Zs).
+/
@safe pure nothrow @nogc
bool isGraphical(dchar c)
{
    immutable bool inSet = graphicalTrie[c];
    return inSet;
}
10387 
10388 
@safe unittest
{
    import std.format : format;

    auto graphical = unicode("Graphical");

    // every member of the set is reported as graphical
    foreach (cp; graphical.byCodepoint)
        assert(isGraphical(cp), format("%4x", cp));

    // and set membership agrees with isGraphical on the low code points
    foreach (cp; 0 .. 0x4000)
        assert(isGraphical(cp) == (cp in graphical));
}
10398 
10399 
/++
    Returns whether `c` is a Unicode control $(CHARACTER)
    (general Unicode category: Cc).
+/
@safe pure nothrow @nogc
bool isControl(dchar c)
{
    import std.internal.unicode_tables : isControlGen; // generated file

    immutable bool result = isControlGen(c);
    return result;
}
10410 
@safe unittest
{
    // spot checks: C0 and C1 controls are included, U+0100 is not
    assert(isControl('\u0000'));
    assert(isControl('\u0081'));
    assert(!isControl('\u0100'));

    auto controls = unicode.Cc;

    // every member of Cc is reported as a control character
    foreach (cp; controls.byCodepoint)
        assert(isControl(cp));

    // and set membership agrees with isControl on the low code points
    foreach (cp; 0 .. 0x1000)
        assert(controls[cp] == isControl(cp));
}
10422 
10423 
/++
    Returns whether `c` is a Unicode formatting $(CHARACTER)
    (general Unicode category: Cf).
+/
@safe pure nothrow @nogc
bool isFormat(dchar c)
{
    import std.internal.unicode_tables : isFormatGen; // generated file

    immutable bool result = isFormatGen(c);
    return result;
}
10434 
10435 
@safe unittest
{
    // U+00AD must be reported as a format character
    assert(isFormat('\u00AD'));

    // every member of the Format set is reported as a format character
    foreach (cp; unicode("Format").byCodepoint)
        assert(isFormat(cp));
}
10442 
// The code point ranges for private use and surrogates are not likely to
// change in the near future; if need be, they can be generated from Unicode
// data as well.
10445 
/++
    Returns whether `c` is a Unicode Private Use $(CODEPOINT)
    (general Unicode category: Co).
+/
@safe pure nothrow @nogc
bool isPrivateUse(dchar c)
{
    // U+E000 .. U+F8FF
    if (0x00_E000 <= c && c <= 0x00_F8FF)
        return true;
    // U+F0000 .. U+FFFFD
    if (0x0F_0000 <= c && c <= 0x0F_FFFD)
        return true;
    // U+100000 .. U+10FFFD
    return 0x10_0000 <= c && c <= 0x10_FFFD;
}
10457 
/++
    Returns whether `c` is a Unicode surrogate $(CODEPOINT)
    (general Unicode category: Cs).
+/
@safe pure nothrow @nogc
bool isSurrogate(dchar c)
{
    // inside U+D800 .. U+DFFF, i.e. not outside of it
    return !(c < 0xD800 || c > 0xDFFF);
}
10467 
/++
    Returns whether `c` is a Unicode high surrogate (lead surrogate).
+/
@safe pure nothrow @nogc
bool isSurrogateHi(dchar c)
{
    // inside U+D800 .. U+DBFF, i.e. not outside of it
    return !(c < 0xD800 || c > 0xDBFF);
}
10476 
/++
    Returns whether `c` is a Unicode low surrogate (trail surrogate).
+/
@safe pure nothrow @nogc
bool isSurrogateLo(dchar c)
{
    // inside U+DC00 .. U+DFFF, i.e. not outside of it
    return !(c < 0xDC00 || c > 0xDFFF);
}
10485 
/++
    Returns whether `c` is a Unicode non-character, i.e.
    a $(CODEPOINT) with no assigned abstract character
    (general Unicode category: Cn).
+/
@safe pure nothrow @nogc
bool isNonCharacter(dchar c)
{
    immutable bool inSet = nonCharacterTrie[c];
    return inSet;
}
10496 
@safe unittest
{
    auto unassigned = unicode("Cn");

    // every member of Cn is reported as a non-character
    foreach (cp; unassigned.byCodepoint)
        assert(isNonCharacter(cp));
}
10503 
10504 private:
10505 // load static data from pre-generated tables into usable datastructures
10506 
10507 
// Builds a CodepointSet from the intervals produced by decompressIntervals
// for the given byte array.
@safe auto asSet(const (ubyte)[] compressed) pure
{
    return CodepointSet.fromIntervals(compressed.decompressIntervals);
}
10512 
// Wraps the offsets, sizes and data of a pre-generated trie entry
// in a const CodepointTrie.
@safe pure nothrow auto asTrie(T...)(const scope TrieEntry!T e)
{
    alias Trie = const(CodepointTrie!T);
    return Trie(e.offsets, e.sizes, e.data);
}
10517 
@safe pure nothrow @nogc @property
{
    import std.internal.unicode_tables; // generated file

    // It's important to use auto return here, so that the compiler
    // only runs semantic on the return type if the function gets
    // used. Also these are functions rather than templates to not
    // increase the object size of the caller.
    //
    // Every accessor follows the same shape: a function-local
    // `static immutable` trie built from the generated entries via asTrie.

    auto lowerCaseTrie()
    {
        static immutable trie = asTrie(lowerCaseTrieEntries);
        return trie;
    }

    auto upperCaseTrie()
    {
        static immutable trie = asTrie(upperCaseTrieEntries);
        return trie;
    }

    auto simpleCaseTrie()
    {
        static immutable trie = asTrie(simpleCaseTrieEntries);
        return trie;
    }

    auto fullCaseTrie()
    {
        static immutable trie = asTrie(fullCaseTrieEntries);
        return trie;
    }

    auto alphaTrie()
    {
        static immutable trie = asTrie(alphaTrieEntries);
        return trie;
    }

    auto markTrie()
    {
        static immutable trie = asTrie(markTrieEntries);
        return trie;
    }

    auto numberTrie()
    {
        static immutable trie = asTrie(numberTrieEntries);
        return trie;
    }

    auto punctuationTrie()
    {
        static immutable trie = asTrie(punctuationTrieEntries);
        return trie;
    }

    auto symbolTrie()
    {
        static immutable trie = asTrie(symbolTrieEntries);
        return trie;
    }

    auto graphicalTrie()
    {
        static immutable trie = asTrie(graphicalTrieEntries);
        return trie;
    }

    auto nonCharacterTrie()
    {
        static immutable trie = asTrie(nonCharacterTrieEntries);
        return trie;
    }

    // normalization quick-check tables
    auto nfcQCTrie()
    {
        import std.internal.unicode_norm : nfcQCTrieEntries;
        static immutable trie = asTrie(nfcQCTrieEntries);
        return trie;
    }

    auto nfdQCTrie()
    {
        import std.internal.unicode_norm : nfdQCTrieEntries;
        static immutable trie = asTrie(nfdQCTrieEntries);
        return trie;
    }

    auto nfkcQCTrie()
    {
        import std.internal.unicode_norm : nfkcQCTrieEntries;
        static immutable trie = asTrie(nfkcQCTrieEntries);
        return trie;
    }

    auto nfkdQCTrie()
    {
        import std.internal.unicode_norm : nfkdQCTrieEntries;
        static immutable trie = asTrie(nfkdQCTrieEntries);
        return trie;
    }

    // grapheme breaking algorithm tables
    auto mcTrie()
    {
        import std.internal.unicode_grapheme : mcTrieEntries;
        static immutable trie = asTrie(mcTrieEntries);
        return trie;
    }

    auto graphemeExtendTrie()
    {
        import std.internal.unicode_grapheme : graphemeExtendTrieEntries;
        static immutable trie = asTrie(graphemeExtendTrieEntries);
        return trie;
    }

    auto hangLV()
    {
        import std.internal.unicode_grapheme : hangulLVTrieEntries;
        static immutable trie = asTrie(hangulLVTrieEntries);
        return trie;
    }

    auto hangLVT()
    {
        import std.internal.unicode_grapheme : hangulLVTTrieEntries;
        static immutable trie = asTrie(hangulLVTTrieEntries);
        return trie;
    }

    // tables below are used for composition/decomposition
    auto combiningClassTrie()
    {
        import std.internal.unicode_comp : combiningClassTrieEntries;
        static immutable trie = asTrie(combiningClassTrieEntries);
        return trie;
    }

    auto compatMappingTrie()
    {
        import std.internal.unicode_decomp : compatMappingTrieEntries;
        static immutable trie = asTrie(compatMappingTrieEntries);
        return trie;
    }

    auto canonMappingTrie()
    {
        import std.internal.unicode_decomp : canonMappingTrieEntries;
        static immutable trie = asTrie(canonMappingTrieEntries);
        return trie;
    }

    auto compositionJumpTrie()
    {
        import std.internal.unicode_comp : compositionJumpTrieEntries;
        static immutable trie = asTrie(compositionJumpTrieEntries);
        return trie;
    }

    // case conversion tables
    auto toUpperIndexTrie()
    {
        static immutable trie = asTrie(toUpperIndexTrieEntries);
        return trie;
    }

    auto toLowerIndexTrie()
    {
        static immutable trie = asTrie(toLowerIndexTrieEntries);
        return trie;
    }

    auto toTitleIndexTrie()
    {
        static immutable trie = asTrie(toTitleIndexTrieEntries);
        return trie;
    }

    // simple case conversion tables
    auto toUpperSimpleIndexTrie()
    {
        static immutable trie = asTrie(toUpperSimpleIndexTrieEntries);
        return trie;
    }

    auto toLowerSimpleIndexTrie()
    {
        static immutable trie = asTrie(toLowerSimpleIndexTrieEntries);
        return trie;
    }

    auto toTitleSimpleIndexTrie()
    {
        static immutable trie = asTrie(toTitleSimpleIndexTrieEntries);
        return trie;
    }

}
10635 
10636 }// version (!std_uni_bootstrap)