1 // Written in the D programming language. 2 3 /++ 4 $(P The `std.uni` module provides an implementation 5 of fundamental Unicode algorithms and data structures. 6 This doesn't include UTF encoding and decoding primitives, 7 see $(REF decode, std,_utf) and $(REF encode, std,_utf) in $(MREF std, utf) 8 for this functionality. ) 9 10 $(SCRIPT inhibitQuickIndex = 1;) 11 $(DIVC quickindex, 12 $(BOOKTABLE, 13 $(TR $(TH Category) $(TH Functions)) 14 $(TR $(TD Decode) $(TD 15 $(LREF byCodePoint) 16 $(LREF byGrapheme) 17 $(LREF decodeGrapheme) 18 $(LREF graphemeStride) 19 )) 20 $(TR $(TD Comparison) $(TD 21 $(LREF icmp) 22 $(LREF sicmp) 23 )) 24 $(TR $(TD Classification) $(TD 25 $(LREF isAlpha) 26 $(LREF isAlphaNum) 27 $(LREF isCodepointSet) 28 $(LREF isControl) 29 $(LREF isFormat) 30 $(LREF isGraphical) 31 $(LREF isIntegralPair) 32 $(LREF isMark) 33 $(LREF isNonCharacter) 34 $(LREF isNumber) 35 $(LREF isPrivateUse) 36 $(LREF isPunctuation) 37 $(LREF isSpace) 38 $(LREF isSurrogate) 39 $(LREF isSurrogateHi) 40 $(LREF isSurrogateLo) 41 $(LREF isSymbol) 42 $(LREF isWhite) 43 )) 44 $(TR $(TD Normalization) $(TD 45 $(LREF NFC) 46 $(LREF NFD) 47 $(LREF NFKD) 48 $(LREF NormalizationForm) 49 $(LREF normalize) 50 )) 51 $(TR $(TD Decompose) $(TD 52 $(LREF decompose) 53 $(LREF decomposeHangul) 54 $(LREF UnicodeDecomposition) 55 )) 56 $(TR $(TD Compose) $(TD 57 $(LREF compose) 58 $(LREF composeJamo) 59 )) 60 $(TR $(TD Sets) $(TD 61 $(LREF CodepointInterval) 62 $(LREF CodepointSet) 63 $(LREF InversionList) 64 $(LREF unicode) 65 )) 66 $(TR $(TD Trie) $(TD 67 $(LREF codepointSetTrie) 68 $(LREF CodepointSetTrie) 69 $(LREF codepointTrie) 70 $(LREF CodepointTrie) 71 $(LREF toTrie) 72 $(LREF toDelegate) 73 )) 74 $(TR $(TD Casing) $(TD 75 $(LREF asCapitalized) 76 $(LREF asLowerCase) 77 $(LREF asUpperCase) 78 $(LREF isLower) 79 $(LREF isUpper) 80 $(LREF toLower) 81 $(LREF toLowerInPlace) 82 $(LREF toUpper) 83 $(LREF toUpperInPlace) 84 )) 85 $(TR $(TD Utf8Matcher) $(TD 86 $(LREF isUtfMatcher) 87 $(LREF MatcherConcept) 88 $(LREF utfMatcher) 89 )) 90 $(TR $(TD Separators) $(TD 91 $(LREF lineSep) 92 $(LREF nelSep) 93 $(LREF paraSep) 94 )) 95 $(TR $(TD Building blocks) $(TD 96 $(LREF allowedIn) 97 $(LREF combiningClass) 98 $(LREF Grapheme) 99 )) 100 )) 101 102 $(P All primitives listed operate on Unicode characters and 103 sets of characters. For functions which operate on ASCII characters 104 and ignore Unicode $(CHARACTERS), see $(MREF std, ascii). 105 For definitions of Unicode $(CHARACTER), $(CODEPOINT) and other terms 106 used throughout this module see the $(S_LINK Terminology, terminology) section 107 below. 108 ) 109 $(P The focus of this module is the core needs of developing Unicode-aware 110 applications. To that effect it provides the following optimized primitives: 111 ) 112 $(UL 113 $(LI Character classification by category and common properties: 114 $(LREF isAlpha), $(LREF isWhite) and others. 115 ) 116 $(LI 117 Case-insensitive string comparison ($(LREF sicmp), $(LREF icmp)). 118 ) 119 $(LI 120 Converting text to any of the four normalization forms via $(LREF normalize). 121 ) 122 $(LI 123 Decoding ($(LREF decodeGrapheme)) and iteration ($(LREF byGrapheme), $(LREF graphemeStride)) 124 by user-perceived characters, that is by $(LREF Grapheme) clusters. 125 ) 126 $(LI 127 Decomposing and composing of individual character(s) according to canonical 128 or compatibility rules, see $(LREF compose) and $(LREF decompose), 129 including the specific version for Hangul syllables $(LREF composeJamo) 130 and $(LREF decomposeHangul). 131 ) 132 ) 133 $(P It's recognized that an application may need further enhancements 134 and extensions, such as less commonly known algorithms, 135 or tailoring existing ones for region specific needs. To help users 136 with building any extra functionality beyond the core primitives, 137 the module provides: 138 ) 139 $(UL 140 $(LI 141 $(LREF CodepointSet), a type for easy manipulation of sets of characters. 142 Besides the typical set algebra it provides an unusual feature: 143 a D source code generator for detection of $(CODEPOINTS) in this set. 144 This is a boon for meta-programming parser frameworks, 145 and is used internally to power classification in small 146 sets like $(LREF isWhite). 147 ) 148 $(LI 149 A way to construct optimal packed multi-stage tables also known as a 150 special case of $(LINK2 https://en.wikipedia.org/wiki/Trie, Trie). 151 The functions $(LREF codepointTrie), $(LREF codepointSetTrie) 152 construct custom tries that map dchar to value. 153 The end result is a fast and predictable $(BIGOH 1) lookup that powers 154 functions like $(LREF isAlpha) and $(LREF combiningClass), 155 but for user-defined data sets. 156 ) 157 $(LI 158 A useful technique for Unicode-aware parsers that perform 159 character classification of encoded $(CODEPOINTS) 160 is to avoid unnecassary decoding at all costs. 161 $(LREF utfMatcher) provides an improvement over the usual workflow 162 of decode-classify-process, combining the decoding and classification 163 steps. By extracting necessary bits directly from encoded 164 $(S_LINK Code unit, code units) matchers achieve 165 significant performance improvements. See $(LREF MatcherConcept) for 166 the common interface of UTF matchers. 167 ) 168 $(LI 169 Generally useful building blocks for customized normalization: 170 $(LREF combiningClass) for querying combining class 171 and $(LREF allowedIn) for testing the Quick_Check 172 property of a given normalization form. 173 ) 174 $(LI 175 Access to a large selection of commonly used sets of $(CODEPOINTS). 176 $(S_LINK Unicode properties, Supported sets) include Script, 177 Block and General Category. The exact contents of a set can be 178 observed in the CLDR utility, on the 179 $(HTTP www.unicode.org/cldr/utility/properties.jsp, property index) page 180 of the Unicode website. 181 See $(LREF unicode) for easy and (optionally) compile-time checked set 182 queries. 183 ) 184 ) 185 $(SECTION Synopsis) 186 --- 187 import std.uni; 188 void main() 189 { 190 // initialize code point sets using script/block or property name 191 // now 'set' contains code points from both scripts. 192 auto set = unicode("Cyrillic") | unicode("Armenian"); 193 // same thing but simpler and checked at compile-time 194 auto ascii = unicode.ASCII; 195 auto currency = unicode.Currency_Symbol; 196 197 // easy set ops 198 auto a = set & ascii; 199 assert(a.empty); // as it has no intersection with ascii 200 a = set | ascii; 201 auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian 202 203 // some properties of code point sets 204 assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2 205 // testing presence of a code point in a set 206 // is just fine, it is O(logN) 207 assert(!b['$']); 208 assert(!b['\u058F']); // Armenian dram sign 209 assert(b['¥']); 210 211 // building fast lookup tables, these guarantee O(1) complexity 212 // 1-level Trie lookup table essentially a huge bit-set ~262Kb 213 auto oneTrie = toTrie!1(b); 214 // 2-level far more compact but typically slightly slower 215 auto twoTrie = toTrie!2(b); 216 // 3-level even smaller, and a bit slower yet 217 auto threeTrie = toTrie!3(b); 218 assert(oneTrie['£']); 219 assert(twoTrie['£']); 220 assert(threeTrie['£']); 221 222 // build the trie with the most sensible trie level 223 // and bind it as a functor 224 auto cyrillicOrArmenian = toDelegate(set); 225 auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!"); 226 assert(balance == "ընկեր!"); 227 // compatible with bool delegate(dchar) 228 bool delegate(dchar) bindIt = cyrillicOrArmenian; 229 230 // Normalization 231 string s = "Plain ascii (and not only), is always normalized!"; 232 assert(s is normalize(s));// is the same string 233 234 string nonS = "A\u0308ffin"; // A ligature 235 auto nS = normalize(nonS); // to NFC, the W3C endorsed standard 236 assert(nS == "Äffin"); 237 assert(nS != nonS); 238 string composed = "Äffin"; 239 240 assert(normalize!NFD(composed) == "A\u0308ffin"); 241 // to NFKD, compatibility decomposition useful for fuzzy matching/searching 242 assert(normalize!NFKD("2¹⁰") == "210"); 243 } 244 --- 245 $(SECTION Terminology) 246 $(P The following is a list of important Unicode notions 247 and definitions. Any conventions used specifically in this 248 module alone are marked as such. The descriptions are based on the formal 249 definition as found in $(HTTP www.unicode.org/versions/Unicode6.2.0/ch03.pdf, 250 chapter three of The Unicode Standard Core Specification.) 251 ) 252 $(P $(DEF Abstract character) A unit of information used for the organization, 253 control, or representation of textual data. 254 Note that: 255 $(UL 256 $(LI When representing data, the nature of that data 257 is generally symbolic as opposed to some other 258 kind of data (for example, visual). 259 ) 260 $(LI An abstract character has no concrete form 261 and should not be confused with a $(S_LINK Glyph, glyph). 262 ) 263 $(LI An abstract character does not necessarily 264 correspond to what a user thinks of as a “character” 265 and should not be confused with a $(LREF Grapheme). 266 ) 267 $(LI The abstract characters encoded (see Encoded character) 268 are known as Unicode abstract characters. 269 ) 270 $(LI Abstract characters not directly 271 encoded by the Unicode Standard can often be 272 represented by the use of combining character sequences. 273 ) 274 ) 275 ) 276 $(P $(DEF Canonical decomposition) 277 The decomposition of a character or character sequence 278 that results from recursively applying the canonical 279 mappings found in the Unicode Character Database 280 and these described in Conjoining Jamo Behavior 281 (section 12 of 282 $(HTTP www.unicode.org/uni2book/ch03.pdf, Unicode Conformance)). 283 ) 284 $(P $(DEF Canonical composition) 285 The precise definition of the Canonical composition 286 is the algorithm as specified in $(HTTP www.unicode.org/uni2book/ch03.pdf, 287 Unicode Conformance) section 11. 288 Informally it's the process that does the reverse of the canonical 289 decomposition with the addition of certain rules 290 that e.g. prevent legacy characters from appearing in the composed result. 291 ) 292 $(P $(DEF Canonical equivalent) 293 Two character sequences are said to be canonical equivalents if 294 their full canonical decompositions are identical. 295 ) 296 $(P $(DEF Character) Typically differs by context. 297 For the purpose of this documentation the term $(I character) 298 implies $(I encoded character), that is, a code point having 299 an assigned abstract character (a symbolic meaning). 300 ) 301 $(P $(DEF Code point) Any value in the Unicode codespace; 302 that is, the range of integers from 0 to 10FFFF (hex). 303 Not all code points are assigned to encoded characters. 304 ) 305 $(P $(DEF Code unit) The minimal bit combination that can represent 306 a unit of encoded text for processing or interchange. 307 Depending on the encoding this could be: 308 8-bit code units in the UTF-8 (`char`), 309 16-bit code units in the UTF-16 (`wchar`), 310 and 32-bit code units in the UTF-32 (`dchar`). 311 $(I Note that in UTF-32, a code unit is a code point 312 and is represented by the D `dchar` type.) 313 ) 314 $(P $(DEF Combining character) A character with the General Category 315 of Combining Mark(M). 316 $(UL 317 $(LI All characters with non-zero canonical combining class 318 are combining characters, but the reverse is not the case: 319 there are combining characters with a zero combining class. 320 ) 321 $(LI These characters are not normally used in isolation 322 unless they are being described. They include such characters 323 as accents, diacritics, Hebrew points, Arabic vowel signs, 324 and Indic matras. 325 ) 326 ) 327 ) 328 $(P $(DEF Combining class) 329 A numerical value used by the Unicode Canonical Ordering Algorithm 330 to determine which sequences of combining marks are to be 331 considered canonically equivalent and which are not. 332 ) 333 $(P $(DEF Compatibility decomposition) 334 The decomposition of a character or character sequence that results 335 from recursively applying both the compatibility mappings and 336 the canonical mappings found in the Unicode Character Database, and those 337 described in Conjoining Jamo Behavior no characters 338 can be further decomposed. 339 ) 340 $(P $(DEF Compatibility equivalent) 341 Two character sequences are said to be compatibility 342 equivalents if their full compatibility decompositions are identical. 343 ) 344 $(P $(DEF Encoded character) An association (or mapping) 345 between an abstract character and a code point. 346 ) 347 $(P $(DEF Glyph) The actual, concrete image of a glyph representation 348 having been rasterized or otherwise imaged onto some display surface. 349 ) 350 $(P $(DEF Grapheme base) A character with the property 351 Grapheme_Base, or any standard Korean syllable block. 352 ) 353 $(P $(DEF Grapheme cluster) Defined as the text between 354 grapheme boundaries as specified by Unicode Standard Annex #29, 355 $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation). 356 Important general properties of a grapheme: 357 $(UL 358 $(LI The grapheme cluster represents a horizontally segmentable 359 unit of text, consisting of some grapheme base (which may 360 consist of a Korean syllable) together with any number of 361 nonspacing marks applied to it. 362 ) 363 $(LI A grapheme cluster typically starts with a grapheme base 364 and then extends across any subsequent sequence of nonspacing marks. 365 A grapheme cluster is most directly relevant to text rendering and 366 processes such as cursor placement and text selection in editing, 367 but may also be relevant to comparison and searching. 368 ) 369 $(LI For many processes, a grapheme cluster behaves as if it was a 370 single character with the same properties as its grapheme base. 371 Effectively, nonspacing marks apply $(I graphically) to the base, 372 but do not change its properties. 373 ) 374 ) 375 $(P This module defines a number of primitives that work with graphemes: 376 $(LREF Grapheme), $(LREF decodeGrapheme) and $(LREF graphemeStride). 377 All of them are using $(I extended grapheme) boundaries 378 as defined in the aforementioned standard annex. 379 ) 380 ) 381 $(P $(DEF Nonspacing mark) A combining character with the 382 General Category of Nonspacing Mark (Mn) or Enclosing Mark (Me). 383 ) 384 $(P $(DEF Spacing mark) A combining character that is not a nonspacing mark. 385 ) 386 $(SECTION Normalization) 387 $(P The concepts of $(S_LINK Canonical equivalent, canonical equivalent) 388 or $(S_LINK Compatibility equivalent, compatibility equivalent) 389 characters in the Unicode Standard make it necessary to have a full, formal 390 definition of equivalence for Unicode strings. 391 String equivalence is determined by a process called normalization, 392 whereby strings are converted into forms which are compared 393 directly for identity. This is the primary goal of the normalization process, 394 see the function $(LREF normalize) to convert into any of 395 the four defined forms. 396 ) 397 $(P A very important attribute of the Unicode Normalization Forms 398 is that they must remain stable between versions of the Unicode Standard. 399 A Unicode string normalized to a particular Unicode Normalization Form 400 in one version of the standard is guaranteed to remain in that Normalization 401 Form for implementations of future versions of the standard. 402 ) 403 $(P The Unicode Standard specifies four normalization forms. 404 Informally, two of these forms are defined by maximal decomposition 405 of equivalent sequences, and two of these forms are defined 406 by maximal $(I composition) of equivalent sequences. 407 $(UL 408 $(LI Normalization Form D (NFD): The $(S_LINK Canonical decomposition, 409 canonical decomposition) of a character sequence.) 410 $(LI Normalization Form KD (NFKD): The $(S_LINK Compatibility decomposition, 411 compatibility decomposition) of a character sequence.) 412 $(LI Normalization Form C (NFC): The canonical composition of the 413 $(S_LINK Canonical decomposition, canonical decomposition) 414 of a coded character sequence.) 415 $(LI Normalization Form KC (NFKC): The canonical composition 416 of the $(S_LINK Compatibility decomposition, 417 compatibility decomposition) of a character sequence) 418 ) 419 ) 420 $(P The choice of the normalization form depends on the particular use case. 421 NFC is the best form for general text, since it's more compatible with 422 strings converted from legacy encodings. NFKC is the preferred form for 423 identifiers, especially where there are security concerns. NFD and NFKD 424 are the most useful for internal processing. 425 ) 426 $(SECTION Construction of lookup tables) 427 $(P The Unicode standard describes a set of algorithms that 428 depend on having the ability to quickly look up various properties 429 of a code point. Given the the codespace of about 1 million $(CODEPOINTS), 430 it is not a trivial task to provide a space-efficient solution for 431 the multitude of properties. 432 ) 433 $(P Common approaches such as hash-tables or binary search over 434 sorted code point intervals (as in $(LREF InversionList)) are insufficient. 435 Hash-tables have enormous memory footprint and binary search 436 over intervals is not fast enough for some heavy-duty algorithms. 437 ) 438 $(P The recommended solution (see Unicode Implementation Guidelines) 439 is using multi-stage tables that are an implementation of the 440 $(HTTP en.wikipedia.org/wiki/Trie, Trie) data structure with integer 441 keys and a fixed number of stages. For the remainder of the section 442 this will be called a fixed trie. The following describes a particular 443 implementation that is aimed for the speed of access at the expense 444 of ideal size savings. 445 ) 446 $(P Taking a 2-level Trie as an example the principle of operation is as follows. 447 Split the number of bits in a key (code point, 21 bits) into 2 components 448 (e.g. 15 and 8). The first is the number of bits in the index of the trie 449 and the other is number of bits in each page of the trie. 450 The layout of the trie is then an array of size 2^^bits-of-index followed 451 an array of memory chunks of size 2^^bits-of-page/bits-per-element. 452 ) 453 $(P The number of pages is variable (but not less then 1) 454 unlike the number of entries in the index. The slots of the index 455 all have to contain a number of a page that is present. The lookup is then 456 just a couple of operations - slice the upper bits, 457 lookup an index for these, take a page at this index and use 458 the lower bits as an offset within this page. 459 460 Assuming that pages are laid out consequently 461 in one array at `pages`, the pseudo-code is: 462 ) 463 --- 464 auto elemsPerPage = (2 ^^ bits_per_page) / Value.sizeOfInBits; 465 pages[index[n >> bits_per_page]][n & (elemsPerPage - 1)]; 466 --- 467 $(P Where if `elemsPerPage` is a power of 2 the whole process is 468 a handful of simple instructions and 2 array reads. Subsequent levels 469 of the trie are introduced by recursing on this notion - the index array 470 is treated as values. The number of bits in index is then again 471 split into 2 parts, with pages over 'current-index' and the new 'upper-index'. 472 ) 473 474 $(P For completeness a level 1 trie is simply an array. 475 The current implementation takes advantage of bit-packing values 476 when the range is known to be limited in advance (such as `bool`). 477 See also $(LREF BitPacked) for enforcing it manually. 478 The major size advantage however comes from the fact 479 that multiple $(B identical pages on every level are merged) by construction. 480 ) 481 $(P The process of constructing a trie is more involved and is hidden from 482 the user in a form of the convenience functions $(LREF codepointTrie), 483 $(LREF codepointSetTrie) and the even more convenient $(LREF toTrie). 484 In general a set or built-in AA with `dchar` type 485 can be turned into a trie. The trie object in this module 486 is read-only (immutable); it's effectively frozen after construction. 487 ) 488 $(SECTION Unicode properties) 489 $(P This is a full list of Unicode properties accessible through $(LREF unicode) 490 with specific helpers per category nested within. Consult the 491 $(HTTP www.unicode.org/cldr/utility/properties.jsp, CLDR utility) 492 when in doubt about the contents of a particular set. 493 ) 494 $(P General category sets listed below are only accessible with the 495 $(LREF unicode) shorthand accessor.) 496 $(BOOKTABLE $(B General category ), 497 $(TR $(TH Abb.) $(TH Long form) 498 $(TH Abb.) $(TH Long form)$(TH Abb.) $(TH Long form)) 499 $(TR $(TD L) $(TD Letter) 500 $(TD Cn) $(TD Unassigned) $(TD Po) $(TD Other_Punctuation)) 501 $(TR $(TD Ll) $(TD Lowercase_Letter) 502 $(TD Co) $(TD Private_Use) $(TD Ps) $(TD Open_Punctuation)) 503 $(TR $(TD Lm) $(TD Modifier_Letter) 504 $(TD Cs) $(TD Surrogate) $(TD S) $(TD Symbol)) 505 $(TR $(TD Lo) $(TD Other_Letter) 506 $(TD N) $(TD Number) $(TD Sc) $(TD Currency_Symbol)) 507 $(TR $(TD Lt) $(TD Titlecase_Letter) 508 $(TD Nd) $(TD Decimal_Number) $(TD Sk) $(TD Modifier_Symbol)) 509 $(TR $(TD Lu) $(TD Uppercase_Letter) 510 $(TD Nl) $(TD Letter_Number) $(TD Sm) $(TD Math_Symbol)) 511 $(TR $(TD M) $(TD Mark) 512 $(TD No) $(TD Other_Number) $(TD So) $(TD Other_Symbol)) 513 $(TR $(TD Mc) $(TD Spacing_Mark) 514 $(TD P) $(TD Punctuation) $(TD Z) $(TD Separator)) 515 $(TR $(TD Me) $(TD Enclosing_Mark) 516 $(TD Pc) $(TD Connector_Punctuation) $(TD Zl) $(TD Line_Separator)) 517 $(TR $(TD Mn) $(TD Nonspacing_Mark) 518 $(TD Pd) $(TD Dash_Punctuation) $(TD Zp) $(TD Paragraph_Separator)) 519 $(TR $(TD C) $(TD Other) 520 $(TD Pe) $(TD Close_Punctuation) $(TD Zs) $(TD Space_Separator)) 521 $(TR $(TD Cc) $(TD Control) $(TD Pf) 522 $(TD Final_Punctuation) $(TD -) $(TD Any)) 523 $(TR $(TD Cf) $(TD Format) 524 $(TD Pi) $(TD Initial_Punctuation) $(TD -) $(TD ASCII)) 525 ) 526 $(P Sets for other commonly useful properties that are 527 accessible with $(LREF unicode):) 528 $(BOOKTABLE $(B Common binary properties), 529 $(TR $(TH Name) $(TH Name) $(TH Name)) 530 $(TR $(TD Alphabetic) $(TD Ideographic) $(TD Other_Uppercase)) 531 $(TR $(TD ASCII_Hex_Digit) $(TD IDS_Binary_Operator) $(TD Pattern_Syntax)) 532 $(TR $(TD Bidi_Control) $(TD ID_Start) $(TD Pattern_White_Space)) 533 $(TR $(TD Cased) $(TD IDS_Trinary_Operator) $(TD Quotation_Mark)) 534 $(TR $(TD Case_Ignorable) $(TD Join_Control) $(TD Radical)) 535 $(TR $(TD Dash) $(TD Logical_Order_Exception) $(TD Soft_Dotted)) 536 $(TR $(TD Default_Ignorable_Code_Point) $(TD Lowercase) $(TD STerm)) 537 $(TR $(TD Deprecated) $(TD Math) $(TD Terminal_Punctuation)) 538 $(TR $(TD Diacritic) $(TD Noncharacter_Code_Point) $(TD Unified_Ideograph)) 539 $(TR $(TD Extender) $(TD Other_Alphabetic) $(TD Uppercase)) 540 $(TR $(TD Grapheme_Base) $(TD Other_Default_Ignorable_Code_Point) $(TD Variation_Selector)) 541 $(TR $(TD Grapheme_Extend) $(TD Other_Grapheme_Extend) $(TD White_Space)) 542 $(TR $(TD Grapheme_Link) $(TD Other_ID_Continue) $(TD XID_Continue)) 543 $(TR $(TD Hex_Digit) $(TD Other_ID_Start) $(TD XID_Start)) 544 $(TR $(TD Hyphen) $(TD Other_Lowercase) ) 545 $(TR $(TD ID_Continue) $(TD Other_Math) ) 546 ) 547 $(P Below is the table with block names accepted by $(LREF unicode.block). 548 Note that the shorthand version $(LREF unicode) requires "In" 549 to be prepended to the names of blocks so as to disambiguate 550 scripts and blocks. 551 ) 552 $(BOOKTABLE $(B Blocks), 553 $(TR $(TD Aegean Numbers) $(TD Ethiopic Extended) $(TD Mongolian)) 554 $(TR $(TD Alchemical Symbols) $(TD Ethiopic Extended-A) $(TD Musical Symbols)) 555 $(TR $(TD Alphabetic Presentation Forms) $(TD Ethiopic Supplement) $(TD Myanmar)) 556 $(TR $(TD Ancient Greek Musical Notation) $(TD General Punctuation) $(TD Myanmar Extended-A)) 557 $(TR $(TD Ancient Greek Numbers) $(TD Geometric Shapes) $(TD New Tai Lue)) 558 $(TR $(TD Ancient Symbols) $(TD Georgian) $(TD NKo)) 559 $(TR $(TD Arabic) $(TD Georgian Supplement) $(TD Number Forms)) 560 $(TR $(TD Arabic Extended-A) $(TD Glagolitic) $(TD Ogham)) 561 $(TR $(TD Arabic Mathematical Alphabetic Symbols) $(TD Gothic) $(TD Ol Chiki)) 562 $(TR $(TD Arabic Presentation Forms-A) $(TD Greek and Coptic) $(TD Old Italic)) 563 $(TR $(TD Arabic Presentation Forms-B) $(TD Greek Extended) $(TD Old Persian)) 564 $(TR $(TD Arabic Supplement) $(TD Gujarati) $(TD Old South Arabian)) 565 $(TR $(TD Armenian) $(TD Gurmukhi) $(TD Old Turkic)) 566 $(TR $(TD Arrows) $(TD Halfwidth and Fullwidth Forms) $(TD Optical Character Recognition)) 567 $(TR $(TD Avestan) $(TD Hangul Compatibility Jamo) $(TD Oriya)) 568 $(TR $(TD Balinese) $(TD Hangul Jamo) $(TD Osmanya)) 569 $(TR $(TD Bamum) $(TD Hangul Jamo Extended-A) $(TD Phags-pa)) 570 $(TR $(TD Bamum Supplement) $(TD Hangul Jamo Extended-B) $(TD Phaistos Disc)) 571 $(TR $(TD Basic Latin) $(TD Hangul Syllables) $(TD Phoenician)) 572 $(TR $(TD Batak) $(TD Hanunoo) $(TD Phonetic Extensions)) 573 $(TR $(TD Bengali) $(TD Hebrew) $(TD Phonetic Extensions Supplement)) 574 $(TR $(TD Block Elements) $(TD High Private Use Surrogates) $(TD Playing Cards)) 575 $(TR $(TD Bopomofo) $(TD High Surrogates) $(TD Private Use Area)) 576 $(TR $(TD Bopomofo Extended) $(TD Hiragana) $(TD Rejang)) 577 $(TR $(TD Box Drawing) $(TD Ideographic Description Characters) $(TD Rumi Numeral Symbols)) 578 $(TR $(TD Brahmi) $(TD Imperial Aramaic) $(TD Runic)) 579 $(TR $(TD Braille Patterns) $(TD Inscriptional Pahlavi) $(TD Samaritan)) 580 $(TR $(TD Buginese) $(TD Inscriptional Parthian) $(TD Saurashtra)) 581 $(TR $(TD Buhid) $(TD IPA Extensions) $(TD Sharada)) 582 $(TR $(TD Byzantine Musical Symbols) $(TD Javanese) $(TD Shavian)) 583 $(TR $(TD Carian) $(TD Kaithi) $(TD Sinhala)) 584 $(TR $(TD Chakma) $(TD Kana Supplement) $(TD Small Form Variants)) 585 $(TR $(TD Cham) $(TD Kanbun) $(TD Sora Sompeng)) 586 $(TR $(TD Cherokee) $(TD Kangxi Radicals) $(TD Spacing Modifier Letters)) 587 $(TR $(TD CJK Compatibility) $(TD Kannada) $(TD Specials)) 588 $(TR $(TD CJK Compatibility Forms) $(TD Katakana) $(TD Sundanese)) 589 $(TR $(TD CJK Compatibility Ideographs) $(TD Katakana Phonetic Extensions) $(TD Sundanese Supplement)) 590 $(TR $(TD CJK Compatibility Ideographs Supplement) $(TD Kayah Li) $(TD Superscripts and Subscripts)) 591 $(TR $(TD CJK Radicals Supplement) $(TD Kharoshthi) $(TD Supplemental Arrows-A)) 592 $(TR $(TD CJK Strokes) $(TD Khmer) $(TD Supplemental Arrows-B)) 593 $(TR $(TD CJK Symbols and Punctuation) $(TD Khmer Symbols) $(TD Supplemental Mathematical Operators)) 594 $(TR $(TD CJK Unified Ideographs) $(TD Lao) $(TD Supplemental Punctuation)) 595 $(TR $(TD CJK Unified Ideographs Extension A) $(TD Latin-1 Supplement) $(TD Supplementary Private Use Area-A)) 596 $(TR $(TD CJK Unified Ideographs Extension B) $(TD Latin Extended-A) $(TD Supplementary Private Use Area-B)) 597 $(TR $(TD CJK Unified Ideographs Extension C) $(TD Latin Extended Additional) $(TD Syloti Nagri)) 598 $(TR $(TD CJK Unified Ideographs Extension D) $(TD Latin Extended-B) $(TD Syriac)) 599 $(TR $(TD Combining Diacritical Marks) $(TD Latin Extended-C) $(TD Tagalog)) 600 $(TR $(TD Combining Diacritical Marks for Symbols) $(TD Latin Extended-D) $(TD Tagbanwa)) 601 $(TR $(TD Combining Diacritical Marks Supplement) $(TD Lepcha) $(TD Tags)) 602 $(TR $(TD Combining Half Marks) $(TD Letterlike Symbols) $(TD Tai Le)) 603 $(TR $(TD Common Indic Number Forms) $(TD Limbu) $(TD Tai Tham)) 604 $(TR $(TD Control Pictures) $(TD Linear B Ideograms) $(TD Tai Viet)) 605 $(TR $(TD Coptic) $(TD Linear B Syllabary) $(TD Tai Xuan Jing Symbols)) 606 $(TR $(TD Counting Rod Numerals) $(TD Lisu) $(TD Takri)) 607 $(TR $(TD Cuneiform) $(TD Low Surrogates) $(TD Tamil)) 608 $(TR $(TD Cuneiform Numbers and Punctuation) $(TD Lycian) $(TD Telugu)) 609 $(TR $(TD Currency Symbols) $(TD Lydian) $(TD Thaana)) 610 $(TR $(TD Cypriot Syllabary) $(TD Mahjong Tiles) $(TD Thai)) 611 $(TR $(TD Cyrillic) $(TD Malayalam) $(TD Tibetan)) 612 $(TR $(TD Cyrillic Extended-A) $(TD Mandaic) $(TD Tifinagh)) 613 $(TR $(TD Cyrillic Extended-B) $(TD Mathematical Alphanumeric Symbols) $(TD Transport And Map Symbols)) 614 $(TR $(TD Cyrillic Supplement) $(TD Mathematical Operators) $(TD Ugaritic)) 615 $(TR $(TD Deseret) $(TD Meetei Mayek) $(TD Unified Canadian Aboriginal Syllabics)) 616 $(TR $(TD Devanagari) $(TD Meetei Mayek Extensions) $(TD Unified Canadian Aboriginal Syllabics Extended)) 617 $(TR $(TD Devanagari Extended) $(TD Meroitic Cursive) $(TD Vai)) 618 $(TR $(TD Dingbats) $(TD Meroitic Hieroglyphs) $(TD Variation Selectors)) 619 $(TR $(TD Domino Tiles) $(TD Miao) $(TD Variation Selectors Supplement)) 620 $(TR $(TD Egyptian Hieroglyphs) $(TD Miscellaneous Mathematical Symbols-A) $(TD Vedic Extensions)) 621 $(TR $(TD Emoticons) $(TD Miscellaneous Mathematical Symbols-B) $(TD Vertical Forms)) 622 $(TR $(TD Enclosed Alphanumerics) $(TD Miscellaneous Symbols) $(TD Yijing Hexagram Symbols)) 623 $(TR $(TD Enclosed Alphanumeric Supplement) $(TD Miscellaneous Symbols and Arrows) $(TD Yi Radicals)) 624 $(TR $(TD Enclosed CJK Letters and Months) $(TD Miscellaneous Symbols And Pictographs) $(TD Yi Syllables)) 625 $(TR $(TD Enclosed Ideographic Supplement) $(TD Miscellaneous Technical) ) 626 $(TR $(TD Ethiopic) $(TD Modifier Tone Letters) ) 627 ) 628 $(P Below is the table with script names accepted by $(LREF unicode.script) 629 and by the shorthand version $(LREF unicode):) 630 $(BOOKTABLE $(B Scripts), 631 $(TR $(TD Arabic) $(TD Hanunoo) $(TD Old_Italic)) 632 $(TR $(TD Armenian) $(TD Hebrew) $(TD Old_Persian)) 633 $(TR $(TD Avestan) $(TD Hiragana) $(TD Old_South_Arabian)) 634 $(TR $(TD Balinese) $(TD Imperial_Aramaic) $(TD Old_Turkic)) 635 $(TR $(TD Bamum) $(TD Inherited) $(TD Oriya)) 636 $(TR $(TD Batak) $(TD Inscriptional_Pahlavi) $(TD Osmanya)) 637 $(TR $(TD Bengali) $(TD Inscriptional_Parthian) $(TD Phags_Pa)) 638 $(TR $(TD Bopomofo) $(TD Javanese) $(TD Phoenician)) 639 $(TR $(TD Brahmi) $(TD Kaithi) $(TD Rejang)) 640 $(TR $(TD Braille) $(TD Kannada) $(TD Runic)) 641 $(TR $(TD Buginese) $(TD Katakana) $(TD Samaritan)) 642 $(TR $(TD Buhid) $(TD Kayah_Li) $(TD Saurashtra)) 643 $(TR $(TD Canadian_Aboriginal) $(TD Kharoshthi) $(TD Sharada)) 644 $(TR $(TD Carian) $(TD Khmer) $(TD Shavian)) 645 $(TR $(TD Chakma) $(TD Lao) $(TD Sinhala)) 646 $(TR $(TD Cham) $(TD Latin) $(TD Sora_Sompeng)) 647 $(TR $(TD Cherokee) $(TD Lepcha) $(TD Sundanese)) 648 $(TR $(TD Common) $(TD Limbu) $(TD Syloti_Nagri)) 649 $(TR $(TD Coptic) $(TD Linear_B) $(TD Syriac)) 650 $(TR $(TD Cuneiform) $(TD Lisu) $(TD Tagalog)) 651 $(TR $(TD Cypriot) $(TD Lycian) $(TD Tagbanwa)) 652 $(TR $(TD Cyrillic) $(TD Lydian) $(TD Tai_Le)) 653 $(TR $(TD Deseret) $(TD Malayalam) $(TD Tai_Tham)) 654 $(TR $(TD Devanagari) $(TD Mandaic) $(TD Tai_Viet)) 655 $(TR $(TD Egyptian_Hieroglyphs) $(TD Meetei_Mayek) $(TD Takri)) 656 $(TR $(TD Ethiopic) $(TD Meroitic_Cursive) $(TD Tamil)) 657 $(TR $(TD Georgian) $(TD Meroitic_Hieroglyphs) $(TD Telugu)) 658 $(TR $(TD Glagolitic) $(TD Miao) $(TD Thaana)) 659 $(TR $(TD Gothic) $(TD Mongolian) $(TD Thai)) 660 $(TR $(TD Greek) $(TD Myanmar) $(TD Tibetan)) 661 $(TR $(TD Gujarati) $(TD New_Tai_Lue) $(TD Tifinagh)) 662 $(TR $(TD Gurmukhi) $(TD Nko) $(TD Ugaritic)) 663 $(TR $(TD Han) $(TD Ogham) $(TD Vai)) 664 $(TR $(TD Hangul) $(TD Ol_Chiki) $(TD Yi)) 665 ) 666 $(P Below is the table of names accepted by $(LREF unicode.hangulSyllableType).) 667 $(BOOKTABLE $(B Hangul syllable type), 668 $(TR $(TH Abb.) $(TH Long form)) 669 $(TR $(TD L) $(TD Leading_Jamo)) 670 $(TR $(TD LV) $(TD LV_Syllable)) 671 $(TR $(TD LVT) $(TD LVT_Syllable) ) 672 $(TR $(TD T) $(TD Trailing_Jamo)) 673 $(TR $(TD V) $(TD Vowel_Jamo)) 674 ) 675 References: 676 $(HTTP www.digitalmars.com/d/ascii-table.html, ASCII Table), 677 $(HTTP en.wikipedia.org/wiki/Unicode, Wikipedia), 678 $(HTTP www.unicode.org, The Unicode Consortium), 679 $(HTTP www.unicode.org/reports/tr15/, Unicode normalization forms), 680 $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation) 681 $(HTTP www.unicode.org/uni2book/ch05.pdf, 682 Unicode Implementation Guidelines) 683 $(HTTP www.unicode.org/uni2book/ch03.pdf, 684 Unicode Conformance) 685 Trademarks: 686 Unicode(tm) is a trademark of Unicode, Inc. 687 688 Copyright: Copyright 2013 - 689 License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0). 690 Authors: Dmitry Olshansky 691 Source: $(PHOBOSSRC std/uni.d) 692 Standards: $(HTTP www.unicode.org/versions/Unicode6.2.0/, Unicode v6.2) 693 694 Macros: 695 696 SECTION = <h3><a id="$1">$0</a></h3> 697 DEF = <div><a id="$1"><i>$0</i></a></div> 698 S_LINK = <a href="#$1">$+</a> 699 CODEPOINT = $(S_LINK Code point, code point) 700 CODEPOINTS = $(S_LINK Code point, code points) 701 CHARACTER = $(S_LINK Character, character) 702 CHARACTERS = $(S_LINK Character, characters) 703 CLUSTER = $(S_LINK Grapheme cluster, grapheme cluster) 704 +/ 705 module std.uni; 706 707 import std.meta : AliasSeq; 708 import std.range.primitives : back, ElementEncodingType, ElementType, empty, 709 front, hasLength, hasSlicing, isForwardRange, isInputRange, 710 isRandomAccessRange, popFront, put, save; 711 import std.traits : isConvertibleToString, isIntegral, isSomeChar, 712 isSomeString, Unqual, isDynamicArray; 713 // debug = std_uni; 714 715 debug(std_uni) import std.stdio; // writefln, writeln 716 717 private: 718 719 720 void copyBackwards(T,U)(T[] src, U[] dest) 721 { 722 assert(src.length == dest.length); 723 for (size_t i=src.length; i-- > 0; ) 724 dest[i] = src[i]; 725 } 726 727 void copyForward(T,U)(T[] src, U[] dest) 728 { 729 assert(src.length == dest.length); 730 for (size_t i=0; i<src.length; i++) 731 dest[i] = src[i]; 732 } 733 734 // TODO: update to reflect all major CPUs supporting unaligned reads 735 version (X86) 736 enum hasUnalignedReads = true; 737 else version (X86_64) 738 enum hasUnalignedReads = true; 739 else version (SystemZ) 740 enum hasUnalignedReads = true; 741 else 742 enum hasUnalignedReads = false; // better be safe then sorry 743 744 public enum dchar lineSep = '\u2028'; /// Constant $(CODEPOINT) (0x2028) - line separator. 745 public enum dchar paraSep = '\u2029'; /// Constant $(CODEPOINT) (0x2029) - paragraph separator. 746 public enum dchar nelSep = '\u0085'; /// Constant $(CODEPOINT) (0x0085) - next line. 747 748 // test the intro example 749 @safe unittest 750 { 751 import std.algorithm.searching : find; 752 // initialize code point sets using script/block or property name 753 // set contains code points from both scripts. 754 auto set = unicode("Cyrillic") | unicode("Armenian"); 755 // or simpler and statically-checked look 756 auto ascii = unicode.ASCII; 757 auto currency = unicode.Currency_Symbol; 758 759 // easy set ops 760 auto a = set & ascii; 761 assert(a.empty); // as it has no intersection with ascii 762 a = set | ascii; 763 auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian 764 765 // some properties of code point sets 766 assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2 767 // testing presence of a code point in a set 768 // is just fine, it is O(logN) 769 assert(!b['$']); 770 assert(!b['\u058F']); // Armenian dram sign 771 assert(b['¥']); 772 773 // building fast lookup tables, these guarantee O(1) complexity 774 // 1-level Trie lookup table essentially a huge bit-set ~262Kb 775 auto oneTrie = toTrie!1(b); 776 // 2-level far more compact but typically slightly slower 777 auto twoTrie = toTrie!2(b); 778 // 3-level even smaller, and a bit slower yet 779 auto threeTrie = toTrie!3(b); 780 assert(oneTrie['£']); 781 assert(twoTrie['£']); 782 assert(threeTrie['£']); 783 784 // build the trie with the most sensible trie level 785 // and bind it as a functor 786 auto cyrillicOrArmenian = toDelegate(set); 787 auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!"); 788 assert(balance == "ընկեր!"); 789 // compatible with bool delegate(dchar) 790 bool delegate(dchar) bindIt = cyrillicOrArmenian; 791 792 // Normalization 793 string s = "Plain ascii (and not only), is always normalized!"; 794 assert(s is normalize(s));// is the same string 795 796 string nonS = "A\u0308ffin"; // A ligature 797 auto nS = normalize(nonS); // to NFC, the W3C endorsed standard 798 assert(nS == "Äffin"); 799 assert(nS != nonS); 800 string composed = "Äffin"; 801 802 assert(normalize!NFD(composed) == "A\u0308ffin"); 803 // to NFKD, compatibility decomposition useful for fuzzy matching/searching 804 assert(normalize!NFKD("2¹⁰") == "210"); 805 } 806 807 enum lastDchar = 0x10FFFF; 808 809 auto force(T, F)(F from) 810 if (isIntegral!T && !is(T == F)) 811 { 812 assert(from <= T.max && from >= T.min); 813 return cast(T) from; 814 } 815 816 auto force(T, F)(F from) 817 if (isBitPacked!T && !is(T == F)) 818 { 819 assert(from <= 2^^bitSizeOf!T-1); 820 return T(cast(TypeOfBitPacked!T) from); 821 } 822 823 auto force(T, F)(F from) 824 if (is(T == F)) 825 { 826 return from; 827 } 828 829 // repeat X times the bit-pattern in val assuming it's length is 'bits' 830 size_t replicateBits(size_t times, size_t bits)(size_t val) @safe pure nothrow @nogc 831 { 832 static if (times == 1) 833 return val; 834 else static if (bits == 1) 835 { 836 static if (times == size_t.sizeof*8) 837 return val ? size_t.max : 0; 838 else 839 return val ? (1 << times)-1 : 0; 840 } 841 else static if (times % 2) 842 return (replicateBits!(times-1, bits)(val)<<bits) | val; 843 else 844 return replicateBits!(times/2, bits*2)((val << bits) | val); 845 } 846 847 @safe pure nothrow @nogc unittest // for replicate 848 { 849 import std.algorithm.iteration : sum, map; 850 import std.range : iota; 851 size_t m = 0b111; 852 size_t m2 = 0b01; 853 static foreach (i; AliasSeq!(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)) 854 { 855 assert(replicateBits!(i, 3)(m)+1 == (1<<(3*i))); 856 assert(replicateBits!(i, 2)(m2) == iota(0, i).map!"2^^(2*a)"().sum()); 857 } 858 } 859 860 // multiple arrays squashed into one memory block 861 struct MultiArray(Types...) 862 { 863 import std.range.primitives : isOutputRange; 864 this(size_t[] sizes...) @safe pure nothrow 865 { 866 assert(dim == sizes.length); 867 size_t full_size; 868 foreach (i, v; Types) 869 { 870 full_size += spaceFor!(bitSizeOf!v)(sizes[i]); 871 sz[i] = sizes[i]; 872 static if (i >= 1) 873 offsets[i] = offsets[i-1] + 874 spaceFor!(bitSizeOf!(Types[i-1]))(sizes[i-1]); 875 } 876 877 storage = new size_t[full_size]; 878 } 879 880 this(const(size_t)[] raw_offsets, 881 const(size_t)[] raw_sizes, const(size_t)[] data)const @safe pure nothrow @nogc 882 { 883 offsets[] = raw_offsets[]; 884 sz[] = raw_sizes[]; 885 storage = data; 886 } 887 888 @property auto slice(size_t n)()inout pure nothrow @nogc 889 { 890 auto ptr = raw_ptr!n; 891 return packedArrayView!(Types[n])(ptr, sz[n]); 892 } 893 894 @property auto ptr(size_t n)()inout pure nothrow @nogc 895 { 896 auto ptr = raw_ptr!n; 897 return inout(PackedPtr!(Types[n]))(ptr); 898 } 899 900 template length(size_t n) 901 { 902 @property size_t length()const @safe pure nothrow @nogc{ return sz[n]; } 903 904 @property void length(size_t new_size) 905 { 906 if (new_size > sz[n]) 907 {// extend 908 size_t delta = (new_size - sz[n]); 909 sz[n] += delta; 910 delta = spaceFor!(bitSizeOf!(Types[n]))(delta); 911 storage.length += delta;// extend space at end 912 // raw_slice!x must follow resize as it could be moved! 913 // next stmts move all data past this array, last-one-goes-first 914 static if (n != dim-1) 915 { 916 auto start = raw_ptr!(n+1); 917 // len includes delta 918 size_t len = (storage.ptr+storage.length-start); 919 920 copyBackwards(start[0 .. len-delta], start[delta .. len]); 921 922 start[0 .. delta] = 0; 923 // offsets are used for raw_slice, ptr etc. 924 foreach (i; n+1 .. dim) 925 offsets[i] += delta; 926 } 927 } 928 else if (new_size < sz[n]) 929 {// shrink 930 size_t delta = (sz[n] - new_size); 931 sz[n] -= delta; 932 delta = spaceFor!(bitSizeOf!(Types[n]))(delta); 933 // move all data past this array, forward direction 934 static if (n != dim-1) 935 { 936 auto start = raw_ptr!(n+1); 937 size_t len = (storage.ptr+storage.length-start); 938 copyForward(start[0 .. len-delta], start[delta .. len]); 939 940 // adjust offsets last, they affect raw_slice 941 foreach (i; n+1 .. dim) 942 offsets[i] -= delta; 943 } 944 storage.length -= delta; 945 } 946 // else - NOP 947 } 948 } 949 950 @property size_t bytes(size_t n=size_t.max)() const @safe 951 { 952 static if (n == size_t.max) 953 return storage.length*size_t.sizeof; 954 else static if (n != Types.length-1) 955 return (raw_ptr!(n+1)-raw_ptr!n)*size_t.sizeof; 956 else 957 return (storage.ptr+storage.length - raw_ptr!n)*size_t.sizeof; 958 } 959 960 void store(OutRange)(scope OutRange sink) const 961 if (isOutputRange!(OutRange, char)) 962 { 963 import std.format : formattedWrite; 964 formattedWrite(sink, "[%( 0x%x, %)]", offsets[]); 965 formattedWrite(sink, ", [%( 0x%x, %)]", sz[]); 966 formattedWrite(sink, ", [%( 0x%x, %)]", storage); 967 } 968 969 private: 970 import std.meta : staticMap; 971 @property auto raw_ptr(size_t n)()inout pure nothrow @nogc 972 { 973 static if (n == 0) 974 return storage.ptr; 975 else 976 { 977 return storage.ptr+offsets[n]; 978 } 979 } 980 enum dim = Types.length; 981 size_t[dim] offsets;// offset for level x 982 size_t[dim] sz;// size of level x 983 alias bitWidth = staticMap!(bitSizeOf, Types); 984 size_t[] storage; 985 } 986 987 @system unittest 988 { 989 import std.conv : text; 990 enum dg = (){ 991 // sizes are: 992 // lvl0: 3, lvl1 : 2, lvl2: 1 993 auto m = MultiArray!(int, ubyte, int)(3,2,1); 994 995 static void check(size_t k, T)(ref T m, int n) 996 { 997 foreach (i; 0 .. n) 998 assert(m.slice!(k)[i] == i+1, text("level:",i," : ",m.slice!(k)[0 .. n])); 999 } 1000 1001 static void checkB(size_t k, T)(ref T m, int n) 1002 { 1003 foreach (i; 0 .. n) 1004 assert(m.slice!(k)[i] == n-i, text("level:",i," : ",m.slice!(k)[0 .. n])); 1005 } 1006 1007 static void fill(size_t k, T)(ref T m, int n) 1008 { 1009 foreach (i; 0 .. n) 1010 m.slice!(k)[i] = force!ubyte(i+1); 1011 } 1012 1013 static void fillB(size_t k, T)(ref T m, int n) 1014 { 1015 foreach (i; 0 .. n) 1016 m.slice!(k)[i] = force!ubyte(n-i); 1017 } 1018 1019 m.length!1 = 100; 1020 fill!1(m, 100); 1021 check!1(m, 100); 1022 1023 m.length!0 = 220; 1024 fill!0(m, 220); 1025 check!1(m, 100); 1026 check!0(m, 220); 1027 1028 m.length!2 = 17; 1029 fillB!2(m, 17); 1030 checkB!2(m, 17); 1031 check!0(m, 220); 1032 check!1(m, 100); 1033 1034 m.length!2 = 33; 1035 checkB!2(m, 17); 1036 fillB!2(m, 33); 1037 checkB!2(m, 33); 1038 check!0(m, 220); 1039 check!1(m, 100); 1040 1041 m.length!1 = 195; 1042 fillB!1(m, 195); 1043 checkB!1(m, 195); 1044 checkB!2(m, 33); 1045 check!0(m, 220); 1046 1047 auto marr = MultiArray!(BitPacked!(uint, 4), BitPacked!(uint, 6))(20, 10); 1048 marr.length!0 = 15; 1049 marr.length!1 = 30; 1050 fill!1(marr, 30); 1051 fill!0(marr, 15); 1052 check!1(marr, 30); 1053 check!0(marr, 15); 1054 return 0; 1055 }; 1056 enum ct = dg(); 1057 auto rt = dg(); 1058 } 1059 1060 @system unittest 1061 {// more bitpacking tests 1062 import std.conv : text; 1063 1064 alias Bitty = 1065 MultiArray!(BitPacked!(size_t, 3) 1066 , BitPacked!(size_t, 4) 1067 , BitPacked!(size_t, 3) 1068 , BitPacked!(size_t, 6) 1069 , bool); 1070 alias fn1 = sliceBits!(13, 16); 1071 alias fn2 = sliceBits!( 9, 13); 1072 alias fn3 = sliceBits!( 6, 9); 1073 alias fn4 = sliceBits!( 0, 6); 1074 static void check(size_t lvl, MA)(ref MA arr){ 1075 for (size_t i = 0; i< arr.length!lvl; i++) 1076 assert(arr.slice!(lvl)[i] == i, text("Mismatch on lvl ", lvl, " idx ", i, " value: ", arr.slice!(lvl)[i])); 1077 } 1078 1079 static void fillIdx(size_t lvl, MA)(ref MA arr){ 1080 for (size_t i = 0; i< arr.length!lvl; i++) 1081 arr.slice!(lvl)[i] = i; 1082 } 1083 Bitty m1; 1084 1085 m1.length!4 = 10; 1086 m1.length!3 = 2^^6; 1087 m1.length!2 = 2^^3; 1088 m1.length!1 = 2^^4; 1089 m1.length!0 = 2^^3; 1090 1091 m1.length!4 = 2^^16; 1092 1093 for (size_t i = 0; i< m1.length!4; i++) 1094 m1.slice!(4)[i] = i % 2; 1095 1096 fillIdx!1(m1); 1097 check!1(m1); 1098 fillIdx!2(m1); 1099 check!2(m1); 1100 fillIdx!3(m1); 1101 check!3(m1); 1102 fillIdx!0(m1); 1103 check!0(m1); 1104 check!3(m1); 1105 check!2(m1); 1106 check!1(m1); 1107 for (size_t i=0; i < 2^^16; i++) 1108 { 1109 m1.slice!(4)[i] = i % 2; 1110 m1.slice!(0)[fn1(i)] = fn1(i); 1111 m1.slice!(1)[fn2(i)] = fn2(i); 1112 m1.slice!(2)[fn3(i)] = fn3(i); 1113 m1.slice!(3)[fn4(i)] = fn4(i); 1114 } 1115 for (size_t i=0; i < 2^^16; i++) 1116 { 1117 assert(m1.slice!(4)[i] == i % 2); 1118 assert(m1.slice!(0)[fn1(i)] == fn1(i)); 1119 assert(m1.slice!(1)[fn2(i)] == fn2(i)); 1120 assert(m1.slice!(2)[fn3(i)] == fn3(i)); 1121 assert(m1.slice!(3)[fn4(i)] == fn4(i)); 1122 } 1123 } 1124 1125 size_t spaceFor(size_t _bits)(size_t new_len) @safe pure nothrow @nogc 1126 { 1127 import std.math : nextPow2; 1128 enum bits = _bits == 1 ? 1 : nextPow2(_bits - 1);// see PackedArrayView 1129 static if (bits > 8*size_t.sizeof) 1130 { 1131 static assert(bits % (size_t.sizeof*8) == 0); 1132 return new_len * bits/(8*size_t.sizeof); 1133 } 1134 else 1135 { 1136 enum factor = size_t.sizeof*8/bits; 1137 return (new_len+factor-1)/factor; // rounded up 1138 } 1139 } 1140 1141 template isBitPackableType(T) 1142 { 1143 enum isBitPackableType = isBitPacked!T 1144 || isIntegral!T || is(T == bool) || isSomeChar!T; 1145 } 1146 1147 //============================================================================ 1148 template PackedArrayView(T) 1149 if ((is(T dummy == BitPacked!(U, sz), U, size_t sz) 1150 && isBitPackableType!U) || isBitPackableType!T) 1151 { 1152 import std.math : nextPow2; 1153 private enum bits = bitSizeOf!T; 1154 alias PackedArrayView = PackedArrayViewImpl!(T, bits > 1 ? nextPow2(bits - 1) : 1); 1155 } 1156 1157 //unsafe and fast access to a chunk of RAM as if it contains packed values 1158 template PackedPtr(T) 1159 if ((is(T dummy == BitPacked!(U, sz), U, size_t sz) 1160 && isBitPackableType!U) || isBitPackableType!T) 1161 { 1162 import std.math : nextPow2; 1163 private enum bits = bitSizeOf!T; 1164 alias PackedPtr = PackedPtrImpl!(T, bits > 1 ? nextPow2(bits - 1) : 1); 1165 } 1166 1167 struct PackedPtrImpl(T, size_t bits) 1168 { 1169 pure nothrow: 1170 static assert(isPow2OrZero(bits)); 1171 1172 this(inout(size_t)* ptr)inout @safe @nogc 1173 { 1174 origin = ptr; 1175 } 1176 1177 private T simpleIndex(size_t n) inout 1178 { 1179 immutable q = n / factor; 1180 immutable r = n % factor; 1181 return cast(T)((origin[q] >> bits*r) & mask); 1182 } 1183 1184 private void simpleWrite(TypeOfBitPacked!T val, size_t n) 1185 in 1186 { 1187 static if (isIntegral!T) 1188 assert(val <= mask); 1189 } 1190 do 1191 { 1192 immutable q = n / factor; 1193 immutable r = n % factor; 1194 immutable tgt_shift = bits*r; 1195 immutable word = origin[q]; 1196 origin[q] = (word & ~(mask << tgt_shift)) 1197 | (cast(size_t) val << tgt_shift); 1198 } 1199 1200 static if (factor == bytesPerWord// can safely pack by byte 1201 || factor == 1 // a whole word at a time 1202 || ((factor == bytesPerWord/2 || factor == bytesPerWord/4) 1203 && hasUnalignedReads)) // this needs unaligned reads 1204 { 1205 static if (factor == bytesPerWord) 1206 alias U = ubyte; 1207 else static if (factor == bytesPerWord/2) 1208 alias U = ushort; 1209 else static if (factor == bytesPerWord/4) 1210 alias U = uint; 1211 else static if (size_t.sizeof == 8 && factor == bytesPerWord/8) 1212 alias U = ulong; 1213 1214 T opIndex(size_t idx) inout 1215 { 1216 T ret; 1217 version (LittleEndian) 1218 ret = __ctfe ? simpleIndex(idx) : 1219 cast(inout(T))(cast(U*) origin)[idx]; 1220 else 1221 ret = simpleIndex(idx); 1222 return ret; 1223 } 1224 1225 static if (isBitPacked!T) // lack of user-defined implicit conversion 1226 { 1227 void opIndexAssign(T val, size_t idx) 1228 { 1229 return opIndexAssign(cast(TypeOfBitPacked!T) val, idx); 1230 } 1231 } 1232 1233 void opIndexAssign(TypeOfBitPacked!T val, size_t idx) 1234 { 1235 version (LittleEndian) 1236 { 1237 if (__ctfe) 1238 simpleWrite(val, idx); 1239 else 1240 (cast(U*) origin)[idx] = cast(U) val; 1241 } 1242 else 1243 simpleWrite(val, idx); 1244 } 1245 } 1246 else 1247 { 1248 T opIndex(size_t n) inout 1249 { 1250 return simpleIndex(n); 1251 } 1252 1253 static if (isBitPacked!T) // lack of user-defined implicit conversion 1254 { 1255 void opIndexAssign(T val, size_t idx) 1256 { 1257 return opIndexAssign(cast(TypeOfBitPacked!T) val, idx); 1258 } 1259 } 1260 1261 void opIndexAssign(TypeOfBitPacked!T val, size_t n) 1262 { 1263 return simpleWrite(val, n); 1264 } 1265 } 1266 1267 private: 1268 // factor - number of elements in one machine word 1269 enum factor = size_t.sizeof*8/bits, mask = 2^^bits-1; 1270 enum bytesPerWord = size_t.sizeof; 1271 size_t* origin; 1272 } 1273 1274 // data is packed only by power of two sized packs per word, 1275 // thus avoiding mul/div overhead at the cost of ultimate packing 1276 // this construct doesn't own memory, only provides access, see MultiArray for usage 1277 struct PackedArrayViewImpl(T, size_t bits) 1278 { 1279 pure nothrow: 1280 1281 this(inout(size_t)* origin, size_t offset, size_t items) inout @safe 1282 { 1283 ptr = inout(PackedPtr!(T))(origin); 1284 ofs = offset; 1285 limit = items; 1286 } 1287 1288 bool zeros(size_t s, size_t e) 1289 in 1290 { 1291 assert(s <= e); 1292 } 1293 do 1294 { 1295 s += ofs; 1296 e += ofs; 1297 immutable pad_s = roundUp(s); 1298 if ( s >= e) 1299 { 1300 foreach (i; s .. e) 1301 if (ptr[i]) 1302 return false; 1303 return true; 1304 } 1305 immutable pad_e = roundDown(e); 1306 size_t i; 1307 for (i=s; i<pad_s; i++) 1308 if (ptr[i]) 1309 return false; 1310 // all in between is x*factor elements 1311 for (size_t j=i/factor; i<pad_e; i+=factor, j++) 1312 if (ptr.origin[j]) 1313 return false; 1314 for (; i<e; i++) 1315 if (ptr[i]) 1316 return false; 1317 return true; 1318 } 1319 1320 T opIndex(size_t idx) inout 1321 in 1322 { 1323 assert(idx < limit); 1324 } 1325 do 1326 { 1327 return ptr[ofs + idx]; 1328 } 1329 1330 static if (isBitPacked!T) // lack of user-defined implicit conversion 1331 { 1332 void opIndexAssign(T val, size_t idx) 1333 { 1334 return opIndexAssign(cast(TypeOfBitPacked!T) val, idx); 1335 } 1336 } 1337 1338 void opIndexAssign(TypeOfBitPacked!T val, size_t idx) 1339 in 1340 { 1341 assert(idx < limit); 1342 } 1343 do 1344 { 1345 ptr[ofs + idx] = val; 1346 } 1347 1348 static if (isBitPacked!T) // lack of user-defined implicit conversions 1349 { 1350 void opSliceAssign(T val, size_t start, size_t end) 1351 { 1352 opSliceAssign(cast(TypeOfBitPacked!T) val, start, end); 1353 } 1354 } 1355 1356 void opSliceAssign(TypeOfBitPacked!T val, size_t start, size_t end) 1357 in 1358 { 1359 assert(start <= end); 1360 assert(end <= limit); 1361 } 1362 do 1363 { 1364 // account for ofsetted view 1365 start += ofs; 1366 end += ofs; 1367 // rounded to factor granularity 1368 immutable pad_start = roundUp(start);// rounded up 1369 if (pad_start >= end) //rounded up >= then end of slice 1370 { 1371 //nothing to gain, use per element assignment 1372 foreach (i; start .. end) 1373 ptr[i] = val; 1374 return; 1375 } 1376 immutable pad_end = roundDown(end); // rounded down 1377 size_t i; 1378 for (i=start; i<pad_start; i++) 1379 ptr[i] = val; 1380 // all in between is x*factor elements 1381 if (pad_start != pad_end) 1382 { 1383 immutable repval = replicateBits!(factor, bits)(val); 1384 for (size_t j=i/factor; i<pad_end; i+=factor, j++) 1385 ptr.origin[j] = repval;// so speed it up by factor 1386 } 1387 for (; i<end; i++) 1388 ptr[i] = val; 1389 } 1390 1391 auto opSlice(size_t from, size_t to)inout 1392 in 1393 { 1394 assert(from <= to); 1395 assert(ofs + to <= limit); 1396 } 1397 do 1398 { 1399 return typeof(this)(ptr.origin, ofs + from, to - from); 1400 } 1401 1402 auto opSlice(){ return opSlice(0, length); } 1403 1404 bool opEquals(T)(auto ref T arr) const 1405 { 1406 if (limit != arr.limit) 1407 return false; 1408 size_t s1 = ofs, s2 = arr.ofs; 1409 size_t e1 = s1 + limit, e2 = s2 + limit; 1410 if (s1 % factor == 0 && s2 % factor == 0 && length % factor == 0) 1411 { 1412 return ptr.origin[s1/factor .. e1/factor] 1413 == arr.ptr.origin[s2/factor .. e2/factor]; 1414 } 1415 for (size_t i=0;i<limit; i++) 1416 if (this[i] != arr[i]) 1417 return false; 1418 return true; 1419 } 1420 1421 @property size_t length()const{ return limit; } 1422 1423 private: 1424 auto roundUp()(size_t val){ return (val+factor-1)/factor*factor; } 1425 auto roundDown()(size_t val){ return val/factor*factor; } 1426 // factor - number of elements in one machine word 1427 enum factor = size_t.sizeof*8/bits; 1428 PackedPtr!(T) ptr; 1429 size_t ofs, limit; 1430 } 1431 1432 1433 private struct SliceOverIndexed(T) 1434 { 1435 enum assignableIndex = is(typeof((){ T.init[0] = Item.init; })); 1436 enum assignableSlice = is(typeof((){ T.init[0 .. 0] = Item.init; })); 1437 auto opIndex(size_t idx)const 1438 in 1439 { 1440 assert(idx < to - from); 1441 } 1442 do 1443 { 1444 return (*arr)[from+idx]; 1445 } 1446 1447 static if (assignableIndex) 1448 void opIndexAssign(Item val, size_t idx) 1449 in 1450 { 1451 assert(idx < to - from); 1452 } 1453 do 1454 { 1455 (*arr)[from+idx] = val; 1456 } 1457 1458 auto opSlice(size_t a, size_t b) 1459 { 1460 return typeof(this)(from+a, from+b, arr); 1461 } 1462 1463 // static if (assignableSlice) 1464 void opSliceAssign(T)(T val, size_t start, size_t end) 1465 { 1466 (*arr)[start+from .. end+from] = val; 1467 } 1468 1469 auto opSlice() 1470 { 1471 return typeof(this)(from, to, arr); 1472 } 1473 1474 @property size_t length()const { return to-from;} 1475 1476 auto opDollar()const { return length; } 1477 1478 @property bool empty()const { return from == to; } 1479 1480 @property auto front()const { return (*arr)[from]; } 1481 1482 static if (assignableIndex) 1483 @property void front(Item val) { (*arr)[from] = val; } 1484 1485 @property auto back()const { return (*arr)[to-1]; } 1486 1487 static if (assignableIndex) 1488 @property void back(Item val) { (*arr)[to-1] = val; } 1489 1490 @property auto save() inout { return this; } 1491 1492 void popFront() { from++; } 1493 1494 void popBack() { to--; } 1495 1496 bool opEquals(T)(auto ref T arr) const 1497 { 1498 if (arr.length != length) 1499 return false; 1500 for (size_t i=0; i <length; i++) 1501 if (this[i] != arr[i]) 1502 return false; 1503 return true; 1504 } 1505 private: 1506 alias Item = typeof(T.init[0]); 1507 size_t from, to; 1508 T* arr; 1509 } 1510 1511 @safe pure nothrow @nogc unittest 1512 { 1513 static assert(isRandomAccessRange!(SliceOverIndexed!(int[]))); 1514 } 1515 1516 SliceOverIndexed!(const(T)) sliceOverIndexed(T)(size_t a, size_t b, const(T)* x) 1517 if (is(Unqual!T == T)) 1518 { 1519 return SliceOverIndexed!(const(T))(a, b, x); 1520 } 1521 1522 // BUG? inout is out of reach 1523 //...SliceOverIndexed.arr only parameters or stack based variables can be inout 1524 SliceOverIndexed!T sliceOverIndexed(T)(size_t a, size_t b, T* x) 1525 if (is(Unqual!T == T)) 1526 { 1527 return SliceOverIndexed!T(a, b, x); 1528 } 1529 1530 @system unittest 1531 { 1532 int[] idxArray = [2, 3, 5, 8, 13]; 1533 auto sliced = sliceOverIndexed(0, idxArray.length, &idxArray); 1534 1535 assert(!sliced.empty); 1536 assert(sliced.front == 2); 1537 sliced.front = 1; 1538 assert(sliced.front == 1); 1539 assert(sliced.back == 13); 1540 sliced.popFront(); 1541 assert(sliced.front == 3); 1542 assert(sliced.back == 13); 1543 sliced.back = 11; 1544 assert(sliced.back == 11); 1545 sliced.popBack(); 1546 1547 assert(sliced.front == 3); 1548 assert(sliced[$-1] == 8); 1549 sliced = sliced[]; 1550 assert(sliced[0] == 3); 1551 assert(sliced.back == 8); 1552 sliced = sliced[1..$]; 1553 assert(sliced.front == 5); 1554 sliced = sliced[0..$-1]; 1555 assert(sliced[$-1] == 5); 1556 1557 int[] other = [2, 5]; 1558 assert(sliced[] == sliceOverIndexed(1, 2, &other)); 1559 sliceOverIndexed(0, 2, &idxArray)[0 .. 2] = -1; 1560 assert(idxArray[0 .. 2] == [-1, -1]); 1561 uint[] nullArr = null; 1562 auto nullSlice = sliceOverIndexed(0, 0, &idxArray); 1563 assert(nullSlice.empty); 1564 } 1565 1566 private inout(PackedArrayView!T) packedArrayView(T)(inout(size_t)* ptr, size_t items) 1567 { 1568 return inout(PackedArrayView!T)(ptr, 0, items); 1569 } 1570 1571 1572 //============================================================================ 1573 // Partially unrolled binary search using Shar's method 1574 //============================================================================ 1575 1576 string genUnrolledSwitchSearch(size_t size) @safe pure nothrow 1577 { 1578 import core.bitop : bsr; 1579 import std.array : replace; 1580 import std.conv : to; 1581 assert(isPow2OrZero(size)); 1582 string code = ` 1583 import core.bitop : bsr; 1584 auto power = bsr(m)+1; 1585 switch (power){`; 1586 size_t i = bsr(size); 1587 foreach_reverse (val; 0 .. bsr(size)) 1588 { 1589 auto v = 2^^val; 1590 code ~= ` 1591 case pow: 1592 if (pred(range[idx+m], needle)) 1593 idx += m; 1594 goto case; 1595 `.replace("m", to!string(v)) 1596 .replace("pow", to!string(i)); 1597 i--; 1598 } 1599 code ~= ` 1600 case 0: 1601 if (pred(range[idx], needle)) 1602 idx += 1; 1603 goto default; 1604 `; 1605 code ~= ` 1606 default: 1607 }`; 1608 return code; 1609 } 1610 1611 bool isPow2OrZero(size_t sz) @safe pure nothrow @nogc 1612 { 1613 // See also: std.math.isPowerOf2() 1614 return (sz & (sz-1)) == 0; 1615 } 1616 1617 size_t uniformLowerBound(alias pred, Range, T)(Range range, T needle) 1618 if (is(T : ElementType!Range)) 1619 { 1620 assert(isPow2OrZero(range.length)); 1621 size_t idx = 0, m = range.length/2; 1622 while (m != 0) 1623 { 1624 if (pred(range[idx+m], needle)) 1625 idx += m; 1626 m /= 2; 1627 } 1628 if (pred(range[idx], needle)) 1629 idx += 1; 1630 return idx; 1631 } 1632 1633 size_t switchUniformLowerBound(alias pred, Range, T)(Range range, T needle) 1634 if (is(T : ElementType!Range)) 1635 { 1636 assert(isPow2OrZero(range.length)); 1637 size_t idx = 0, m = range.length/2; 1638 enum max = 1 << 10; 1639 while (m >= max) 1640 { 1641 if (pred(range[idx+m], needle)) 1642 idx += m; 1643 m /= 2; 1644 } 1645 mixin(genUnrolledSwitchSearch(max)); 1646 return idx; 1647 } 1648 1649 template sharMethod(alias uniLowerBound) 1650 { 1651 size_t sharMethod(alias _pred="a<b", Range, T)(Range range, T needle) 1652 if (is(T : ElementType!Range)) 1653 { 1654 import std.functional : binaryFun; 1655 import std.math : nextPow2, truncPow2; 1656 alias pred = binaryFun!_pred; 1657 if (range.length == 0) 1658 return 0; 1659 if (isPow2OrZero(range.length)) 1660 return uniLowerBound!pred(range, needle); 1661 size_t n = truncPow2(range.length); 1662 if (pred(range[n-1], needle)) 1663 {// search in another 2^^k area that fully covers the tail of range 1664 size_t k = nextPow2(range.length - n + 1); 1665 return range.length - k + uniLowerBound!pred(range[$-k..$], needle); 1666 } 1667 else 1668 return uniLowerBound!pred(range[0 .. n], needle); 1669 } 1670 } 1671 1672 alias sharLowerBound = sharMethod!uniformLowerBound; 1673 alias sharSwitchLowerBound = sharMethod!switchUniformLowerBound; 1674 1675 @safe unittest 1676 { 1677 import std.array : array; 1678 import std.range : assumeSorted, iota; 1679 1680 auto stdLowerBound(T)(T[] range, T needle) 1681 { 1682 return assumeSorted(range).lowerBound(needle).length; 1683 } 1684 immutable MAX = 5*1173; 1685 auto arr = array(iota(5, MAX, 5)); 1686 assert(arr.length == MAX/5-1); 1687 foreach (i; 0 .. MAX+5) 1688 { 1689 auto st = stdLowerBound(arr, i); 1690 assert(st == sharLowerBound(arr, i)); 1691 assert(st == sharSwitchLowerBound(arr, i)); 1692 } 1693 arr = []; 1694 auto st = stdLowerBound(arr, 33); 1695 assert(st == sharLowerBound(arr, 33)); 1696 assert(st == sharSwitchLowerBound(arr, 33)); 1697 } 1698 //============================================================================ 1699 1700 @safe 1701 { 1702 // hope to see simillar stuff in public interface... once Allocators are out 1703 //@@@BUG moveFront and friends? dunno, for now it's POD-only 1704 1705 @trusted size_t genericReplace(Policy=void, T, Range) 1706 (ref T dest, size_t from, size_t to, Range stuff) 1707 { 1708 import std.algorithm.mutation : copy; 1709 size_t delta = to - from; 1710 size_t stuff_end = from+stuff.length; 1711 if (stuff.length > delta) 1712 {// replace increases length 1713 delta = stuff.length - delta;// now, new is > old by delta 1714 static if (is(Policy == void)) 1715 dest.length = dest.length+delta;//@@@BUG lame @property 1716 else 1717 dest = Policy.realloc(dest, dest.length+delta); 1718 copyBackwards(dest[to .. dest.length-delta], 1719 dest[to+delta .. dest.length]); 1720 copyForward(stuff, dest[from .. stuff_end]); 1721 } 1722 else if (stuff.length == delta) 1723 { 1724 copy(stuff, dest[from .. to]); 1725 } 1726 else 1727 {// replace decreases length by delta 1728 delta = delta - stuff.length; 1729 copy(stuff, dest[from .. stuff_end]); 1730 copyForward(dest[to .. dest.length], 1731 dest[stuff_end .. dest.length-delta]); 1732 static if (is(Policy == void)) 1733 dest.length = dest.length - delta;//@@@BUG lame @property 1734 else 1735 dest = Policy.realloc(dest, dest.length-delta); 1736 } 1737 return stuff_end; 1738 } 1739 1740 1741 // Simple storage manipulation policy 1742 @safe private struct GcPolicy 1743 { 1744 import std.traits : isDynamicArray; 1745 1746 static T[] dup(T)(const T[] arr) 1747 { 1748 return arr.dup; 1749 } 1750 1751 static T[] alloc(T)(size_t size) 1752 { 1753 return new T[size]; 1754 } 1755 1756 static T[] realloc(T)(T[] arr, size_t sz) 1757 { 1758 arr.length = sz; 1759 return arr; 1760 } 1761 1762 static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff) 1763 { 1764 replaceInPlace(dest, from, to, stuff); 1765 } 1766 1767 static void append(T, V)(ref T[] arr, V value) 1768 if (!isInputRange!V) 1769 { 1770 arr ~= force!T(value); 1771 } 1772 1773 static void append(T, V)(ref T[] arr, V value) 1774 if (isInputRange!V) 1775 { 1776 insertInPlace(arr, arr.length, value); 1777 } 1778 1779 static void destroy(T)(ref T arr) pure // pure required for -dip25, inferred for -dip1000 1780 if (isDynamicArray!T && is(Unqual!T == T)) 1781 { 1782 debug 1783 { 1784 arr[] = cast(typeof(T.init[0]))(0xdead_beef); 1785 } 1786 arr = null; 1787 } 1788 1789 static void destroy(T)(ref T arr) pure // pure required for -dip25, inferred for -dip1000 1790 if (isDynamicArray!T && !is(Unqual!T == T)) 1791 { 1792 arr = null; 1793 } 1794 } 1795 1796 // ditto 1797 @safe struct ReallocPolicy 1798 { 1799 import std.range.primitives : hasLength; 1800 1801 static T[] dup(T)(const T[] arr) 1802 { 1803 auto result = alloc!T(arr.length); 1804 result[] = arr[]; 1805 return result; 1806 } 1807 1808 static T[] alloc(T)(size_t size) @trusted 1809 { 1810 import std.internal.memory : enforceMalloc; 1811 1812 import core.checkedint : mulu; 1813 bool overflow; 1814 size_t nbytes = mulu(size, T.sizeof, overflow); 1815 if (overflow) assert(0); 1816 1817 auto ptr = cast(T*) enforceMalloc(nbytes); 1818 return ptr[0 .. size]; 1819 } 1820 1821 static T[] realloc(T)(scope T[] arr, size_t size) @trusted 1822 { 1823 import std.internal.memory : enforceRealloc; 1824 if (!size) 1825 { 1826 destroy(arr); 1827 return null; 1828 } 1829 1830 import core.checkedint : mulu; 1831 bool overflow; 1832 size_t nbytes = mulu(size, T.sizeof, overflow); 1833 if (overflow) assert(0); 1834 1835 auto ptr = cast(T*) enforceRealloc(arr.ptr, nbytes); 1836 return ptr[0 .. size]; 1837 } 1838 1839 static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff) 1840 { 1841 genericReplace!(ReallocPolicy)(dest, from, to, stuff); 1842 } 1843 1844 static void append(T, V)(ref T[] arr, V value) 1845 if (!isInputRange!V) 1846 { 1847 if (arr.length == size_t.max) assert(0); 1848 arr = realloc(arr, arr.length+1); 1849 arr[$-1] = force!T(value); 1850 } 1851 1852 pure @safe unittest 1853 { 1854 int[] arr; 1855 ReallocPolicy.append(arr, 3); 1856 1857 import std.algorithm.comparison : equal; 1858 assert(equal(arr, [3])); 1859 } 1860 1861 static void append(T, V)(ref T[] arr, V value) 1862 if (isInputRange!V && hasLength!V) 1863 { 1864 import core.checkedint : addu; 1865 bool overflow; 1866 size_t nelems = addu(arr.length, value.length, overflow); 1867 if (overflow) assert(0); 1868 1869 arr = realloc(arr, nelems); 1870 1871 import std.algorithm.mutation : copy; 1872 copy(value, arr[$-value.length..$]); 1873 } 1874 1875 pure @safe unittest 1876 { 1877 int[] arr; 1878 ReallocPolicy.append(arr, [1,2,3]); 1879 1880 import std.algorithm.comparison : equal; 1881 assert(equal(arr, [1,2,3])); 1882 } 1883 1884 static void destroy(T)(scope ref T[] arr) @trusted 1885 { 1886 import core.memory : pureFree; 1887 if (arr.ptr) 1888 pureFree(arr.ptr); 1889 arr = null; 1890 } 1891 } 1892 1893 //build hack 1894 alias _RealArray = CowArray!ReallocPolicy; 1895 1896 pure @safe unittest 1897 { 1898 import std.algorithm.comparison : equal; 1899 1900 with(ReallocPolicy) 1901 { 1902 bool test(T, U, V)(T orig, size_t from, size_t to, U toReplace, V result, 1903 string file = __FILE__, size_t line = __LINE__) 1904 { 1905 { 1906 replaceImpl(orig, from, to, toReplace); 1907 scope(exit) destroy(orig); 1908 if (!equal(orig, result)) 1909 return false; 1910 } 1911 return true; 1912 } 1913 static T[] arr(T)(T[] args... ) 1914 { 1915 return dup(args); 1916 } 1917 1918 assert(test(arr([1, 2, 3, 4]), 0, 0, [5, 6, 7], [5, 6, 7, 1, 2, 3, 4])); 1919 assert(test(arr([1, 2, 3, 4]), 0, 2, cast(int[])[], [3, 4])); 1920 assert(test(arr([1, 2, 3, 4]), 0, 4, [5, 6, 7], [5, 6, 7])); 1921 assert(test(arr([1, 2, 3, 4]), 0, 2, [5, 6, 7], [5, 6, 7, 3, 4])); 1922 assert(test(arr([1, 2, 3, 4]), 2, 3, [5, 6, 7], [1, 2, 5, 6, 7, 4])); 1923 } 1924 } 1925 1926 /** 1927 Tests if T is some kind a set of code points. Intended for template constraints. 1928 */ 1929 public template isCodepointSet(T) 1930 { 1931 static if (is(T dummy == InversionList!(Args), Args...)) 1932 enum isCodepointSet = true; 1933 else 1934 enum isCodepointSet = false; 1935 } 1936 1937 /** 1938 Tests if `T` is a pair of integers that implicitly convert to `V`. 1939 The following code must compile for any pair `T`: 1940 --- 1941 (T x){ V a = x[0]; V b = x[1];} 1942 --- 1943 The following must not compile: 1944 --- 1945 (T x){ V c = x[2];} 1946 --- 1947 */ 1948 public template isIntegralPair(T, V=uint) 1949 { 1950 enum isIntegralPair = is(typeof((T x){ V a = x[0]; V b = x[1];})) 1951 && !is(typeof((T x){ V c = x[2]; })); 1952 } 1953 1954 1955 /** 1956 The recommended default type for set of $(CODEPOINTS). 1957 For details, see the current implementation: $(LREF InversionList). 1958 */ 1959 public alias CodepointSet = InversionList!GcPolicy; 1960 1961 1962 //@@@BUG: std.typecons tuples depend on std.format to produce fields mixin 1963 // which relies on std.uni.isGraphical and this chain blows up with Forward reference error 1964 // hence below doesn't seem to work 1965 // public alias CodepointInterval = Tuple!(uint, "a", uint, "b"); 1966 1967 /** 1968 The recommended type of $(REF Tuple, std,_typecons) 1969 to represent [a, b$(RPAREN) intervals of $(CODEPOINTS). As used in $(LREF InversionList). 1970 Any interval type should pass $(LREF isIntegralPair) trait. 1971 */ 1972 public struct CodepointInterval 1973 { 1974 pure: 1975 uint[2] _tuple; 1976 alias _tuple this; 1977 1978 @safe pure nothrow @nogc: 1979 1980 this(uint low, uint high) 1981 { 1982 _tuple[0] = low; 1983 _tuple[1] = high; 1984 } 1985 bool opEquals(T)(T val) const 1986 { 1987 return this[0] == val[0] && this[1] == val[1]; 1988 } 1989 @property ref inout(uint) a() inout { return _tuple[0]; } 1990 @property ref inout(uint) b() inout { return _tuple[1]; } 1991 } 1992 1993 /** 1994 $(P 1995 `InversionList` is a set of $(CODEPOINTS) 1996 represented as an array of open-right [a, b$(RPAREN) 1997 intervals (see $(LREF CodepointInterval) above). 1998 The name comes from the way the representation reads left to right. 1999 For instance a set of all values [10, 50$(RPAREN), [80, 90$(RPAREN), 2000 plus a singular value 60 looks like this: 2001 ) 2002 --- 2003 10, 50, 60, 61, 80, 90 2004 --- 2005 $(P 2006 The way to read this is: start with negative meaning that all numbers 2007 smaller then the next one are not present in this set (and positive - 2008 the contrary). Then switch positive/negative after each 2009 number passed from left to right. 2010 ) 2011 $(P This way negative spans until 10, then positive until 50, 2012 then negative until 60, then positive until 61, and so on. 2013 As seen this provides a space-efficient storage of highly redundant data 2014 that comes in long runs. A description which Unicode $(CHARACTER) 2015 properties fit nicely. The technique itself could be seen as a variation 2016 on $(LINK2 https://en.wikipedia.org/wiki/Run-length_encoding, RLE encoding). 2017 ) 2018 2019 $(P Sets are value types (just like `int` is) thus they 2020 are never aliased. 2021 ) 2022 Example: 2023 --- 2024 auto a = CodepointSet('a', 'z'+1); 2025 auto b = CodepointSet('A', 'Z'+1); 2026 auto c = a; 2027 a = a | b; 2028 assert(a == CodepointSet('A', 'Z'+1, 'a', 'z'+1)); 2029 assert(a != c); 2030 --- 2031 $(P See also $(LREF unicode) for simpler construction of sets 2032 from predefined ones. 2033 ) 2034 2035 $(P Memory usage is 8 bytes per each contiguous interval in a set. 2036 The value semantics are achieved by using the 2037 $(HTTP en.wikipedia.org/wiki/Copy-on-write, COW) technique 2038 and thus it's $(RED not) safe to cast this type to $(D_KEYWORD shared). 2039 ) 2040 2041 Note: 2042 $(P It's not recommended to rely on the template parameters 2043 or the exact type of a current $(CODEPOINT) set in `std.uni`. 2044 The type and parameters may change when the standard 2045 allocators design is finalized. 2046 Use $(LREF isCodepointSet) with templates or just stick with the default 2047 alias $(LREF CodepointSet) throughout the whole code base. 2048 ) 2049 */ 2050 public struct InversionList(SP=GcPolicy) 2051 { 2052 import std.range : assumeSorted; 2053 2054 /** 2055 Construct from another code point set of any type. 2056 */ 2057 this(Set)(Set set) pure 2058 if (isCodepointSet!Set) 2059 { 2060 uint[] arr; 2061 foreach (v; set.byInterval) 2062 { 2063 arr ~= v.a; 2064 arr ~= v.b; 2065 } 2066 data = CowArray!(SP).reuse(arr); 2067 } 2068 2069 /** 2070 Construct a set from a forward range of code point intervals. 2071 */ 2072 this(Range)(Range intervals) pure 2073 if (isForwardRange!Range && isIntegralPair!(ElementType!Range)) 2074 { 2075 uint[] arr; 2076 foreach (v; intervals) 2077 { 2078 SP.append(arr, v.a); 2079 SP.append(arr, v.b); 2080 } 2081 data = CowArray!(SP).reuse(arr); 2082 sanitize(); //enforce invariant: sort intervals etc. 2083 } 2084 2085 //helper function that avoids sanity check to be CTFE-friendly 2086 private static fromIntervals(Range)(Range intervals) pure 2087 { 2088 import std.algorithm.iteration : map; 2089 import std.range : roundRobin; 2090 auto flattened = roundRobin(intervals.save.map!"a[0]"(), 2091 intervals.save.map!"a[1]"()); 2092 InversionList set; 2093 set.data = CowArray!(SP)(flattened); 2094 return set; 2095 } 2096 //ditto untill sort is CTFE-able 2097 private static fromIntervals()(uint[] intervals...) pure 2098 in 2099 { 2100 import std.conv : text; 2101 assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!"); 2102 for (uint i = 0; i < intervals.length; i += 2) 2103 { 2104 auto a = intervals[i], b = intervals[i+1]; 2105 assert(a < b, text("illegal interval [a, b): ", a, " > ", b)); 2106 } 2107 } 2108 do 2109 { 2110 InversionList set; 2111 set.data = CowArray!(SP)(intervals); 2112 return set; 2113 } 2114 2115 /** 2116 Construct a set from plain values of code point intervals. 2117 */ 2118 this()(uint[] intervals...) 2119 in 2120 { 2121 import std.conv : text; 2122 assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!"); 2123 for (uint i = 0; i < intervals.length; i += 2) 2124 { 2125 auto a = intervals[i], b = intervals[i+1]; 2126 assert(a < b, text("illegal interval [a, b): ", a, " > ", b)); 2127 } 2128 } 2129 do 2130 { 2131 data = CowArray!(SP)(intervals); 2132 sanitize(); //enforce invariant: sort intervals etc. 2133 } 2134 2135 /// 2136 pure @safe unittest 2137 { 2138 import std.algorithm.comparison : equal; 2139 2140 auto set = CodepointSet('a', 'z'+1, 'а', 'я'+1); 2141 foreach (v; 'a'..'z'+1) 2142 assert(set[v]); 2143 // Cyrillic lowercase interval 2144 foreach (v; 'а'..'я'+1) 2145 assert(set[v]); 2146 //specific order is not required, intervals may interesect 2147 auto set2 = CodepointSet('а', 'я'+1, 'a', 'd', 'b', 'z'+1); 2148 //the same end result 2149 assert(set2.byInterval.equal(set.byInterval)); 2150 // test constructor this(Range)(Range intervals) 2151 auto chessPiecesWhite = CodepointInterval(9812, 9818); 2152 auto chessPiecesBlack = CodepointInterval(9818, 9824); 2153 auto set3 = CodepointSet([chessPiecesWhite, chessPiecesBlack]); 2154 foreach (v; '♔'..'♟'+1) 2155 assert(set3[v]); 2156 } 2157 2158 /** 2159 Get range that spans all of the $(CODEPOINT) intervals in this $(LREF InversionList). 2160 */ 2161 @property auto byInterval() scope 2162 { 2163 // TODO: change this to data[] once the -dip1000 errors have been fixed 2164 // see e.g. https://github.com/dlang/phobos/pull/6638 2165 import std.array : array; 2166 return Intervals!(typeof(data.array))(data.array); 2167 } 2168 2169 @safe unittest 2170 { 2171 import std.algorithm.comparison : equal; 2172 import std.typecons : tuple; 2173 2174 auto set = CodepointSet('A', 'D'+1, 'a', 'd'+1); 2175 2176 assert(set.byInterval.equal([tuple('A','E'), tuple('a','e')])); 2177 } 2178 2179 package(std) @property const(CodepointInterval)[] intervals() const 2180 { 2181 import std.array : array; 2182 return Intervals!(typeof(data[]))(data[]).array; 2183 } 2184 2185 /** 2186 Tests the presence of code point `val` in this set. 2187 */ 2188 bool opIndex(uint val) const 2189 { 2190 // the <= ensures that searching in interval of [a, b) for 'a' you get .length == 1 2191 // return assumeSorted!((a,b) => a <= b)(data[]).lowerBound(val).length & 1; 2192 return sharSwitchLowerBound!"a <= b"(data[], val) & 1; 2193 } 2194 2195 /// 2196 pure @safe unittest 2197 { 2198 auto gothic = unicode.Gothic; 2199 // Gothic letter ahsa 2200 assert(gothic['\U00010330']); 2201 // no ascii in Gothic obviously 2202 assert(!gothic['$']); 2203 } 2204 2205 2206 // Linear scan for `ch`. Useful only for small sets. 2207 // TODO: 2208 // used internally in std.regex 2209 // should be properly exposed in a public API ? 2210 package(std) auto scanFor()(dchar ch) const 2211 { 2212 immutable len = data.length; 2213 for (size_t i = 0; i < len; i++) 2214 if (ch < data[i]) 2215 return i & 1; 2216 return 0; 2217 } 2218 2219 /// Number of $(CODEPOINTS) in this set 2220 @property size_t length() 2221 { 2222 size_t sum = 0; 2223 foreach (iv; byInterval) 2224 { 2225 sum += iv.b - iv.a; 2226 } 2227 return sum; 2228 } 2229 2230 // bootstrap full set operations from 4 primitives (suitable as a template mixin): 2231 // addInterval, skipUpTo, dropUpTo & byInterval iteration 2232 //============================================================================ 2233 public: 2234 /** 2235 $(P Sets support natural syntax for set algebra, namely: ) 2236 $(BOOKTABLE , 2237 $(TR $(TH Operator) $(TH Math notation) $(TH Description) ) 2238 $(TR $(TD &) $(TD a ∩ b) $(TD intersection) ) 2239 $(TR $(TD |) $(TD a ∪ b) $(TD union) ) 2240 $(TR $(TD -) $(TD a ∖ b) $(TD subtraction) ) 2241 $(TR $(TD ~) $(TD a ~ b) $(TD symmetric set difference i.e. (a ∪ b) \ (a ∩ b)) ) 2242 ) 2243 */ 2244 This opBinary(string op, U)(U rhs) 2245 if (isCodepointSet!U || is(U:dchar)) 2246 { 2247 static if (op == "&" || op == "|" || op == "~") 2248 {// symmetric ops thus can swap arguments to reuse r-value 2249 static if (is(U:dchar)) 2250 { 2251 auto tmp = this; 2252 mixin("tmp "~op~"= rhs; "); 2253 return tmp; 2254 } 2255 else 2256 { 2257 static if (is(Unqual!U == U)) 2258 { 2259 // try hard to reuse r-value 2260 mixin("rhs "~op~"= this;"); 2261 return rhs; 2262 } 2263 else 2264 { 2265 auto tmp = this; 2266 mixin("tmp "~op~"= rhs;"); 2267 return tmp; 2268 } 2269 } 2270 } 2271 else static if (op == "-") // anti-symmetric 2272 { 2273 auto tmp = this; 2274 tmp -= rhs; 2275 return tmp; 2276 } 2277 else 2278 static assert(0, "no operator "~op~" defined for Set"); 2279 } 2280 2281 /// 2282 pure @safe unittest 2283 { 2284 import std.algorithm.comparison : equal; 2285 import std.range : iota; 2286 2287 auto lower = unicode.LowerCase; 2288 auto upper = unicode.UpperCase; 2289 auto ascii = unicode.ASCII; 2290 2291 assert((lower & upper).empty); // no intersection 2292 auto lowerASCII = lower & ascii; 2293 assert(lowerASCII.byCodepoint.equal(iota('a', 'z'+1))); 2294 // throw away all of the lowercase ASCII 2295 assert((ascii - lower).length == 128 - 26); 2296 2297 auto onlyOneOf = lower ~ ascii; 2298 assert(!onlyOneOf['Δ']); // not ASCII and not lowercase 2299 assert(onlyOneOf['$']); // ASCII and not lowercase 2300 assert(!onlyOneOf['a']); // ASCII and lowercase 2301 assert(onlyOneOf['я']); // not ASCII but lowercase 2302 2303 // throw away all cased letters from ASCII 2304 auto noLetters = ascii - (lower | upper); 2305 assert(noLetters.length == 128 - 26*2); 2306 } 2307 2308 /// The 'op=' versions of the above overloaded operators. 2309 ref This opOpAssign(string op, U)(U rhs) 2310 if (isCodepointSet!U || is(U:dchar)) 2311 { 2312 static if (op == "|") // union 2313 { 2314 static if (is(U:dchar)) 2315 { 2316 this.addInterval(rhs, rhs+1); 2317 return this; 2318 } 2319 else 2320 return this.add(rhs); 2321 } 2322 else static if (op == "&") // intersection 2323 return this.intersect(rhs);// overloaded 2324 else static if (op == "-") // set difference 2325 return this.sub(rhs);// overloaded 2326 else static if (op == "~") // symmetric set difference 2327 { 2328 auto copy = this & rhs; 2329 this |= rhs; 2330 this -= copy; 2331 return this; 2332 } 2333 else 2334 static assert(0, "no operator "~op~" defined for Set"); 2335 } 2336 2337 /** 2338 Tests the presence of codepoint `ch` in this set, 2339 the same as $(LREF opIndex). 2340 */ 2341 bool opBinaryRight(string op: "in", U)(U ch) const 2342 if (is(U : dchar)) 2343 { 2344 return this[ch]; 2345 } 2346 2347 /// 2348 pure @safe unittest 2349 { 2350 assert('я' in unicode.Cyrillic); 2351 assert(!('z' in unicode.Cyrillic)); 2352 } 2353 2354 2355 2356 /** 2357 * Obtains a set that is the inversion of this set. 2358 * 2359 * See_Also: $(LREF inverted) 2360 */ 2361 auto opUnary(string op: "!")() 2362 { 2363 return this.inverted; 2364 } 2365 2366 /** 2367 A range that spans each $(CODEPOINT) in this set. 2368 */ 2369 @property auto byCodepoint() 2370 { 2371 static struct CodepointRange 2372 { 2373 this(This set) 2374 { 2375 r = set.byInterval; 2376 if (!r.empty) 2377 cur = r.front.a; 2378 } 2379 2380 @property dchar front() const 2381 { 2382 return cast(dchar) cur; 2383 } 2384 2385 @property bool empty() const 2386 { 2387 return r.empty; 2388 } 2389 2390 void popFront() 2391 { 2392 cur++; 2393 while (cur >= r.front.b) 2394 { 2395 r.popFront(); 2396 if (r.empty) 2397 break; 2398 cur = r.front.a; 2399 } 2400 } 2401 private: 2402 uint cur; 2403 typeof(This.init.byInterval) r; 2404 } 2405 2406 return CodepointRange(this); 2407 } 2408 2409 /// 2410 pure @safe unittest 2411 { 2412 import std.algorithm.comparison : equal; 2413 import std.range : iota; 2414 2415 auto set = unicode.ASCII; 2416 set.byCodepoint.equal(iota(0, 0x80)); 2417 } 2418 2419 /** 2420 $(P Obtain textual representation of this set in from of 2421 open-right intervals and feed it to `sink`. 2422 ) 2423 $(P Used by various standard formatting facilities such as 2424 $(REF formattedWrite, std,format), $(REF write, std,stdio), 2425 $(REF writef, std,stdio), $(REF to, std,conv) and others. 2426 ) 2427 Example: 2428 --- 2429 import std.conv; 2430 assert(unicode.ASCII.to!string == "[0..128$(RPAREN)"); 2431 --- 2432 */ 2433 2434 private import std.format : FormatSpec; 2435 2436 /*************************************** 2437 * Obtain a textual representation of this InversionList 2438 * in form of open-right intervals. 2439 * 2440 * The formatting flag is applied individually to each value, for example: 2441 * $(LI $(B %s) and $(B %d) format the intervals as a [low .. high$(RPAREN) range of integrals) 2442 * $(LI $(B %x) formats the intervals as a [low .. high$(RPAREN) range of lowercase hex characters) 2443 * $(LI $(B %X) formats the intervals as a [low .. high$(RPAREN) range of uppercase hex characters) 2444 */ 2445 void toString(Writer)(scope Writer sink, scope const ref FormatSpec!char fmt) /* const */ 2446 { 2447 import std.format : formatValue; 2448 auto range = byInterval; 2449 if (range.empty) 2450 return; 2451 2452 while (1) 2453 { 2454 auto i = range.front; 2455 range.popFront(); 2456 2457 put(sink, "["); 2458 formatValue(sink, i.a, fmt); 2459 put(sink, ".."); 2460 formatValue(sink, i.b, fmt); 2461 put(sink, ")"); 2462 if (range.empty) return; 2463 put(sink, " "); 2464 } 2465 } 2466 2467 /// 2468 pure @safe unittest 2469 { 2470 import std.conv : to; 2471 import std.format : format; 2472 import std.uni : unicode; 2473 2474 assert(unicode.Cyrillic.to!string == 2475 "[1024..1157) [1159..1320) [7467..7468) [7544..7545) [11744..11776) [42560..42648) [42655..42656)"); 2476 2477 // The specs '%s' and '%d' are equivalent to the to!string call above. 2478 assert(format("%d", unicode.Cyrillic) == unicode.Cyrillic.to!string); 2479 2480 assert(format("%#x", unicode.Cyrillic) == 2481 "[0x400..0x485) [0x487..0x528) [0x1d2b..0x1d2c) [0x1d78..0x1d79) [0x2de0..0x2e00) " 2482 ~"[0xa640..0xa698) [0xa69f..0xa6a0)"); 2483 2484 assert(format("%#X", unicode.Cyrillic) == 2485 "[0X400..0X485) [0X487..0X528) [0X1D2B..0X1D2C) [0X1D78..0X1D79) [0X2DE0..0X2E00) " 2486 ~"[0XA640..0XA698) [0XA69F..0XA6A0)"); 2487 } 2488 2489 pure @safe unittest 2490 { 2491 import std.exception : assertThrown; 2492 import std.format : format, FormatException; 2493 assertThrown!FormatException(format("%a", unicode.ASCII)); 2494 } 2495 2496 2497 /** 2498 Add an interval [a, b$(RPAREN) to this set. 2499 */ 2500 ref add()(uint a, uint b) 2501 { 2502 addInterval(a, b); 2503 return this; 2504 } 2505 2506 /// 2507 pure @safe unittest 2508 { 2509 CodepointSet someSet; 2510 someSet.add('0', '5').add('A','Z'+1); 2511 someSet.add('5', '9'+1); 2512 assert(someSet['0']); 2513 assert(someSet['5']); 2514 assert(someSet['9']); 2515 assert(someSet['Z']); 2516 } 2517 2518 private: 2519 2520 package(std) // used from: std.regex.internal.parser 2521 ref intersect(U)(U rhs) 2522 if (isCodepointSet!U) 2523 { 2524 Marker mark; 2525 foreach ( i; rhs.byInterval) 2526 { 2527 mark = this.dropUpTo(i.a, mark); 2528 mark = this.skipUpTo(i.b, mark); 2529 } 2530 this.dropUpTo(uint.max, mark); 2531 return this; 2532 } 2533 2534 ref intersect()(dchar ch) 2535 { 2536 foreach (i; byInterval) 2537 if (i.a <= ch && ch < i.b) 2538 return this = This.init.add(ch, ch+1); 2539 this = This.init; 2540 return this; 2541 } 2542 2543 pure @safe unittest 2544 { 2545 assert(unicode.Cyrillic.intersect('-').byInterval.empty); 2546 } 2547 2548 ref sub()(dchar ch) 2549 { 2550 return subChar(ch); 2551 } 2552 2553 // same as the above except that skip & drop parts are swapped 2554 package(std) // used from: std.regex.internal.parser 2555 ref sub(U)(U rhs) 2556 if (isCodepointSet!U) 2557 { 2558 Marker mark; 2559 foreach (i; rhs.byInterval) 2560 { 2561 mark = this.skipUpTo(i.a, mark); 2562 mark = this.dropUpTo(i.b, mark); 2563 } 2564 return this; 2565 } 2566 2567 package(std) // used from: std.regex.internal.parse 2568 ref add(U)(U rhs) 2569 if (isCodepointSet!U) 2570 { 2571 Marker start; 2572 foreach (i; rhs.byInterval) 2573 { 2574 start = addInterval(i.a, i.b, start); 2575 } 2576 return this; 2577 } 2578 2579 // end of mixin-able part 2580 //============================================================================ 2581 public: 2582 /** 2583 Obtains a set that is the inversion of this set. 2584 2585 See the '!' $(LREF opUnary) for the same but using operators. 2586 */ 2587 @property auto inverted() 2588 { 2589 InversionList inversion = this; 2590 if (inversion.data.length == 0) 2591 { 2592 inversion.addInterval(0, lastDchar+1); 2593 return inversion; 2594 } 2595 if (inversion.data[0] != 0) 2596 genericReplace(inversion.data, 0, 0, [0]); 2597 else 2598 genericReplace(inversion.data, 0, 1, cast(uint[]) null); 2599 if (data[data.length-1] != lastDchar+1) 2600 genericReplace(inversion.data, 2601 inversion.data.length, inversion.data.length, [lastDchar+1]); 2602 else 2603 genericReplace(inversion.data, 2604 inversion.data.length-1, inversion.data.length, cast(uint[]) null); 2605 2606 return inversion; 2607 } 2608 2609 /// 2610 pure @safe unittest 2611 { 2612 auto set = unicode.ASCII; 2613 // union with the inverse gets all of the code points in the Unicode 2614 assert((set | set.inverted).length == 0x110000); 2615 // no intersection with the inverse 2616 assert((set & set.inverted).empty); 2617 } 2618 2619 package(std) static string toSourceCode(const(CodepointInterval)[] range, string funcName) 2620 { 2621 import std.algorithm.searching : countUntil; 2622 import std.format : format; 2623 enum maxBinary = 3; 2624 static string linearScope(R)(R ivals, string indent) 2625 { 2626 string result = indent~"{\n"; 2627 string deeper = indent~" "; 2628 foreach (ival; ivals) 2629 { 2630 immutable span = ival[1] - ival[0]; 2631 assert(span != 0); 2632 if (span == 1) 2633 { 2634 result ~= format("%sif (ch == %s) return true;\n", deeper, ival[0]); 2635 } 2636 else if (span == 2) 2637 { 2638 result ~= format("%sif (ch == %s || ch == %s) return true;\n", 2639 deeper, ival[0], ival[0]+1); 2640 } 2641 else 2642 { 2643 if (ival[0] != 0) // dchar is unsigned and < 0 is useless 2644 result ~= format("%sif (ch < %s) return false;\n", deeper, ival[0]); 2645 result ~= format("%sif (ch < %s) return true;\n", deeper, ival[1]); 2646 } 2647 } 2648 result ~= format("%sreturn false;\n%s}\n", deeper, indent); // including empty range of intervals 2649 return result; 2650 } 2651 2652 static string binaryScope(R)(R ivals, string indent) @safe 2653 { 2654 // time to do unrolled comparisons? 2655 if (ivals.length < maxBinary) 2656 return linearScope(ivals, indent); 2657 else 2658 return bisect(ivals, ivals.length/2, indent); 2659 } 2660 2661 // not used yet if/elsebinary search is far better with DMD as of 2.061 2662 // and GDC is doing fine job either way 2663 static string switchScope(R)(R ivals, string indent) 2664 { 2665 string result = indent~"switch (ch){\n"; 2666 string deeper = indent~" "; 2667 foreach (ival; ivals) 2668 { 2669 if (ival[0]+1 == ival[1]) 2670 { 2671 result ~= format("%scase %s: return true;\n", 2672 deeper, ival[0]); 2673 } 2674 else 2675 { 2676 result ~= format("%scase %s: .. case %s: return true;\n", 2677 deeper, ival[0], ival[1]-1); 2678 } 2679 } 2680 result ~= deeper~"default: return false;\n"~indent~"}\n"; 2681 return result; 2682 } 2683 2684 static string bisect(R)(R range, size_t idx, string indent) 2685 { 2686 string deeper = indent ~ " "; 2687 // bisect on one [a, b) interval at idx 2688 string result = indent~"{\n"; 2689 // less branch, < a 2690 result ~= format("%sif (ch < %s)\n%s", 2691 deeper, range[idx][0], binaryScope(range[0 .. idx], deeper)); 2692 // middle point, >= a && < b 2693 result ~= format("%selse if (ch < %s) return true;\n", 2694 deeper, range[idx][1]); 2695 // greater or equal branch, >= b 2696 result ~= format("%selse\n%s", 2697 deeper, binaryScope(range[idx+1..$], deeper)); 2698 return result~indent~"}\n"; 2699 } 2700 2701 string code = format("bool %s(dchar ch) @safe pure nothrow @nogc\n", 2702 funcName.empty ? "function" : funcName); 2703 // special case first bisection to be on ASCII vs beyond 2704 auto tillAscii = countUntil!"a[0] > 0x80"(range); 2705 if (tillAscii <= 0) // everything is ASCII or nothing is ascii (-1 & 0) 2706 code ~= binaryScope(range, ""); 2707 else 2708 code ~= bisect(range, tillAscii, ""); 2709 return code; 2710 } 2711 2712 /** 2713 Generates string with D source code of unary function with name of 2714 `funcName` taking a single `dchar` argument. If `funcName` is empty 2715 the code is adjusted to be a lambda function. 2716 2717 The function generated tests if the $(CODEPOINT) passed 2718 belongs to this set or not. The result is to be used with string mixin. 2719 The intended usage area is aggressive optimization via meta programming 2720 in parser generators and the like. 2721 2722 Note: Use with care for relatively small or regular sets. It 2723 could end up being slower then just using multi-staged tables. 2724 2725 Example: 2726 --- 2727 import std.stdio; 2728 2729 // construct set directly from [a, b$RPAREN intervals 2730 auto set = CodepointSet(10, 12, 45, 65, 100, 200); 2731 writeln(set); 2732 writeln(set.toSourceCode("func")); 2733 --- 2734 2735 The above outputs something along the lines of: 2736 --- 2737 bool func(dchar ch) @safe pure nothrow @nogc 2738 { 2739 if (ch < 45) 2740 { 2741 if (ch == 10 || ch == 11) return true; 2742 return false; 2743 } 2744 else if (ch < 65) return true; 2745 else 2746 { 2747 if (ch < 100) return false; 2748 if (ch < 200) return true; 2749 return false; 2750 } 2751 } 2752 --- 2753 */ 2754 string toSourceCode(string funcName="") 2755 { 2756 import std.array : array; 2757 auto range = byInterval.array(); 2758 return toSourceCode(range, funcName); 2759 } 2760 2761 /** 2762 True if this set doesn't contain any $(CODEPOINTS). 2763 */ 2764 @property bool empty() const 2765 { 2766 return data.length == 0; 2767 } 2768 2769 /// 2770 pure @safe unittest 2771 { 2772 CodepointSet emptySet; 2773 assert(emptySet.length == 0); 2774 assert(emptySet.empty); 2775 } 2776 2777 private: 2778 alias This = typeof(this); 2779 alias Marker = size_t; 2780 2781 // a random-access range of integral pairs 2782 static struct Intervals(Range) 2783 { 2784 import std.range.primitives : hasAssignableElements; 2785 2786 this(Range sp) scope 2787 { 2788 slice = sp; 2789 start = 0; 2790 end = sp.length; 2791 } 2792 2793 this(Range sp, size_t s, size_t e) scope 2794 { 2795 slice = sp; 2796 start = s; 2797 end = e; 2798 } 2799 2800 @property auto front()const 2801 { 2802 immutable a = slice[start]; 2803 immutable b = slice[start+1]; 2804 return CodepointInterval(a, b); 2805 } 2806 2807 //may break sorted property - but we need std.sort to access it 2808 //hence package(std) protection attribute 2809 static if (hasAssignableElements!Range) 2810 package(std) @property void front(CodepointInterval val) 2811 { 2812 slice[start] = val.a; 2813 slice[start+1] = val.b; 2814 } 2815 2816 @property auto back()const 2817 { 2818 immutable a = slice[end-2]; 2819 immutable b = slice[end-1]; 2820 return CodepointInterval(a, b); 2821 } 2822 2823 //ditto about package 2824 static if (hasAssignableElements!Range) 2825 package(std) @property void back(CodepointInterval val) 2826 { 2827 slice[end-2] = val.a; 2828 slice[end-1] = val.b; 2829 } 2830 2831 void popFront() 2832 { 2833 start += 2; 2834 } 2835 2836 void popBack() 2837 { 2838 end -= 2; 2839 } 2840 2841 auto opIndex(size_t idx) const 2842 { 2843 immutable a = slice[start+idx*2]; 2844 immutable b = slice[start+idx*2+1]; 2845 return CodepointInterval(a, b); 2846 } 2847 2848 //ditto about package 2849 static if (hasAssignableElements!Range) 2850 package(std) void opIndexAssign(CodepointInterval val, size_t idx) 2851 { 2852 slice[start+idx*2] = val.a; 2853 slice[start+idx*2+1] = val.b; 2854 } 2855 2856 auto opSlice(size_t s, size_t e) 2857 { 2858 return Intervals(slice, s*2+start, e*2+start); 2859 } 2860 2861 @property size_t length()const { return slice.length/2; } 2862 2863 @property bool empty()const { return start == end; } 2864 2865 @property auto save(){ return this; } 2866 private: 2867 size_t start, end; 2868 Range slice; 2869 } 2870 2871 // called after construction from intervals 2872 // to make sure invariants hold 2873 void sanitize() 2874 { 2875 import std.algorithm.comparison : max; 2876 import std.algorithm.mutation : SwapStrategy; 2877 import std.algorithm.sorting : sort; 2878 if (data.length == 0) 2879 return; 2880 alias Ival = CodepointInterval; 2881 //intervals wrapper for a _range_ over packed array 2882 auto ivals = Intervals!(typeof(data[]))(data[]); 2883 //@@@BUG@@@ can't use "a.a < b.a" see 2884 // https://issues.dlang.org/show_bug.cgi?id=12265 2885 sort!((a,b) => a.a < b.a, SwapStrategy.stable)(ivals); 2886 // what follows is a variation on stable remove 2887 // differences: 2888 // - predicate is binary, and is tested against 2889 // the last kept element (at 'i'). 2890 // - predicate mutates lhs (merges rhs into lhs) 2891 size_t len = ivals.length; 2892 size_t i = 0; 2893 size_t j = 1; 2894 while (j < len) 2895 { 2896 if (ivals[i].b >= ivals[j].a) 2897 { 2898 ivals[i] = Ival(ivals[i].a, max(ivals[i].b, ivals[j].b)); 2899 j++; 2900 } 2901 else //unmergable 2902 { 2903 // check if there is a hole after merges 2904 // (in the best case we do 0 writes to ivals) 2905 if (j != i+1) 2906 ivals[i+1] = ivals[j]; //copy over 2907 i++; 2908 j++; 2909 } 2910 } 2911 len = i + 1; 2912 for (size_t k=0; k + 1 < len; k++) 2913 { 2914 assert(ivals[k].a < ivals[k].b); 2915 assert(ivals[k].b < ivals[k+1].a); 2916 } 2917 data.length = len * 2; 2918 } 2919 2920 // special case for normal InversionList 2921 ref subChar(dchar ch) 2922 { 2923 auto mark = skipUpTo(ch); 2924 if (mark != data.length 2925 && data[mark] == ch && data[mark-1] == ch) 2926 { 2927 // it has split, meaning that ch happens to be in one of intervals 2928 data[mark] = data[mark]+1; 2929 } 2930 return this; 2931 } 2932 2933 // 2934 Marker addInterval(int a, int b, Marker hint=Marker.init) scope 2935 in 2936 { 2937 assert(a <= b); 2938 } 2939 do 2940 { 2941 import std.range : assumeSorted, SearchPolicy; 2942 auto range = assumeSorted(data[]); 2943 size_t pos; 2944 size_t a_idx = hint + range[hint..$].lowerBound!(SearchPolicy.gallop)(a).length; 2945 if (a_idx == range.length) 2946 { 2947 // [---+++----++++----++++++] 2948 // [ a b] 2949 data.append(a, b); 2950 return data.length-1; 2951 } 2952 size_t b_idx = range[a_idx .. range.length].lowerBound!(SearchPolicy.gallop)(b).length+a_idx; 2953 uint[3] buf = void; 2954 uint to_insert; 2955 debug(std_uni) 2956 { 2957 writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx); 2958 } 2959 if (b_idx == range.length) 2960 { 2961 // [-------++++++++----++++++-] 2962 // [ s a b] 2963 if (a_idx & 1)// a in positive 2964 { 2965 buf[0] = b; 2966 to_insert = 1; 2967 } 2968 else// a in negative 2969 { 2970 buf[0] = a; 2971 buf[1] = b; 2972 to_insert = 2; 2973 } 2974 pos = genericReplace(data, a_idx, b_idx, buf[0 .. to_insert]); 2975 return pos - 1; 2976 } 2977 2978 uint top = data[b_idx]; 2979 2980 debug(std_uni) 2981 { 2982 writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx); 2983 writefln("a=%s; b=%s; top=%s;", a, b, top); 2984 } 2985 if (a_idx & 1) 2986 {// a in positive 2987 if (b_idx & 1)// b in positive 2988 { 2989 // [-------++++++++----++++++-] 2990 // [ s a b ] 2991 buf[0] = top; 2992 to_insert = 1; 2993 } 2994 else // b in negative 2995 { 2996 // [-------++++++++----++++++-] 2997 // [ s a b ] 2998 if (top == b) 2999 { 3000 assert(b_idx+1 < data.length); 3001 buf[0] = data[b_idx+1]; 3002 pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 1]); 3003 return pos - 1; 3004 } 3005 buf[0] = b; 3006 buf[1] = top; 3007 to_insert = 2; 3008 } 3009 } 3010 else 3011 { // a in negative 3012 if (b_idx & 1) // b in positive 3013 { 3014 // [----------+++++----++++++-] 3015 // [ a b ] 3016 buf[0] = a; 3017 buf[1] = top; 3018 to_insert = 2; 3019 } 3020 else// b in negative 3021 { 3022 // [----------+++++----++++++-] 3023 // [ a s b ] 3024 if (top == b) 3025 { 3026 assert(b_idx+1 < data.length); 3027 buf[0] = a; 3028 buf[1] = data[b_idx+1]; 3029 pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 2]); 3030 return pos - 1; 3031 } 3032 buf[0] = a; 3033 buf[1] = b; 3034 buf[2] = top; 3035 to_insert = 3; 3036 } 3037 } 3038 pos = genericReplace(data, a_idx, b_idx+1, buf[0 .. to_insert]); 3039 debug(std_uni) 3040 { 3041 writefln("marker idx: %d; length=%d", pos, data[pos], data.length); 3042 writeln("inserting ", buf[0 .. to_insert]); 3043 } 3044 return pos - 1; 3045 } 3046 3047 // 3048 Marker dropUpTo(uint a, Marker pos=Marker.init) 3049 in 3050 { 3051 assert(pos % 2 == 0); // at start of interval 3052 } 3053 do 3054 { 3055 auto range = assumeSorted!"a <= b"(data[pos .. data.length]); 3056 if (range.empty) 3057 return pos; 3058 size_t idx = pos; 3059 idx += range.lowerBound(a).length; 3060 3061 debug(std_uni) 3062 { 3063 writeln("dropUpTo full length=", data.length); 3064 writeln(pos,"~~~", idx); 3065 } 3066 if (idx == data.length) 3067 return genericReplace(data, pos, idx, cast(uint[])[]); 3068 if (idx & 1) 3069 { // a in positive 3070 //[--+++----++++++----+++++++------...] 3071 // |<---si s a t 3072 genericReplace(data, pos, idx, [a]); 3073 } 3074 else 3075 { // a in negative 3076 //[--+++----++++++----+++++++-------+++...] 3077 // |<---si s a t 3078 genericReplace(data, pos, idx, cast(uint[])[]); 3079 } 3080 return pos; 3081 } 3082 3083 // 3084 Marker skipUpTo(uint a, Marker pos=Marker.init) 3085 out(result) 3086 { 3087 assert(result % 2 == 0);// always start of interval 3088 //(may be 0-width after-split) 3089 } 3090 do 3091 { 3092 assert(data.length % 2 == 0); 3093 auto range = assumeSorted!"a <= b"(data[pos .. data.length]); 3094 size_t idx = pos+range.lowerBound(a).length; 3095 3096 if (idx >= data.length) // could have Marker point to recently removed stuff 3097 return data.length; 3098 3099 if (idx & 1)// inside of interval, check for split 3100 { 3101 3102 immutable top = data[idx]; 3103 if (top == a)// no need to split, it's end 3104 return idx+1; 3105 immutable start = data[idx-1]; 3106 if (a == start) 3107 return idx-1; 3108 // split it up 3109 genericReplace(data, idx, idx+1, [a, a, top]); 3110 return idx+1; // avoid odd index 3111 } 3112 return idx; 3113 } 3114 3115 CowArray!SP data; 3116 } 3117 3118 pure @system unittest 3119 { 3120 import std.conv : to; 3121 assert(unicode.ASCII.to!string() == "[0..128)"); 3122 } 3123 3124 // pedantic version for ctfe, and aligned-access only architectures 3125 @system private uint safeRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc 3126 { 3127 idx *= 3; 3128 version (LittleEndian) 3129 return ptr[idx] + (cast(uint) ptr[idx+1]<<8) 3130 + (cast(uint) ptr[idx+2]<<16); 3131 else 3132 return (cast(uint) ptr[idx]<<16) + (cast(uint) ptr[idx+1]<<8) 3133 + ptr[idx+2]; 3134 } 3135 3136 // ditto 3137 @system private void safeWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc 3138 { 3139 idx *= 3; 3140 version (LittleEndian) 3141 { 3142 ptr[idx] = val & 0xFF; 3143 ptr[idx+1] = (val >> 8) & 0xFF; 3144 ptr[idx+2] = (val >> 16) & 0xFF; 3145 } 3146 else 3147 { 3148 ptr[idx] = (val >> 16) & 0xFF; 3149 ptr[idx+1] = (val >> 8) & 0xFF; 3150 ptr[idx+2] = val & 0xFF; 3151 } 3152 } 3153 3154 // unaligned x86-like read/write functions 3155 @system private uint unalignedRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc 3156 { 3157 uint* src = cast(uint*)(ptr+3*idx); 3158 version (LittleEndian) 3159 return *src & 0xFF_FFFF; 3160 else 3161 return *src >> 8; 3162 } 3163 3164 // ditto 3165 @system private void unalignedWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc 3166 { 3167 uint* dest = cast(uint*)(cast(ubyte*) ptr + 3*idx); 3168 version (LittleEndian) 3169 *dest = val | (*dest & 0xFF00_0000); 3170 else 3171 *dest = (val << 8) | (*dest & 0xFF); 3172 } 3173 3174 @system private uint read24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc 3175 { 3176 static if (hasUnalignedReads) 3177 return __ctfe ? safeRead24(ptr, idx) : unalignedRead24(ptr, idx); 3178 else 3179 return safeRead24(ptr, idx); 3180 } 3181 3182 @system private void write24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc 3183 { 3184 static if (hasUnalignedReads) 3185 return __ctfe ? safeWrite24(ptr, val, idx) : unalignedWrite24(ptr, val, idx); 3186 else 3187 return safeWrite24(ptr, val, idx); 3188 } 3189 3190 struct CowArray(SP=GcPolicy) 3191 { 3192 import std.range.primitives : hasLength; 3193 3194 @safe: 3195 static auto reuse(uint[] arr) 3196 { 3197 CowArray cow; 3198 cow.data = arr; 3199 SP.append(cow.data, 1); 3200 assert(cow.refCount == 1); 3201 assert(cow.length == arr.length); 3202 return cow; 3203 } 3204 3205 this(Range)(Range range) 3206 if (isInputRange!Range && hasLength!Range) 3207 { 3208 import std.algorithm.mutation : copy; 3209 length = range.length; 3210 copy(range, data[0..$-1]); 3211 } 3212 3213 this(Range)(Range range) 3214 if (isForwardRange!Range && !hasLength!Range) 3215 { 3216 import std.algorithm.mutation : copy; 3217 import std.range.primitives : walkLength; 3218 immutable len = walkLength(range.save); 3219 length = len; 3220 copy(range, data[0..$-1]); 3221 } 3222 3223 this(this) 3224 { 3225 if (!empty) 3226 { 3227 refCount = refCount + 1; 3228 } 3229 } 3230 3231 ~this() 3232 { 3233 if (!empty) 3234 { 3235 immutable cnt = refCount; 3236 if (cnt == 1) 3237 SP.destroy(data); 3238 else 3239 refCount = cnt - 1; 3240 } 3241 } 3242 3243 // no ref-count for empty U24 array 3244 @property bool empty() const { return data.length == 0; } 3245 3246 // report one less then actual size 3247 @property size_t length() const 3248 { 3249 return data.length ? data.length - 1 : 0; 3250 } 3251 3252 //+ an extra slot for ref-count 3253 @property void length(size_t len) 3254 { 3255 import std.algorithm.comparison : min; 3256 import std.algorithm.mutation : copy; 3257 if (len == 0) 3258 { 3259 if (!empty) 3260 freeThisReference(); 3261 return; 3262 } 3263 immutable total = len + 1; // including ref-count 3264 if (empty) 3265 { 3266 data = SP.alloc!uint(total); 3267 refCount = 1; 3268 return; 3269 } 3270 immutable cur_cnt = refCount; 3271 if (cur_cnt != 1) // have more references to this memory 3272 { 3273 refCount = cur_cnt - 1; 3274 auto new_data = SP.alloc!uint(total); 3275 // take shrinking into account 3276 auto to_copy = min(total, data.length) - 1; 3277 copy(data[0 .. to_copy], new_data[0 .. to_copy]); 3278 data = new_data; // before setting refCount! 3279 refCount = 1; 3280 } 3281 else // 'this' is the only reference 3282 { 3283 // use the realloc (hopefully in-place operation) 3284 data = SP.realloc(data, total); 3285 refCount = 1; // setup a ref-count in the new end of the array 3286 } 3287 } 3288 3289 alias opDollar = length; 3290 3291 uint opIndex()(size_t idx)const 3292 { 3293 return data[idx]; 3294 } 3295 3296 void opIndexAssign(uint val, size_t idx) 3297 { 3298 auto cnt = refCount; 3299 if (cnt != 1) 3300 dupThisReference(cnt); 3301 data[idx] = val; 3302 } 3303 3304 // 3305 auto opSlice(size_t from, size_t to) 3306 { 3307 if (!empty) 3308 { 3309 auto cnt = refCount; 3310 if (cnt != 1) 3311 dupThisReference(cnt); 3312 } 3313 return data[from .. to]; 3314 3315 } 3316 3317 // 3318 auto opSlice(size_t from, size_t to) const 3319 { 3320 return data[from .. to]; 3321 } 3322 3323 // length slices before the ref count 3324 auto opSlice() 3325 { 3326 return opSlice(0, length); 3327 } 3328 3329 // ditto 3330 auto opSlice() const 3331 { 3332 return opSlice(0, length); 3333 } 3334 3335 void append(Range)(Range range) 3336 if (isInputRange!Range && hasLength!Range && is(ElementType!Range : uint)) 3337 { 3338 size_t nl = length + range.length; 3339 length = nl; 3340 copy(range, this[nl-range.length .. nl]); 3341 } 3342 3343 void append()(uint[] val...) 3344 { 3345 length = length + val.length; 3346 data[$-val.length-1 .. $-1] = val[]; 3347 } 3348 3349 bool opEquals()(auto const ref CowArray rhs)const 3350 { 3351 if (empty ^ rhs.empty) 3352 return false; // one is empty and the other isn't 3353 return empty || data[0..$-1] == rhs.data[0..$-1]; 3354 } 3355 3356 private: 3357 // ref-count is right after the data 3358 @property uint refCount() const 3359 { 3360 return data[$-1]; 3361 } 3362 3363 @property void refCount(uint cnt) 3364 { 3365 data[$-1] = cnt; 3366 } 3367 3368 void freeThisReference() 3369 { 3370 immutable count = refCount; 3371 if (count != 1) // have more references to this memory 3372 { 3373 // dec shared ref-count 3374 refCount = count - 1; 3375 data = []; 3376 } 3377 else 3378 SP.destroy(data); 3379 assert(!data.ptr); 3380 } 3381 3382 void dupThisReference(uint count) 3383 in 3384 { 3385 assert(!empty && count != 1 && count == refCount); 3386 } 3387 do 3388 { 3389 import std.algorithm.mutation : copy; 3390 // dec shared ref-count 3391 refCount = count - 1; 3392 // copy to the new chunk of RAM 3393 auto new_data = SP.alloc!uint(data.length); 3394 // bit-blit old stuff except the counter 3395 copy(data[0..$-1], new_data[0..$-1]); 3396 data = new_data; // before setting refCount! 3397 refCount = 1; // so that this updates the right one 3398 } 3399 3400 uint[] data; 3401 } 3402 3403 pure @safe unittest// Uint24 tests 3404 { 3405 import std.algorithm.comparison : equal; 3406 import std.algorithm.mutation : copy; 3407 import std.conv : text; 3408 import std.range : iota, chain; 3409 import std.range.primitives : isBidirectionalRange, isOutputRange; 3410 void funcRef(T)(ref T u24) 3411 { 3412 u24.length = 2; 3413 u24[1] = 1024; 3414 T u24_c = u24; 3415 assert(u24[1] == 1024); 3416 u24.length = 0; 3417 assert(u24.empty); 3418 u24.append([1, 2]); 3419 assert(equal(u24[], [1, 2])); 3420 u24.append(111); 3421 assert(equal(u24[], [1, 2, 111])); 3422 assert(!u24_c.empty && u24_c[1] == 1024); 3423 u24.length = 3; 3424 copy(iota(0, 3), u24[]); 3425 assert(equal(u24[], iota(0, 3))); 3426 assert(u24_c[1] == 1024); 3427 } 3428 3429 void func2(T)(T u24) 3430 { 3431 T u24_2 = u24; 3432 T u24_3; 3433 u24_3 = u24_2; 3434 assert(u24_2 == u24_3); 3435 assert(equal(u24[], u24_2[])); 3436 assert(equal(u24_2[], u24_3[])); 3437 funcRef(u24_3); 3438 3439 assert(equal(u24_3[], iota(0, 3))); 3440 assert(!equal(u24_2[], u24_3[])); 3441 assert(equal(u24_2[], u24[])); 3442 u24_2 = u24_3; 3443 assert(equal(u24_2[], iota(0, 3))); 3444 // to test that passed arg is intact outside 3445 // plus try out opEquals 3446 u24 = u24_3; 3447 u24 = T.init; 3448 u24_3 = T.init; 3449 assert(u24.empty); 3450 assert(u24 == u24_3); 3451 assert(u24 != u24_2); 3452 } 3453 3454 static foreach (Policy; AliasSeq!(GcPolicy, ReallocPolicy)) 3455 {{ 3456 alias Range = typeof(CowArray!Policy.init[]); 3457 alias U24A = CowArray!Policy; 3458 static assert(isForwardRange!Range); 3459 static assert(isBidirectionalRange!Range); 3460 static assert(isOutputRange!(Range, uint)); 3461 static assert(isRandomAccessRange!(Range)); 3462 3463 auto arr = U24A([42u, 36, 100]); 3464 assert(arr[0] == 42); 3465 assert(arr[1] == 36); 3466 arr[0] = 72; 3467 arr[1] = 0xFE_FEFE; 3468 assert(arr[0] == 72); 3469 assert(arr[1] == 0xFE_FEFE); 3470 assert(arr[2] == 100); 3471 U24A arr2 = arr; 3472 assert(arr2[0] == 72); 3473 arr2[0] = 11; 3474 // test COW-ness 3475 assert(arr[0] == 72); 3476 assert(arr2[0] == 11); 3477 // set this to about 100M to stress-test COW memory management 3478 foreach (v; 0 .. 10_000) 3479 func2(arr); 3480 assert(equal(arr[], [72, 0xFE_FEFE, 100])); 3481 3482 auto r2 = U24A(iota(0, 100)); 3483 assert(equal(r2[], iota(0, 100)), text(r2[])); 3484 copy(iota(10, 170, 2), r2[10 .. 90]); 3485 assert(equal(r2[], chain(iota(0, 10), iota(10, 170, 2), iota(90, 100))) 3486 , text(r2[])); 3487 }} 3488 } 3489 3490 pure @safe unittest// core set primitives test 3491 { 3492 import std.conv : text; 3493 alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy); 3494 foreach (CodeList; AllSets) 3495 { 3496 CodeList a; 3497 //"plug a hole" test 3498 a.add(10, 20).add(25, 30).add(15, 27); 3499 assert(a == CodeList(10, 30), text(a)); 3500 3501 auto x = CodeList.init; 3502 x.add(10, 20).add(30, 40).add(50, 60); 3503 3504 a = x; 3505 a.add(20, 49);//[10, 49) [50, 60) 3506 assert(a == CodeList(10, 49, 50 ,60)); 3507 3508 a = x; 3509 a.add(20, 50); 3510 assert(a == CodeList(10, 60), text(a)); 3511 3512 // simple unions, mostly edge effects 3513 x = CodeList.init; 3514 x.add(10, 20).add(40, 60); 3515 3516 a = x; 3517 a.add(10, 25); //[10, 25) [40, 60) 3518 assert(a == CodeList(10, 25, 40, 60)); 3519 3520 a = x; 3521 a.add(5, 15); //[5, 20) [40, 60) 3522 assert(a == CodeList(5, 20, 40, 60)); 3523 3524 a = x; 3525 a.add(0, 10); // [0, 20) [40, 60) 3526 assert(a == CodeList(0, 20, 40, 60)); 3527 3528 a = x; 3529 a.add(0, 5); // prepand 3530 assert(a == CodeList(0, 5, 10, 20, 40, 60), text(a)); 3531 3532 a = x; 3533 a.add(5, 20); 3534 assert(a == CodeList(5, 20, 40, 60)); 3535 3536 a = x; 3537 a.add(3, 37); 3538 assert(a == CodeList(3, 37, 40, 60)); 3539 3540 a = x; 3541 a.add(37, 65); 3542 assert(a == CodeList(10, 20, 37, 65)); 3543 3544 // some tests on helpers for set intersection 3545 x = CodeList.init.add(10, 20).add(40, 60).add(100, 120); 3546 a = x; 3547 3548 auto m = a.skipUpTo(60); 3549 a.dropUpTo(110, m); 3550 assert(a == CodeList(10, 20, 40, 60, 110, 120), text(a.data[])); 3551 3552 a = x; 3553 a.dropUpTo(100); 3554 assert(a == CodeList(100, 120), text(a.data[])); 3555 3556 a = x; 3557 m = a.skipUpTo(50); 3558 a.dropUpTo(140, m); 3559 assert(a == CodeList(10, 20, 40, 50), text(a.data[])); 3560 a = x; 3561 a.dropUpTo(60); 3562 assert(a == CodeList(100, 120), text(a.data[])); 3563 } 3564 } 3565 3566 3567 //test constructor to work with any order of intervals 3568 pure @safe unittest 3569 { 3570 import std.algorithm.comparison : equal; 3571 import std.conv : text, to; 3572 import std.range : chain, iota; 3573 import std.typecons : tuple; 3574 //ensure constructor handles bad ordering and overlap 3575 auto c1 = CodepointSet('а', 'я'+1, 'А','Я'+1); 3576 foreach (ch; chain(iota('а', 'я'+1), iota('А','Я'+1))) 3577 assert(ch in c1, to!string(ch)); 3578 3579 //contiguos 3580 assert(CodepointSet(1000, 1006, 1006, 1009) 3581 .byInterval.equal([tuple(1000, 1009)])); 3582 //contains 3583 assert(CodepointSet(900, 1200, 1000, 1100) 3584 .byInterval.equal([tuple(900, 1200)])); 3585 //intersect left 3586 assert(CodepointSet(900, 1100, 1000, 1200) 3587 .byInterval.equal([tuple(900, 1200)])); 3588 //intersect right 3589 assert(CodepointSet(1000, 1200, 900, 1100) 3590 .byInterval.equal([tuple(900, 1200)])); 3591 3592 //ditto with extra items at end 3593 assert(CodepointSet(1000, 1200, 900, 1100, 800, 850) 3594 .byInterval.equal([tuple(800, 850), tuple(900, 1200)])); 3595 assert(CodepointSet(900, 1100, 1000, 1200, 800, 850) 3596 .byInterval.equal([tuple(800, 850), tuple(900, 1200)])); 3597 3598 //"plug a hole" test 3599 auto c2 = CodepointSet(20, 40, 3600 60, 80, 100, 140, 150, 200, 3601 40, 60, 80, 100, 140, 150 3602 ); 3603 assert(c2.byInterval.equal([tuple(20, 200)])); 3604 3605 auto c3 = CodepointSet( 3606 20, 40, 60, 80, 100, 140, 150, 200, 3607 0, 10, 15, 100, 10, 20, 200, 220); 3608 assert(c3.byInterval.equal([tuple(0, 140), tuple(150, 220)])); 3609 } 3610 3611 3612 pure @safe unittest 3613 { // full set operations 3614 import std.conv : text; 3615 alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy); 3616 foreach (CodeList; AllSets) 3617 { 3618 CodeList a, b, c, d; 3619 3620 //"plug a hole" 3621 a.add(20, 40).add(60, 80).add(100, 140).add(150, 200); 3622 b.add(40, 60).add(80, 100).add(140, 150); 3623 c = a | b; 3624 d = b | a; 3625 assert(c == CodeList(20, 200), text(CodeList.stringof," ", c)); 3626 assert(c == d, text(c," vs ", d)); 3627 3628 b = CodeList.init.add(25, 45).add(65, 85).add(95,110).add(150, 210); 3629 c = a | b; //[20,45) [60, 85) [95, 140) [150, 210) 3630 d = b | a; 3631 assert(c == CodeList(20, 45, 60, 85, 95, 140, 150, 210), text(c)); 3632 assert(c == d, text(c," vs ", d)); 3633 3634 b = CodeList.init.add(10, 20).add(30,100).add(145,200); 3635 c = a | b;//[10, 140) [145, 200) 3636 d = b | a; 3637 assert(c == CodeList(10, 140, 145, 200)); 3638 assert(c == d, text(c," vs ", d)); 3639 3640 b = CodeList.init.add(0, 10).add(15, 100).add(10, 20).add(200, 220); 3641 c = a | b;//[0, 140) [150, 220) 3642 d = b | a; 3643 assert(c == CodeList(0, 140, 150, 220)); 3644 assert(c == d, text(c," vs ", d)); 3645 3646 3647 a = CodeList.init.add(20, 40).add(60, 80); 3648 b = CodeList.init.add(25, 35).add(65, 75); 3649 c = a & b; 3650 d = b & a; 3651 assert(c == CodeList(25, 35, 65, 75), text(c)); 3652 assert(c == d, text(c," vs ", d)); 3653 3654 a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200); 3655 b = CodeList.init.add(25, 35).add(65, 75).add(110, 130).add(160, 180); 3656 c = a & b; 3657 d = b & a; 3658 assert(c == CodeList(25, 35, 65, 75, 110, 130, 160, 180), text(c)); 3659 assert(c == d, text(c," vs ", d)); 3660 3661 a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200); 3662 b = CodeList.init.add(10, 30).add(60, 120).add(135, 160); 3663 c = a & b;//[20, 30)[60, 80) [100, 120) [135, 140) [150, 160) 3664 d = b & a; 3665 3666 assert(c == CodeList(20, 30, 60, 80, 100, 120, 135, 140, 150, 160),text(c)); 3667 assert(c == d, text(c, " vs ",d)); 3668 assert((c & a) == c); 3669 assert((d & b) == d); 3670 assert((c & d) == d); 3671 3672 b = CodeList.init.add(40, 60).add(80, 100).add(140, 200); 3673 c = a & b; 3674 d = b & a; 3675 assert(c == CodeList(150, 200), text(c)); 3676 assert(c == d, text(c, " vs ",d)); 3677 assert((c & a) == c); 3678 assert((d & b) == d); 3679 assert((c & d) == d); 3680 3681 assert((a & a) == a); 3682 assert((b & b) == b); 3683 3684 a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200); 3685 b = CodeList.init.add(30, 60).add(75, 120).add(190, 300); 3686 c = a - b;// [30, 40) [60, 75) [120, 140) [150, 190) 3687 d = b - a;// [40, 60) [80, 100) [200, 300) 3688 assert(c == CodeList(20, 30, 60, 75, 120, 140, 150, 190), text(c)); 3689 assert(d == CodeList(40, 60, 80, 100, 200, 300), text(d)); 3690 assert(c - d == c, text(c-d, " vs ", c)); 3691 assert(d - c == d, text(d-c, " vs ", d)); 3692 assert(c - c == CodeList.init); 3693 assert(d - d == CodeList.init); 3694 3695 a = CodeList.init.add(20, 40).add( 60, 80).add(100, 140).add(150, 200); 3696 b = CodeList.init.add(10, 50).add(60, 160).add(190, 300); 3697 c = a - b;// [160, 190) 3698 d = b - a;// [10, 20) [40, 50) [80, 100) [140, 150) [200, 300) 3699 assert(c == CodeList(160, 190), text(c)); 3700 assert(d == CodeList(10, 20, 40, 50, 80, 100, 140, 150, 200, 300), text(d)); 3701 assert(c - d == c, text(c-d, " vs ", c)); 3702 assert(d - c == d, text(d-c, " vs ", d)); 3703 assert(c - c == CodeList.init); 3704 assert(d - d == CodeList.init); 3705 3706 a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200); 3707 b = CodeList.init.add(10, 30).add(45, 100).add(130, 190); 3708 c = a ~ b; // [10, 20) [30, 40) [45, 60) [80, 130) [140, 150) [190, 200) 3709 d = b ~ a; 3710 assert(c == CodeList(10, 20, 30, 40, 45, 60, 80, 130, 140, 150, 190, 200), 3711 text(c)); 3712 assert(c == d, text(c, " vs ", d)); 3713 } 3714 } 3715 3716 } 3717 3718 pure @safe unittest// vs single dchar 3719 { 3720 import std.conv : text; 3721 CodepointSet a = CodepointSet(10, 100, 120, 200); 3722 assert(a - 'A' == CodepointSet(10, 65, 66, 100, 120, 200), text(a - 'A')); 3723 assert((a & 'B') == CodepointSet(66, 67)); 3724 } 3725 3726 pure @safe unittest// iteration & opIndex 3727 { 3728 import std.algorithm.comparison : equal; 3729 import std.conv : text; 3730 import std.typecons : tuple, Tuple; 3731 3732 static foreach (CodeList; AliasSeq!(InversionList!(ReallocPolicy))) 3733 {{ 3734 auto arr = "ABCDEFGHIJKLMabcdefghijklm"d; 3735 auto a = CodeList('A','N','a', 'n'); 3736 assert(equal(a.byInterval, 3737 [tuple(cast(uint)'A', cast(uint)'N'), tuple(cast(uint)'a', cast(uint)'n')] 3738 ), text(a.byInterval)); 3739 3740 // same @@@BUG as in https://issues.dlang.org/show_bug.cgi?id=8949 ? 3741 version (bug8949) 3742 { 3743 import std.range : retro; 3744 assert(equal(retro(a.byInterval), 3745 [tuple(cast(uint)'a', cast(uint)'n'), tuple(cast(uint)'A', cast(uint)'N')] 3746 ), text(retro(a.byInterval))); 3747 } 3748 auto achr = a.byCodepoint; 3749 assert(equal(achr, arr), text(a.byCodepoint)); 3750 foreach (ch; a.byCodepoint) 3751 assert(a[ch]); 3752 auto x = CodeList(100, 500, 600, 900, 1200, 1500); 3753 assert(equal(x.byInterval, [ tuple(100, 500), tuple(600, 900), tuple(1200, 1500)]), text(x.byInterval)); 3754 foreach (ch; x.byCodepoint) 3755 assert(x[ch]); 3756 static if (is(CodeList == CodepointSet)) 3757 { 3758 auto y = CodeList(x.byInterval); 3759 assert(equal(x.byInterval, y.byInterval)); 3760 } 3761 assert(equal(CodepointSet.init.byInterval, cast(Tuple!(uint, uint)[])[])); 3762 assert(equal(CodepointSet.init.byCodepoint, cast(dchar[])[])); 3763 }} 3764 } 3765 3766 //============================================================================ 3767 // Generic Trie template and various ways to build it 3768 //============================================================================ 3769 3770 // debug helper to get a shortened array dump 3771 auto arrayRepr(T)(T x) 3772 { 3773 import std.conv : text; 3774 if (x.length > 32) 3775 { 3776 return text(x[0 .. 16],"~...~", x[x.length-16 .. x.length]); 3777 } 3778 else 3779 return text(x); 3780 } 3781 3782 /** 3783 Maps `Key` to a suitable integer index within the range of `size_t`. 3784 The mapping is constructed by applying predicates from `Prefix` left to right 3785 and concatenating the resulting bits. 3786 3787 The first (leftmost) predicate defines the most significant bits of 3788 the resulting index. 3789 */ 3790 template mapTrieIndex(Prefix...) 3791 { 3792 size_t mapTrieIndex(Key)(Key key) 3793 if (isValidPrefixForTrie!(Key, Prefix)) 3794 { 3795 alias p = Prefix; 3796 size_t idx; 3797 foreach (i, v; p[0..$-1]) 3798 { 3799 idx |= p[i](key); 3800 idx <<= p[i+1].bitSize; 3801 } 3802 idx |= p[$-1](key); 3803 return idx; 3804 } 3805 } 3806 3807 /* 3808 `TrieBuilder` is a type used for incremental construction 3809 of $(LREF Trie)s. 3810 3811 See $(LREF buildTrie) for generic helpers built on top of it. 3812 */ 3813 @trusted private struct TrieBuilder(Value, Key, Args...) 3814 if (isBitPackableType!Value && isValidArgsForTrie!(Key, Args)) 3815 { 3816 import std.exception : enforce; 3817 3818 private: 3819 // last index is not stored in table, it is used as an offset to values in a block. 3820 static if (is(Value == bool))// always pack bool 3821 alias V = BitPacked!(Value, 1); 3822 else 3823 alias V = Value; 3824 static auto deduceMaxIndex(Preds...)() 3825 { 3826 size_t idx = 1; 3827 foreach (v; Preds) 3828 idx *= 2^^v.bitSize; 3829 return idx; 3830 } 3831 3832 static if (is(typeof(Args[0]) : Key)) // Args start with upper bound on Key 3833 { 3834 alias Prefix = Args[1..$]; 3835 enum lastPageSize = 2^^Prefix[$-1].bitSize; 3836 enum translatedMaxIndex = mapTrieIndex!(Prefix)(Args[0]); 3837 enum roughedMaxIndex = 3838 (translatedMaxIndex + lastPageSize-1)/lastPageSize*lastPageSize; 3839 // check warp around - if wrapped, use the default deduction rule 3840 enum maxIndex = roughedMaxIndex < translatedMaxIndex ? 3841 deduceMaxIndex!(Prefix)() : roughedMaxIndex; 3842 } 3843 else 3844 { 3845 alias Prefix = Args; 3846 enum maxIndex = deduceMaxIndex!(Prefix)(); 3847 } 3848 3849 alias getIndex = mapTrieIndex!(Prefix); 3850 3851 enum lastLevel = Prefix.length-1; 3852 struct ConstructState 3853 { 3854 size_t idx_zeros, idx_ones; 3855 } 3856 // iteration over levels of Trie, each indexes its own level and thus a shortened domain 3857 size_t[Prefix.length] indices; 3858 // default filler value to use 3859 Value defValue; 3860 // this is a full-width index of next item 3861 size_t curIndex; 3862 // all-zeros page index, all-ones page index (+ indicator if there is such a page) 3863 ConstructState[Prefix.length] state; 3864 // the table being constructed 3865 MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), V) table; 3866 3867 @disable this(); 3868 3869 //shortcut for index variable at level 'level' 3870 @property ref idx(size_t level)(){ return indices[level]; } 3871 3872 // this function assumes no holes in the input so 3873 // indices are going one by one 3874 void addValue(size_t level, T)(T val, size_t numVals) 3875 { 3876 alias j = idx!level; 3877 enum pageSize = 1 << Prefix[level].bitSize; 3878 if (numVals == 0) 3879 return; 3880 auto ptr = table.slice!(level); 3881 if (numVals == 1) 3882 { 3883 static if (level == Prefix.length-1) 3884 ptr[j] = val; 3885 else 3886 {// can incur narrowing conversion 3887 assert(j < ptr.length); 3888 ptr[j] = force!(typeof(ptr[j]))(val); 3889 } 3890 j++; 3891 if (j % pageSize == 0) 3892 spillToNextPage!level(ptr); 3893 return; 3894 } 3895 // longer row of values 3896 // get to the next page boundary 3897 immutable nextPB = (j + pageSize) & ~(pageSize-1); 3898 immutable n = nextPB - j;// can fill right in this page 3899 if (numVals < n) //fits in current page 3900 { 3901 ptr[j .. j+numVals] = val; 3902 j += numVals; 3903 return; 3904 } 3905 static if (level != 0)//on the first level it always fits 3906 { 3907 numVals -= n; 3908 //write till the end of current page 3909 ptr[j .. j+n] = val; 3910 j += n; 3911 //spill to the next page 3912 spillToNextPage!level(ptr); 3913 // page at once loop 3914 if (state[level].idx_zeros != size_t.max && val == T.init) 3915 { 3916 alias NextIdx = typeof(table.slice!(level-1)[0]); 3917 addValue!(level-1)(force!NextIdx(state[level].idx_zeros), 3918 numVals/pageSize); 3919 ptr = table.slice!level; //table structure might have changed 3920 numVals %= pageSize; 3921 } 3922 else 3923 { 3924 while (numVals >= pageSize) 3925 { 3926 numVals -= pageSize; 3927 ptr[j .. j+pageSize] = val; 3928 j += pageSize; 3929 spillToNextPage!level(ptr); 3930 } 3931 } 3932 if (numVals) 3933 { 3934 // the leftovers, an incomplete page 3935 ptr[j .. j+numVals] = val; 3936 j += numVals; 3937 } 3938 } 3939 } 3940 3941 void spillToNextPage(size_t level, Slice)(ref Slice ptr) 3942 { 3943 // last level (i.e. topmost) has 1 "page" 3944 // thus it need not to add a new page on upper level 3945 static if (level != 0) 3946 spillToNextPageImpl!(level)(ptr); 3947 } 3948 3949 // this can re-use the current page if duplicate or allocate a new one 3950 // it also makes sure that previous levels point to the correct page in this level 3951 void spillToNextPageImpl(size_t level, Slice)(ref Slice ptr) 3952 { 3953 alias NextIdx = typeof(table.slice!(level-1)[0]); 3954 NextIdx next_lvl_index; 3955 enum pageSize = 1 << Prefix[level].bitSize; 3956 assert(idx!level % pageSize == 0); 3957 immutable last = idx!level-pageSize; 3958 const slice = ptr[idx!level - pageSize .. idx!level]; 3959 size_t j; 3960 for (j=0; j<last; j+=pageSize) 3961 { 3962 if (ptr[j .. j+pageSize] == slice) 3963 { 3964 // get index to it, reuse ptr space for the next block 3965 next_lvl_index = force!NextIdx(j/pageSize); 3966 version (none) 3967 { 3968 import std.stdio : writefln, writeln; 3969 writefln("LEVEL(%s) page mapped idx: %s: 0..%s ---> [%s..%s]" 3970 ,level 3971 ,indices[level-1], pageSize, j, j+pageSize); 3972 writeln("LEVEL(", level 3973 , ") mapped page is: ", slice, ": ", arrayRepr(ptr[j .. j+pageSize])); 3974 writeln("LEVEL(", level 3975 , ") src page is :", ptr, ": ", arrayRepr(slice[0 .. pageSize])); 3976 } 3977 idx!level -= pageSize; // reuse this page, it is duplicate 3978 break; 3979 } 3980 } 3981 if (j == last) 3982 { 3983 L_allocate_page: 3984 next_lvl_index = force!NextIdx(idx!level/pageSize - 1); 3985 if (state[level].idx_zeros == size_t.max && ptr.zeros(j, j+pageSize)) 3986 { 3987 state[level].idx_zeros = next_lvl_index; 3988 } 3989 // allocate next page 3990 version (none) 3991 { 3992 import std.stdio : writefln; 3993 writefln("LEVEL(%s) page allocated: %s" 3994 , level, arrayRepr(slice[0 .. pageSize])); 3995 writefln("LEVEL(%s) index: %s ; page at this index %s" 3996 , level 3997 , next_lvl_index 3998 , arrayRepr( 3999 table.slice!(level) 4000 [pageSize*next_lvl_index..(next_lvl_index+1)*pageSize] 4001 )); 4002 } 4003 table.length!level = table.length!level + pageSize; 4004 } 4005 L_know_index: 4006 // for the previous level, values are indices to the pages in the current level 4007 addValue!(level-1)(next_lvl_index, 1); 4008 ptr = table.slice!level; //re-load the slice after moves 4009 } 4010 4011 // idx - full-width index to fill with v (full-width index != key) 4012 // fills everything in the range of [curIndex, idx) with filler 4013 void putAt(size_t idx, Value v) 4014 { 4015 assert(idx >= curIndex); 4016 immutable numFillers = idx - curIndex; 4017 addValue!lastLevel(defValue, numFillers); 4018 addValue!lastLevel(v, 1); 4019 curIndex = idx + 1; 4020 } 4021 4022 // ditto, but sets the range of [idxA, idxB) to v 4023 void putRangeAt(size_t idxA, size_t idxB, Value v) 4024 { 4025 assert(idxA >= curIndex); 4026 assert(idxB >= idxA); 4027 size_t numFillers = idxA - curIndex; 4028 addValue!lastLevel(defValue, numFillers); 4029 addValue!lastLevel(v, idxB - idxA); 4030 curIndex = idxB; // open-right 4031 } 4032 4033 enum errMsg = "non-monotonic prefix function(s), an unsorted range or "~ 4034 "duplicate key->value mapping"; 4035 4036 public: 4037 /** 4038 Construct a builder, where `filler` is a value 4039 to indicate empty slots (or "not found" condition). 4040 */ 4041 this(Value filler) 4042 { 4043 curIndex = 0; 4044 defValue = filler; 4045 // zeros-page index, ones-page index 4046 foreach (ref v; state) 4047 v = ConstructState(size_t.max, size_t.max); 4048 table = typeof(table)(indices); 4049 // one page per level is a bootstrap minimum 4050 foreach (i, Pred; Prefix) 4051 table.length!i = (1 << Pred.bitSize); 4052 } 4053 4054 /** 4055 Put a value `v` into interval as 4056 mapped by keys from `a` to `b`. 4057 All slots prior to `a` are filled with 4058 the default filler. 4059 */ 4060 void putRange(Key a, Key b, Value v) 4061 { 4062 auto idxA = getIndex(a), idxB = getIndex(b); 4063 // indexes of key should always grow 4064 enforce(idxB >= idxA && idxA >= curIndex, errMsg); 4065 putRangeAt(idxA, idxB, v); 4066 } 4067 4068 /** 4069 Put a value `v` into slot mapped by `key`. 4070 All slots prior to `key` are filled with the 4071 default filler. 4072 */ 4073 void putValue(Key key, Value v) 4074 { 4075 auto idx = getIndex(key); 4076 enforce(idx >= curIndex, errMsg); 4077 putAt(idx, v); 4078 } 4079 4080 /// Finishes construction of Trie, yielding an immutable Trie instance. 4081 auto build() 4082 { 4083 static if (maxIndex != 0) // doesn't cover full range of size_t 4084 { 4085 assert(curIndex <= maxIndex); 4086 addValue!lastLevel(defValue, maxIndex - curIndex); 4087 } 4088 else 4089 { 4090 if (curIndex != 0 // couldn't wrap around 4091 || (Prefix.length != 1 && indices[lastLevel] == 0)) // can be just empty 4092 { 4093 addValue!lastLevel(defValue, size_t.max - curIndex); 4094 addValue!lastLevel(defValue, 1); 4095 } 4096 // else curIndex already completed the full range of size_t by wrapping around 4097 } 4098 return Trie!(V, Key, maxIndex, Prefix)(table); 4099 } 4100 } 4101 4102 /** 4103 $(P A generic Trie data-structure for a fixed number of stages. 4104 The design goal is optimal speed with smallest footprint size. 4105 ) 4106 $(P It's intentionally read-only and doesn't provide constructors. 4107 To construct one use a special builder, 4108 see $(LREF TrieBuilder) and $(LREF buildTrie). 4109 ) 4110 4111 */ 4112 @trusted private struct Trie(Value, Key, Args...) 4113 if (isValidPrefixForTrie!(Key, Args) 4114 || (isValidPrefixForTrie!(Key, Args[1..$]) 4115 && is(typeof(Args[0]) : size_t))) 4116 { 4117 import std.range.primitives : isOutputRange; 4118 static if (is(typeof(Args[0]) : size_t)) 4119 { 4120 private enum maxIndex = Args[0]; 4121 private enum hasBoundsCheck = true; 4122 private alias Prefix = Args[1..$]; 4123 } 4124 else 4125 { 4126 private enum hasBoundsCheck = false; 4127 private alias Prefix = Args; 4128 } 4129 4130 private this()(typeof(_table) table) 4131 { 4132 _table = table; 4133 } 4134 4135 // only for constant Tries constructed from precompiled tables 4136 private this()(const(size_t)[] offsets, const(size_t)[] sizes, 4137 const(size_t)[] data) const 4138 { 4139 _table = typeof(_table)(offsets, sizes, data); 4140 } 4141 4142 /** 4143 $(P Lookup the `key` in this `Trie`. ) 4144 4145 $(P The lookup always succeeds if key fits the domain 4146 provided during construction. The whole domain defined 4147 is covered so instead of not found condition 4148 the sentinel (filler) value could be used. ) 4149 4150 $(P See $(LREF buildTrie), $(LREF TrieBuilder) for how to 4151 define a domain of `Trie` keys and the sentinel value. ) 4152 4153 Note: 4154 Domain range-checking is only enabled in debug builds 4155 and results in assertion failure. 4156 */ 4157 TypeOfBitPacked!Value opIndex()(Key key) const 4158 { 4159 static if (hasBoundsCheck) 4160 assert(mapTrieIndex!Prefix(key) < maxIndex); 4161 size_t idx; 4162 alias p = Prefix; 4163 idx = cast(size_t) p[0](key); 4164 foreach (i, v; p[0..$-1]) 4165 idx = cast(size_t)((_table.ptr!i[idx]<<p[i+1].bitSize) + p[i+1](key)); 4166 return _table.ptr!(p.length-1)[idx]; 4167 } 4168 4169 /// 4170 @property size_t bytes(size_t n=size_t.max)() const 4171 { 4172 return _table.bytes!n; 4173 } 4174 4175 /// 4176 @property size_t pages(size_t n)() const 4177 { 4178 return (bytes!n+2^^(Prefix[n].bitSize-1)) 4179 /2^^Prefix[n].bitSize; 4180 } 4181 4182 /// 4183 void store(OutRange)(scope OutRange sink) const 4184 if (isOutputRange!(OutRange, char)) 4185 { 4186 _table.store(sink); 4187 } 4188 4189 private: 4190 MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), Value) _table; 4191 } 4192 4193 // create a tuple of 'sliceBits' that slice the 'top' of bits into pieces of sizes 'sizes' 4194 // left-to-right, the most significant bits first 4195 template GetBitSlicing(size_t top, sizes...) 4196 { 4197 static if (sizes.length > 0) 4198 alias GetBitSlicing = 4199 AliasSeq!(sliceBits!(top - sizes[0], top), 4200 GetBitSlicing!(top - sizes[0], sizes[1..$])); 4201 else 4202 alias GetBitSlicing = AliasSeq!(); 4203 } 4204 4205 template callableWith(T) 4206 { 4207 template callableWith(alias Pred) 4208 { 4209 static if (!is(typeof(Pred(T.init)))) 4210 enum callableWith = false; 4211 else 4212 { 4213 alias Result = typeof(Pred(T.init)); 4214 enum callableWith = isBitPackableType!(TypeOfBitPacked!(Result)); 4215 } 4216 } 4217 } 4218 4219 /* 4220 Check if `Prefix` is a valid set of predicates 4221 for `Trie` template having `Key` as the type of keys. 4222 This requires all predicates to be callable, take 4223 single argument of type `Key` and return unsigned value. 4224 */ 4225 template isValidPrefixForTrie(Key, Prefix...) 4226 { 4227 import std.meta : allSatisfy; 4228 enum isValidPrefixForTrie = allSatisfy!(callableWith!Key, Prefix); // TODO: tighten the screws 4229 } 4230 4231 /* 4232 Check if `Args` is a set of maximum key value followed by valid predicates 4233 for `Trie` template having `Key` as the type of keys. 4234 */ 4235 template isValidArgsForTrie(Key, Args...) 4236 { 4237 static if (Args.length > 1) 4238 { 4239 enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args) 4240 || (isValidPrefixForTrie!(Key, Args[1..$]) && is(typeof(Args[0]) : Key)); 4241 } 4242 else 4243 enum isValidArgsForTrie = isValidPrefixForTrie!Args; 4244 } 4245 4246 @property size_t sumOfIntegerTuple(ints...)() 4247 { 4248 size_t count=0; 4249 foreach (v; ints) 4250 count += v; 4251 return count; 4252 } 4253 4254 /** 4255 A shorthand for creating a custom multi-level fixed Trie 4256 from a `CodepointSet`. `sizes` are numbers of bits per level, 4257 with the most significant bits used first. 4258 4259 Note: The sum of `sizes` must be equal 21. 4260 4261 See_Also: $(LREF toTrie), which is even simpler. 4262 4263 Example: 4264 --- 4265 { 4266 import std.stdio; 4267 auto set = unicode("Number"); 4268 auto trie = codepointSetTrie!(8, 5, 8)(set); 4269 writeln("Input code points to test:"); 4270 foreach (line; stdin.byLine) 4271 { 4272 int count=0; 4273 foreach (dchar ch; line) 4274 if (trie[ch])// is number 4275 count++; 4276 writefln("Contains %d number code points.", count); 4277 } 4278 } 4279 --- 4280 */ 4281 public template codepointSetTrie(sizes...) 4282 if (sumOfIntegerTuple!sizes == 21) 4283 { 4284 auto codepointSetTrie(Set)(Set set) 4285 if (isCodepointSet!Set) 4286 { 4287 auto builder = TrieBuilder!(bool, dchar, lastDchar+1, GetBitSlicing!(21, sizes))(false); 4288 foreach (ival; set.byInterval) 4289 builder.putRange(ival[0], ival[1], true); 4290 return builder.build(); 4291 } 4292 } 4293 4294 /// Type of Trie generated by codepointSetTrie function. 4295 public template CodepointSetTrie(sizes...) 4296 if (sumOfIntegerTuple!sizes == 21) 4297 { 4298 alias Prefix = GetBitSlicing!(21, sizes); 4299 alias CodepointSetTrie = typeof(TrieBuilder!(bool, dchar, lastDchar+1, Prefix)(false).build()); 4300 } 4301 4302 /** 4303 A slightly more general tool for building fixed `Trie` 4304 for the Unicode data. 4305 4306 Specifically unlike `codepointSetTrie` it's allows creating mappings 4307 of `dchar` to an arbitrary type `T`. 4308 4309 Note: Overload taking `CodepointSet`s will naturally convert 4310 only to bool mapping `Trie`s. 4311 4312 CodepointTrie is the type of Trie as generated by codepointTrie function. 4313 */ 4314 public template codepointTrie(T, sizes...) 4315 if (sumOfIntegerTuple!sizes == 21) 4316 { 4317 alias Prefix = GetBitSlicing!(21, sizes); 4318 4319 static if (is(TypeOfBitPacked!T == bool)) 4320 { 4321 auto codepointTrie(Set)(const scope Set set) 4322 if (isCodepointSet!Set) 4323 { 4324 return codepointSetTrie(set); 4325 } 4326 } 4327 4328 /// 4329 auto codepointTrie()(T[dchar] map, T defValue=T.init) 4330 { 4331 return buildTrie!(T, dchar, Prefix)(map, defValue); 4332 } 4333 4334 // unsorted range of pairs 4335 /// 4336 auto codepointTrie(R)(R range, T defValue=T.init) 4337 if (isInputRange!R 4338 && is(typeof(ElementType!R.init[0]) : T) 4339 && is(typeof(ElementType!R.init[1]) : dchar)) 4340 { 4341 // build from unsorted array of pairs 4342 // TODO: expose index sorting functions for Trie 4343 return buildTrie!(T, dchar, Prefix)(range, defValue, true); 4344 } 4345 } 4346 4347 @system pure unittest 4348 { 4349 import std.algorithm.comparison : max; 4350 import std.algorithm.searching : count; 4351 4352 // pick characters from the Greek script 4353 auto set = unicode.Greek; 4354 4355 // a user-defined property (or an expensive function) 4356 // that we want to look up 4357 static uint luckFactor(dchar ch) 4358 { 4359 // here we consider a character lucky 4360 // if its code point has a lot of identical hex-digits 4361 // e.g. arabic letter DDAL (\u0688) has a "luck factor" of 2 4362 ubyte[6] nibbles; // 6 4-bit chunks of code point 4363 uint value = ch; 4364 foreach (i; 0 .. 6) 4365 { 4366 nibbles[i] = value & 0xF; 4367 value >>= 4; 4368 } 4369 uint luck; 4370 foreach (n; nibbles) 4371 luck = cast(uint) max(luck, count(nibbles[], n)); 4372 return luck; 4373 } 4374 4375 // only unsigned built-ins are supported at the moment 4376 alias LuckFactor = BitPacked!(uint, 3); 4377 4378 // create a temporary associative array (AA) 4379 LuckFactor[dchar] map; 4380 foreach (ch; set.byCodepoint) 4381 map[ch] = LuckFactor(luckFactor(ch)); 4382 4383 // bits per stage are chosen randomly, fell free to optimize 4384 auto trie = codepointTrie!(LuckFactor, 8, 5, 8)(map); 4385 4386 // from now on the AA is not needed 4387 foreach (ch; set.byCodepoint) 4388 assert(trie[ch] == luckFactor(ch)); // verify 4389 // CJK is not Greek, thus it has the default value 4390 assert(trie['\u4444'] == 0); 4391 // and here is a couple of quite lucky Greek characters: 4392 // Greek small letter epsilon with dasia 4393 assert(trie['\u1F11'] == 3); 4394 // Ancient Greek metretes sign 4395 assert(trie['\U00010181'] == 3); 4396 4397 } 4398 4399 /// ditto 4400 public template CodepointTrie(T, sizes...) 4401 if (sumOfIntegerTuple!sizes == 21) 4402 { 4403 alias Prefix = GetBitSlicing!(21, sizes); 4404 alias CodepointTrie = typeof(TrieBuilder!(T, dchar, lastDchar+1, Prefix)(T.init).build()); 4405 } 4406 4407 package(std) template cmpK0(alias Pred) 4408 { 4409 import std.typecons : Tuple; 4410 static bool cmpK0(Value, Key) 4411 (Tuple!(Value, Key) a, Tuple!(Value, Key) b) 4412 { 4413 return Pred(a[1]) < Pred(b[1]); 4414 } 4415 } 4416 4417 /** 4418 The most general utility for construction of `Trie`s 4419 short of using `TrieBuilder` directly. 4420 4421 Provides a number of convenience overloads. 4422 `Args` is tuple of maximum key value followed by 4423 predicates to construct index from key. 4424 4425 Alternatively if the first argument is not a value convertible to `Key` 4426 then the whole tuple of `Args` is treated as predicates 4427 and the maximum Key is deduced from predicates. 4428 */ 4429 private template buildTrie(Value, Key, Args...) 4430 if (isValidArgsForTrie!(Key, Args)) 4431 { 4432 static if (is(typeof(Args[0]) : Key)) // prefix starts with upper bound on Key 4433 { 4434 alias Prefix = Args[1..$]; 4435 } 4436 else 4437 alias Prefix = Args; 4438 4439 alias getIndex = mapTrieIndex!(Prefix); 4440 4441 // for multi-sort 4442 template GetComparators(size_t n) 4443 { 4444 static if (n > 0) 4445 alias GetComparators = 4446 AliasSeq!(GetComparators!(n-1), cmpK0!(Prefix[n-1])); 4447 else 4448 alias GetComparators = AliasSeq!(); 4449 } 4450 4451 /* 4452 Build `Trie` from a range of a Key-Value pairs, 4453 assuming it is sorted by Key as defined by the following lambda: 4454 ------ 4455 (a, b) => mapTrieIndex!(Prefix)(a) < mapTrieIndex!(Prefix)(b) 4456 ------ 4457 Exception is thrown if it's detected that the above order doesn't hold. 4458 4459 In other words $(LREF mapTrieIndex) should be a 4460 monotonically increasing function that maps `Key` to an integer. 4461 4462 See_Also: $(REF sort, std,_algorithm), 4463 $(REF SortedRange, std,range), 4464 $(REF setUnion, std,_algorithm). 4465 */ 4466 auto buildTrie(Range)(Range range, Value filler=Value.init) 4467 if (isInputRange!Range && is(typeof(Range.init.front[0]) : Value) 4468 && is(typeof(Range.init.front[1]) : Key)) 4469 { 4470 auto builder = TrieBuilder!(Value, Key, Prefix)(filler); 4471 foreach (v; range) 4472 builder.putValue(v[1], v[0]); 4473 return builder.build(); 4474 } 4475 4476 /* 4477 If `Value` is bool (or BitPacked!(bool, x)) then it's possible 4478 to build `Trie` from a range of open-right intervals of `Key`s. 4479 The requirement on the ordering of keys (and the behavior on the 4480 violation of it) is the same as for Key-Value range overload. 4481 4482 Intervals denote ranges of !`filler` i.e. the opposite of filler. 4483 If no filler provided keys inside of the intervals map to true, 4484 and `filler` is false. 4485 */ 4486 auto buildTrie(Range)(Range range, Value filler=Value.init) 4487 if (is(TypeOfBitPacked!Value == bool) 4488 && isInputRange!Range && is(typeof(Range.init.front[0]) : Key) 4489 && is(typeof(Range.init.front[1]) : Key)) 4490 { 4491 auto builder = TrieBuilder!(Value, Key, Prefix)(filler); 4492 foreach (ival; range) 4493 builder.putRange(ival[0], ival[1], !filler); 4494 return builder.build(); 4495 } 4496 4497 auto buildTrie(Range)(Range range, Value filler, bool unsorted) 4498 if (isInputRange!Range 4499 && is(typeof(Range.init.front[0]) : Value) 4500 && is(typeof(Range.init.front[1]) : Key)) 4501 { 4502 import std.algorithm.sorting : multiSort; 4503 alias Comps = GetComparators!(Prefix.length); 4504 if (unsorted) 4505 multiSort!(Comps)(range); 4506 return buildTrie(range, filler); 4507 } 4508 4509 /* 4510 If `Value` is bool (or BitPacked!(bool, x)) then it's possible 4511 to build `Trie` simply from an input range of `Key`s. 4512 The requirement on the ordering of keys (and the behavior on the 4513 violation of it) is the same as for Key-Value range overload. 4514 4515 Keys found in range denote !`filler` i.e. the opposite of filler. 4516 If no filler provided keys map to true, and `filler` is false. 4517 */ 4518 auto buildTrie(Range)(Range range, Value filler=Value.init) 4519 if (is(TypeOfBitPacked!Value == bool) 4520 && isInputRange!Range && is(typeof(Range.init.front) : Key)) 4521 { 4522 auto builder = TrieBuilder!(Value, Key, Prefix)(filler); 4523 foreach (v; range) 4524 builder.putValue(v, !filler); 4525 return builder.build(); 4526 } 4527 4528 /* 4529 If `Key` is unsigned integer `Trie` could be constructed from array 4530 of values where array index serves as key. 4531 */ 4532 auto buildTrie()(Value[] array, Value filler=Value.init) 4533 if (isUnsigned!Key) 4534 { 4535 auto builder = TrieBuilder!(Value, Key, Prefix)(filler); 4536 foreach (idx, v; array) 4537 builder.putValue(idx, v); 4538 return builder.build(); 4539 } 4540 4541 /* 4542 Builds `Trie` from associative array. 4543 */ 4544 auto buildTrie(Key, Value)(Value[Key] map, Value filler=Value.init) 4545 { 4546 import std.array : array; 4547 import std.range : zip; 4548 auto range = array(zip(map.values, map.keys)); 4549 return buildTrie(range, filler, true); // sort it 4550 } 4551 } 4552 4553 // helper in place of assumeSize to 4554 //reduce mangled name & help DMD inline Trie functors 4555 struct clamp(size_t bits) 4556 { 4557 static size_t opCall(T)(T arg){ return arg; } 4558 enum bitSize = bits; 4559 } 4560 4561 struct clampIdx(size_t idx, size_t bits) 4562 { 4563 static size_t opCall(T)(T arg){ return arg[idx]; } 4564 enum bitSize = bits; 4565 } 4566 4567 /** 4568 Conceptual type that outlines the common properties of all UTF Matchers. 4569 4570 Note: For illustration purposes only, every method 4571 call results in assertion failure. 4572 Use $(LREF utfMatcher) to obtain a concrete matcher 4573 for UTF-8 or UTF-16 encodings. 4574 */ 4575 public struct MatcherConcept 4576 { 4577 /** 4578 $(P Perform a semantic equivalent 2 operations: 4579 decoding a $(CODEPOINT) at front of `inp` and testing if 4580 it belongs to the set of $(CODEPOINTS) of this matcher. ) 4581 4582 $(P The effect on `inp` depends on the kind of function called:) 4583 4584 $(P Match. If the codepoint is found in the set then range `inp` 4585 is advanced by its size in $(S_LINK Code unit, code units), 4586 otherwise the range is not modifed.) 4587 4588 $(P Skip. The range is always advanced by the size 4589 of the tested $(CODEPOINT) regardless of the result of test.) 4590 4591 $(P Test. The range is left unaffected regardless 4592 of the result of test.) 4593 */ 4594 public bool match(Range)(ref Range inp) 4595 if (isRandomAccessRange!Range && is(ElementType!Range : char)) 4596 { 4597 assert(false); 4598 } 4599 4600 ///ditto 4601 public bool skip(Range)(ref Range inp) 4602 if (isRandomAccessRange!Range && is(ElementType!Range : char)) 4603 { 4604 assert(false); 4605 } 4606 4607 ///ditto 4608 public bool test(Range)(ref Range inp) 4609 if (isRandomAccessRange!Range && is(ElementType!Range : char)) 4610 { 4611 assert(false); 4612 } 4613 /// 4614 pure @safe unittest 4615 { 4616 string truth = "2² = 4"; 4617 auto m = utfMatcher!char(unicode.Number); 4618 assert(m.match(truth)); // '2' is a number all right 4619 assert(truth == "² = 4"); // skips on match 4620 assert(m.match(truth)); // so is the superscript '2' 4621 assert(!m.match(truth)); // space is not a number 4622 assert(truth == " = 4"); // unaffected on no match 4623 assert(!m.skip(truth)); // same test ... 4624 assert(truth == "= 4"); // but skips a codepoint regardless 4625 assert(!m.test(truth)); // '=' is not a number 4626 assert(truth == "= 4"); // test never affects argument 4627 } 4628 4629 /** 4630 Advanced feature - provide direct access to a subset of matcher based a 4631 set of known encoding lengths. Lengths are provided in 4632 $(S_LINK Code unit, code units). The sub-matcher then may do less 4633 operations per any `test`/`match`. 4634 4635 Use with care as the sub-matcher won't match 4636 any $(CODEPOINTS) that have encoded length that doesn't belong 4637 to the selected set of lengths. Also the sub-matcher object references 4638 the parent matcher and must not be used past the liftetime 4639 of the latter. 4640 4641 Another caveat of using sub-matcher is that skip is not available 4642 preciesly because sub-matcher doesn't detect all lengths. 4643 */ 4644 @property auto subMatcher(Lengths...)() 4645 { 4646 assert(0); 4647 return this; 4648 } 4649 4650 pure @safe unittest 4651 { 4652 auto m = utfMatcher!char(unicode.Number); 4653 string square = "2²"; 4654 // about sub-matchers 4655 assert(!m.subMatcher!(2,3,4).test(square)); // ASCII no covered 4656 assert(m.subMatcher!1.match(square)); // ASCII-only, works 4657 assert(!m.subMatcher!1.test(square)); // unicode '²' 4658 assert(m.subMatcher!(2,3,4).match(square)); // 4659 assert(square == ""); 4660 wstring wsquare = "2²"; 4661 auto m16 = utfMatcher!wchar(unicode.Number); 4662 // may keep ref, but the orignal (m16) must be kept alive 4663 auto bmp = m16.subMatcher!1; 4664 assert(bmp.match(wsquare)); // Okay, in basic multilingual plan 4665 assert(bmp.match(wsquare)); // And '²' too 4666 } 4667 } 4668 4669 /** 4670 Test if `M` is an UTF Matcher for ranges of `Char`. 4671 */ 4672 public enum isUtfMatcher(M, C) = __traits(compiles, (){ 4673 C[] s; 4674 auto d = s.decoder; 4675 M m; 4676 assert(is(typeof(m.match(d)) == bool)); 4677 assert(is(typeof(m.test(d)) == bool)); 4678 static if (is(typeof(m.skip(d)))) 4679 { 4680 assert(is(typeof(m.skip(d)) == bool)); 4681 assert(is(typeof(m.skip(s)) == bool)); 4682 } 4683 assert(is(typeof(m.match(s)) == bool)); 4684 assert(is(typeof(m.test(s)) == bool)); 4685 }); 4686 4687 pure @safe unittest 4688 { 4689 alias CharMatcher = typeof(utfMatcher!char(CodepointSet.init)); 4690 alias WcharMatcher = typeof(utfMatcher!wchar(CodepointSet.init)); 4691 static assert(isUtfMatcher!(CharMatcher, char)); 4692 static assert(isUtfMatcher!(CharMatcher, immutable(char))); 4693 static assert(isUtfMatcher!(WcharMatcher, wchar)); 4694 static assert(isUtfMatcher!(WcharMatcher, immutable(wchar))); 4695 } 4696 4697 enum Mode { 4698 alwaysSkip, 4699 neverSkip, 4700 skipOnMatch 4701 } 4702 4703 mixin template ForwardStrings() 4704 { 4705 private bool fwdStr(string fn, C)(ref C[] str) const @trusted 4706 { 4707 import std.utf : byCodeUnit; 4708 alias type = typeof(byCodeUnit(str)); 4709 return mixin(fn~"(*cast(type*)&str)"); 4710 } 4711 } 4712 4713 template Utf8Matcher() 4714 { 4715 enum validSize(int sz) = sz >= 1 && sz <= 4; 4716 4717 void badEncoding() pure @safe 4718 { 4719 import std.utf : UTFException; 4720 throw new UTFException("Invalid UTF-8 sequence"); 4721 } 4722 4723 //for 1-stage ASCII 4724 alias AsciiSpec = AliasSeq!(bool, char, clamp!7); 4725 //for 2-stage lookup of 2 byte UTF-8 sequences 4726 alias Utf8Spec2 = AliasSeq!(bool, char[2], 4727 clampIdx!(0, 5), clampIdx!(1, 6)); 4728 //ditto for 3 byte 4729 alias Utf8Spec3 = AliasSeq!(bool, char[3], 4730 clampIdx!(0, 4), 4731 clampIdx!(1, 6), 4732 clampIdx!(2, 6) 4733 ); 4734 //ditto for 4 byte 4735 alias Utf8Spec4 = AliasSeq!(bool, char[4], 4736 clampIdx!(0, 3), clampIdx!(1, 6), 4737 clampIdx!(2, 6), clampIdx!(3, 6) 4738 ); 4739 alias Tables = AliasSeq!( 4740 typeof(TrieBuilder!(AsciiSpec)(false).build()), 4741 typeof(TrieBuilder!(Utf8Spec2)(false).build()), 4742 typeof(TrieBuilder!(Utf8Spec3)(false).build()), 4743 typeof(TrieBuilder!(Utf8Spec4)(false).build()) 4744 ); 4745 alias Table(int size) = Tables[size-1]; 4746 4747 enum leadMask(size_t size) = (cast(size_t) 1<<(7 - size))-1; 4748 enum encMask(size_t size) = ((1 << size)-1)<<(8-size); 4749 4750 char truncate()(char ch) pure @safe 4751 { 4752 ch -= 0x80; 4753 if (ch < 0x40) 4754 { 4755 return ch; 4756 } 4757 else 4758 { 4759 badEncoding(); 4760 return cast(char) 0; 4761 } 4762 } 4763 4764 static auto encode(size_t sz)(dchar ch) 4765 if (sz > 1) 4766 { 4767 import std.utf : encodeUTF = encode; 4768 char[4] buf; 4769 encodeUTF(buf, ch); 4770 char[sz] ret; 4771 buf[0] &= leadMask!sz; 4772 foreach (n; 1 .. sz) 4773 buf[n] = buf[n] & 0x3f; //keep 6 lower bits 4774 ret[] = buf[0 .. sz]; 4775 return ret; 4776 } 4777 4778 auto build(Set)(Set set) 4779 { 4780 import std.algorithm.iteration : map; 4781 auto ascii = set & unicode.ASCII; 4782 auto utf8_2 = set & CodepointSet(0x80, 0x800); 4783 auto utf8_3 = set & CodepointSet(0x800, 0x1_0000); 4784 auto utf8_4 = set & CodepointSet(0x1_0000, lastDchar+1); 4785 auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec); 4786 auto utf8_2T = utf8_2.byCodepoint.map!(x=>encode!2(x)).buildTrie!(Utf8Spec2); 4787 auto utf8_3T = utf8_3.byCodepoint.map!(x=>encode!3(x)).buildTrie!(Utf8Spec3); 4788 auto utf8_4T = utf8_4.byCodepoint.map!(x=>encode!4(x)).buildTrie!(Utf8Spec4); 4789 alias Ret = Impl!(1,2,3,4); 4790 return Ret(asciiT, utf8_2T, utf8_3T, utf8_4T); 4791 } 4792 4793 // Bootstrap UTF-8 static matcher interface 4794 // from 3 primitives: tab!(size), lookup and Sizes 4795 mixin template DefMatcher() 4796 { 4797 import std.format : format; 4798 import std.meta : Erase, staticIndexOf; 4799 enum hasASCII = staticIndexOf!(1, Sizes) >= 0; 4800 alias UniSizes = Erase!(1, Sizes); 4801 4802 //generate dispatch code sequence for unicode parts 4803 static auto genDispatch() 4804 { 4805 string code; 4806 foreach (size; UniSizes) 4807 code ~= format(q{ 4808 if ((ch & ~leadMask!%d) == encMask!(%d)) 4809 return lookup!(%d, mode)(inp); 4810 else 4811 }, size, size, size); 4812 static if (Sizes.length == 4) //covers all code unit cases 4813 code ~= "{ badEncoding(); return false; }"; 4814 else 4815 code ~= "return false;"; //may be just fine but not covered 4816 return code; 4817 } 4818 enum dispatch = genDispatch(); 4819 4820 public bool match(Range)(ref Range inp) const 4821 if (isRandomAccessRange!Range && is(ElementType!Range : char) && 4822 !isDynamicArray!Range) 4823 { 4824 enum mode = Mode.skipOnMatch; 4825 assert(!inp.empty); 4826 immutable ch = inp[0]; 4827 static if (hasASCII) 4828 { 4829 if (ch < 0x80) 4830 { 4831 immutable r = tab!1[ch]; 4832 if (r) 4833 inp.popFront(); 4834 return r; 4835 } 4836 else 4837 mixin(dispatch); 4838 } 4839 else 4840 mixin(dispatch); 4841 } 4842 4843 static if (Sizes.length == 4) // can skip iff can detect all encodings 4844 { 4845 public bool skip(Range)(ref Range inp) const 4846 if (isRandomAccessRange!Range && is(ElementType!Range : char) && 4847 !isDynamicArray!Range) 4848 { 4849 enum mode = Mode.alwaysSkip; 4850 assert(!inp.empty); 4851 auto ch = inp[0]; 4852 static if (hasASCII) 4853 { 4854 if (ch < 0x80) 4855 { 4856 inp.popFront(); 4857 return tab!1[ch]; 4858 } 4859 else 4860 mixin(dispatch); 4861 } 4862 else 4863 mixin(dispatch); 4864 } 4865 } 4866 4867 public bool test(Range)(ref Range inp) const 4868 if (isRandomAccessRange!Range && is(ElementType!Range : char) && 4869 !isDynamicArray!Range) 4870 { 4871 enum mode = Mode.neverSkip; 4872 assert(!inp.empty); 4873 auto ch = inp[0]; 4874 static if (hasASCII) 4875 { 4876 if (ch < 0x80) 4877 return tab!1[ch]; 4878 else 4879 mixin(dispatch); 4880 } 4881 else 4882 mixin(dispatch); 4883 } 4884 4885 bool match(C)(ref C[] str) const 4886 if (isSomeChar!C) 4887 { 4888 return fwdStr!"match"(str); 4889 } 4890 4891 bool skip(C)(ref C[] str) const 4892 if (isSomeChar!C) 4893 { 4894 return fwdStr!"skip"(str); 4895 } 4896 4897 bool test(C)(ref C[] str) const 4898 if (isSomeChar!C) 4899 { 4900 return fwdStr!"test"(str); 4901 } 4902 4903 mixin ForwardStrings; 4904 } 4905 4906 struct Impl(Sizes...) 4907 { 4908 import std.meta : allSatisfy, staticMap; 4909 static assert(allSatisfy!(validSize, Sizes), 4910 "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8"); 4911 private: 4912 //pick tables for chosen sizes 4913 alias OurTabs = staticMap!(Table, Sizes); 4914 OurTabs tables; 4915 mixin DefMatcher; 4916 //static disptach helper UTF size ==> table 4917 alias tab(int i) = tables[i - 1]; 4918 4919 package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)() 4920 { 4921 return CherryPick!(Impl, SizesToPick)(&this); 4922 } 4923 4924 bool lookup(int size, Mode mode, Range)(ref Range inp) const 4925 { 4926 import std.range : popFrontN; 4927 if (inp.length < size) 4928 { 4929 badEncoding(); 4930 return false; 4931 } 4932 char[size] needle = void; 4933 needle[0] = leadMask!size & inp[0]; 4934 static foreach (i; 1 .. size) 4935 { 4936 needle[i] = truncate(inp[i]); 4937 } 4938 //overlong encoding checks 4939 static if (size == 2) 4940 { 4941 //0x80-0x7FF 4942 //got 6 bits in needle[1], must use at least 8 bits 4943 //must use at least 2 bits in needle[1] 4944 if (needle[0] < 2) badEncoding(); 4945 } 4946 else static if (size == 3) 4947 { 4948 //0x800-0xFFFF 4949 //got 6 bits in needle[2], must use at least 12bits 4950 //must use 6 bits in needle[1] or anything in needle[0] 4951 if (needle[0] == 0 && needle[1] < 0x20) badEncoding(); 4952 } 4953 else static if (size == 4) 4954 { 4955 //0x800-0xFFFF 4956 //got 2x6=12 bits in needle[2 .. 3] must use at least 17bits 4957 //must use 5 bits (or above) in needle[1] or anything in needle[0] 4958 if (needle[0] == 0 && needle[1] < 0x10) badEncoding(); 4959 } 4960 static if (mode == Mode.alwaysSkip) 4961 { 4962 inp.popFrontN(size); 4963 return tab!size[needle]; 4964 } 4965 else static if (mode == Mode.neverSkip) 4966 { 4967 return tab!size[needle]; 4968 } 4969 else 4970 { 4971 static assert(mode == Mode.skipOnMatch); 4972 if (tab!size[needle]) 4973 { 4974 inp.popFrontN(size); 4975 return true; 4976 } 4977 else 4978 return false; 4979 } 4980 } 4981 } 4982 4983 struct CherryPick(I, Sizes...) 4984 { 4985 import std.meta : allSatisfy; 4986 static assert(allSatisfy!(validSize, Sizes), 4987 "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8"); 4988 private: 4989 I* m; 4990 @property auto tab(int i)() const { return m.tables[i - 1]; } 4991 bool lookup(int size, Mode mode, Range)(ref Range inp) const 4992 { 4993 return m.lookup!(size, mode)(inp); 4994 } 4995 mixin DefMatcher; 4996 } 4997 } 4998 4999 template Utf16Matcher() 5000 { 5001 enum validSize(int sz) = sz >= 1 && sz <= 2; 5002 5003 void badEncoding() pure @safe 5004 { 5005 import std.utf : UTFException; 5006 throw new UTFException("Invalid UTF-16 sequence"); 5007 } 5008 5009 // 1-stage ASCII 5010 alias AsciiSpec = AliasSeq!(bool, wchar, clamp!7); 5011 //2-stage BMP 5012 alias BmpSpec = AliasSeq!(bool, wchar, sliceBits!(7, 16), sliceBits!(0, 7)); 5013 //4-stage - full Unicode 5014 //assume that 0xD800 & 0xDC00 bits are cleared 5015 //thus leaving 10 bit per wchar to worry about 5016 alias UniSpec = AliasSeq!(bool, wchar[2], 5017 assumeSize!(x=>x[0]>>4, 6), assumeSize!(x=>x[0]&0xf, 4), 5018 assumeSize!(x=>x[1]>>6, 4), assumeSize!(x=>x[1]&0x3f, 6), 5019 ); 5020 alias Ascii = typeof(TrieBuilder!(AsciiSpec)(false).build()); 5021 alias Bmp = typeof(TrieBuilder!(BmpSpec)(false).build()); 5022 alias Uni = typeof(TrieBuilder!(UniSpec)(false).build()); 5023 5024 auto encode2(dchar ch) 5025 { 5026 ch -= 0x1_0000; 5027 assert(ch <= 0xF_FFFF); 5028 wchar[2] ret; 5029 //do not put surrogate bits, they are sliced off 5030 ret[0] = cast(wchar)(ch >> 10); 5031 ret[1] = (ch & 0xFFF); 5032 return ret; 5033 } 5034 5035 auto build(Set)(Set set) 5036 { 5037 import std.algorithm.iteration : map; 5038 auto ascii = set & unicode.ASCII; 5039 auto bmp = (set & CodepointSet.fromIntervals(0x80, 0xFFFF+1)) 5040 - CodepointSet.fromIntervals(0xD800, 0xDFFF+1); 5041 auto other = set - (bmp | ascii); 5042 auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec); 5043 auto bmpT = bmp.byCodepoint.map!(x=>cast(wchar) x).buildTrie!(BmpSpec); 5044 auto otherT = other.byCodepoint.map!(x=>encode2(x)).buildTrie!(UniSpec); 5045 alias Ret = Impl!(1,2); 5046 return Ret(asciiT, bmpT, otherT); 5047 } 5048 5049 //bootstrap full UTF-16 matcher interace from 5050 //sizeFlags, lookupUni and ascii 5051 mixin template DefMatcher() 5052 { 5053 public bool match(Range)(ref Range inp) const 5054 if (isRandomAccessRange!Range && is(ElementType!Range : wchar) && 5055 !isDynamicArray!Range) 5056 { 5057 enum mode = Mode.skipOnMatch; 5058 assert(!inp.empty); 5059 immutable ch = inp[0]; 5060 static if (sizeFlags & 1) 5061 { 5062 if (ch < 0x80) 5063 { 5064 if (ascii[ch]) 5065 { 5066 inp.popFront(); 5067 return true; 5068 } 5069 else 5070 return false; 5071 } 5072 return lookupUni!mode(inp); 5073 } 5074 else 5075 return lookupUni!mode(inp); 5076 } 5077 5078 static if (Sizes.length == 2) 5079 { 5080 public bool skip(Range)(ref Range inp) const 5081 if (isRandomAccessRange!Range && is(ElementType!Range : wchar) && 5082 !isDynamicArray!Range) 5083 { 5084 enum mode = Mode.alwaysSkip; 5085 assert(!inp.empty); 5086 immutable ch = inp[0]; 5087 static if (sizeFlags & 1) 5088 { 5089 if (ch < 0x80) 5090 { 5091 inp.popFront(); 5092 return ascii[ch]; 5093 } 5094 else 5095 return lookupUni!mode(inp); 5096 } 5097 else 5098 return lookupUni!mode(inp); 5099 } 5100 } 5101 5102 public bool test(Range)(ref Range inp) const 5103 if (isRandomAccessRange!Range && is(ElementType!Range : wchar) && 5104 !isDynamicArray!Range) 5105 { 5106 enum mode = Mode.neverSkip; 5107 assert(!inp.empty); 5108 auto ch = inp[0]; 5109 static if (sizeFlags & 1) 5110 return ch < 0x80 ? ascii[ch] : lookupUni!mode(inp); 5111 else 5112 return lookupUni!mode(inp); 5113 } 5114 5115 bool match(C)(ref C[] str) const 5116 if (isSomeChar!C) 5117 { 5118 return fwdStr!"match"(str); 5119 } 5120 5121 bool skip(C)(ref C[] str) const 5122 if (isSomeChar!C) 5123 { 5124 return fwdStr!"skip"(str); 5125 } 5126 5127 bool test(C)(ref C[] str) const 5128 if (isSomeChar!C) 5129 { 5130 return fwdStr!"test"(str); 5131 } 5132 5133 mixin ForwardStrings; //dispatch strings to range versions 5134 } 5135 5136 struct Impl(Sizes...) 5137 if (Sizes.length >= 1 && Sizes.length <= 2) 5138 { 5139 private: 5140 import std.meta : allSatisfy; 5141 static assert(allSatisfy!(validSize, Sizes), 5142 "Only lengths of 1 and 2 code units are possible in UTF-16"); 5143 static if (Sizes.length > 1) 5144 enum sizeFlags = Sizes[0] | Sizes[1]; 5145 else 5146 enum sizeFlags = Sizes[0]; 5147 5148 static if (sizeFlags & 1) 5149 { 5150 Ascii ascii; 5151 Bmp bmp; 5152 } 5153 static if (sizeFlags & 2) 5154 { 5155 Uni uni; 5156 } 5157 mixin DefMatcher; 5158 5159 package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)() 5160 { 5161 return CherryPick!(Impl, SizesToPick)(&this); 5162 } 5163 5164 bool lookupUni(Mode mode, Range)(ref Range inp) const 5165 { 5166 wchar x = cast(wchar)(inp[0] - 0xD800); 5167 //not a high surrogate 5168 if (x > 0x3FF) 5169 { 5170 //low surrogate 5171 if (x <= 0x7FF) badEncoding(); 5172 static if (sizeFlags & 1) 5173 { 5174 auto ch = inp[0]; 5175 static if (mode == Mode.alwaysSkip) 5176 inp.popFront(); 5177 static if (mode == Mode.skipOnMatch) 5178 { 5179 if (bmp[ch]) 5180 { 5181 inp.popFront(); 5182 return true; 5183 } 5184 else 5185 return false; 5186 } 5187 else 5188 return bmp[ch]; 5189 } 5190 else //skip is not available for sub-matchers, so just false 5191 return false; 5192 } 5193 else 5194 { 5195 import std.range : popFrontN; 5196 static if (sizeFlags & 2) 5197 { 5198 if (inp.length < 2) 5199 badEncoding(); 5200 wchar y = cast(wchar)(inp[1] - 0xDC00); 5201 //not a low surrogate 5202 if (y > 0x3FF) 5203 badEncoding(); 5204 wchar[2] needle = [inp[0] & 0x3ff, inp[1] & 0x3ff]; 5205 static if (mode == Mode.alwaysSkip) 5206 inp.popFrontN(2); 5207 static if (mode == Mode.skipOnMatch) 5208 { 5209 if (uni[needle]) 5210 { 5211 inp.popFrontN(2); 5212 return true; 5213 } 5214 else 5215 return false; 5216 } 5217 else 5218 return uni[needle]; 5219 } 5220 else //ditto 5221 return false; 5222 } 5223 } 5224 } 5225 5226 struct CherryPick(I, Sizes...) 5227 if (Sizes.length >= 1 && Sizes.length <= 2) 5228 { 5229 private: 5230 import std.meta : allSatisfy; 5231 I* m; 5232 enum sizeFlags = I.sizeFlags; 5233 5234 static if (sizeFlags & 1) 5235 { 5236 @property auto ascii()() const { return m.ascii; } 5237 } 5238 5239 bool lookupUni(Mode mode, Range)(ref Range inp) const 5240 { 5241 return m.lookupUni!mode(inp); 5242 } 5243 mixin DefMatcher; 5244 static assert(allSatisfy!(validSize, Sizes), 5245 "Only lengths of 1 and 2 code units are possible in UTF-16"); 5246 } 5247 } 5248 5249 private auto utf8Matcher(Set)(Set set) 5250 { 5251 return Utf8Matcher!().build(set); 5252 } 5253 5254 private auto utf16Matcher(Set)(Set set) 5255 { 5256 return Utf16Matcher!().build(set); 5257 } 5258 5259 /** 5260 Constructs a matcher object 5261 to classify $(CODEPOINTS) from the `set` for encoding 5262 that has `Char` as code unit. 5263 5264 See $(LREF MatcherConcept) for API outline. 5265 */ 5266 public auto utfMatcher(Char, Set)(Set set) 5267 if (isCodepointSet!Set) 5268 { 5269 static if (is(Char : char)) 5270 return utf8Matcher(set); 5271 else static if (is(Char : wchar)) 5272 return utf16Matcher(set); 5273 else static if (is(Char : dchar)) 5274 static assert(false, "UTF-32 needs no decoding, 5275 and thus not supported by utfMatcher"); 5276 else 5277 static assert(false, "Only character types 'char' and 'wchar' are allowed"); 5278 } 5279 5280 5281 //a range of code units, packed with index to speed up forward iteration 5282 package(std) auto decoder(C)(C[] s, size_t offset=0) 5283 if (is(C : wchar) || is(C : char)) 5284 { 5285 static struct Decoder 5286 { 5287 pure nothrow: 5288 C[] str; 5289 size_t idx; 5290 @property C front(){ return str[idx]; } 5291 @property C back(){ return str[$-1]; } 5292 void popFront(){ idx++; } 5293 void popBack(){ str = str[0..$-1]; } 5294 void popFrontN(size_t n){ idx += n; } 5295 @property bool empty(){ return idx == str.length; } 5296 @property auto save(){ return this; } 5297 auto opIndex(size_t i){ return str[idx+i]; } 5298 @property size_t length(){ return str.length - idx; } 5299 alias opDollar = length; 5300 auto opSlice(size_t a, size_t b){ return Decoder(str[0 .. idx+b], idx+a); } 5301 } 5302 static assert(isRandomAccessRange!Decoder); 5303 static assert(is(ElementType!Decoder : C)); 5304 return Decoder(s, offset); 5305 } 5306 5307 pure @safe unittest 5308 { 5309 string rs = "hi! ネемног砀 текста"; 5310 auto codec = rs.decoder; 5311 auto utf8 = utf8Matcher(unicode.Letter); 5312 auto asc = utf8.subMatcher!(1); 5313 auto uni = utf8.subMatcher!(2,3,4); 5314 assert(asc.test(codec)); 5315 assert(!uni.match(codec)); 5316 assert(utf8.skip(codec)); 5317 assert(codec.idx == 1); 5318 5319 assert(!uni.match(codec)); 5320 assert(asc.test(codec)); 5321 assert(utf8.skip(codec)); 5322 assert(codec.idx == 2); 5323 assert(!asc.match(codec)); 5324 5325 assert(!utf8.test(codec)); 5326 assert(!utf8.skip(codec)); 5327 5328 assert(!asc.test(codec)); 5329 assert(!utf8.test(codec)); 5330 assert(!utf8.skip(codec)); 5331 assert(utf8.test(codec)); 5332 foreach (i; 0 .. 7) 5333 { 5334 assert(!asc.test(codec)); 5335 assert(uni.test(codec)); 5336 assert(utf8.skip(codec)); 5337 } 5338 assert(!utf8.test(codec)); 5339 assert(!utf8.skip(codec)); 5340 //the same with match where applicable 5341 codec = rs.decoder; 5342 assert(utf8.match(codec)); 5343 assert(codec.idx == 1); 5344 assert(utf8.match(codec)); 5345 assert(codec.idx == 2); 5346 assert(!utf8.match(codec)); 5347 assert(codec.idx == 2); 5348 assert(!utf8.skip(codec)); 5349 assert(!utf8.skip(codec)); 5350 5351 foreach (i; 0 .. 7) 5352 { 5353 assert(!asc.test(codec)); 5354 assert(utf8.test(codec)); 5355 assert(utf8.match(codec)); 5356 } 5357 auto i = codec.idx; 5358 assert(!utf8.match(codec)); 5359 assert(codec.idx == i); 5360 } 5361 5362 pure @safe unittest 5363 { 5364 import std.range : stride; 5365 static bool testAll(Matcher, Range)(scope ref Matcher m, ref Range r) 5366 { 5367 bool t = m.test(r); 5368 auto save = r.idx; 5369 assert(t == m.match(r)); 5370 assert(r.idx == save || t); //ether no change or was match 5371 r.idx = save; 5372 static if (is(typeof(m.skip(r)))) 5373 { 5374 assert(t == m.skip(r)); 5375 assert(r.idx != save); //always changed 5376 r.idx = save; 5377 } 5378 return t; 5379 } 5380 auto utf16 = utfMatcher!wchar(unicode.L); 5381 auto bmp = utf16.subMatcher!1; 5382 auto nonBmp = utf16.subMatcher!1; 5383 auto utf8 = utfMatcher!char(unicode.L); 5384 auto ascii = utf8.subMatcher!1; 5385 auto uni2 = utf8.subMatcher!2; 5386 auto uni3 = utf8.subMatcher!3; 5387 auto uni24 = utf8.subMatcher!(2,4); 5388 foreach (ch; unicode.L.byCodepoint.stride(3)) 5389 { 5390 import std.utf : encode; 5391 char[4] buf; 5392 wchar[2] buf16; 5393 auto len = encode(buf, ch); 5394 auto len16 = encode(buf16, ch); 5395 auto c8 = buf[0 .. len].decoder; 5396 auto c16 = buf16[0 .. len16].decoder; 5397 assert(testAll(utf16, c16)); 5398 assert(testAll(bmp, c16) || len16 != 1); 5399 assert(testAll(nonBmp, c16) || len16 != 2); 5400 5401 assert(testAll(utf8, c8)); 5402 5403 //submatchers return false on out of their domain 5404 assert(testAll(ascii, c8) || len != 1); 5405 assert(testAll(uni2, c8) || len != 2); 5406 assert(testAll(uni3, c8) || len != 3); 5407 assert(testAll(uni24, c8) || (len != 2 && len != 4)); 5408 } 5409 } 5410 5411 // cover decode fail cases of Matcher 5412 pure @system unittest 5413 { 5414 import std.algorithm.iteration : map; 5415 import std.exception : collectException; 5416 import std.format : format; 5417 auto utf16 = utfMatcher!wchar(unicode.L); 5418 auto utf8 = utfMatcher!char(unicode.L); 5419 //decode failure cases UTF-8 5420 alias fails8 = AliasSeq!("\xC1", "\x80\x00","\xC0\x00", "\xCF\x79", 5421 "\xFF\x00\0x00\0x00\x00", "\xC0\0x80\0x80\x80", "\x80\0x00\0x00\x00", 5422 "\xCF\x00\0x00\0x00\x00"); 5423 foreach (msg; fails8) 5424 { 5425 assert(collectException((){ 5426 auto s = msg; 5427 size_t idx = 0; 5428 utf8.test(s); 5429 }()), format("%( %2x %)", cast(ubyte[]) msg)); 5430 } 5431 //decode failure cases UTF-16 5432 alias fails16 = AliasSeq!([0xD811], [0xDC02]); 5433 foreach (msg; fails16) 5434 { 5435 assert(collectException((){ 5436 auto s = msg.map!(x => cast(wchar) x); 5437 utf16.test(s); 5438 }())); 5439 } 5440 } 5441 5442 /++ 5443 Convenience function to construct optimal configurations for 5444 packed Trie from any `set` of $(CODEPOINTS). 5445 5446 The parameter `level` indicates the number of trie levels to use, 5447 allowed values are: 1, 2, 3 or 4. Levels represent different trade-offs 5448 speed-size wise. 5449 5450 $(P Level 1 is fastest and the most memory hungry (a bit array). ) 5451 $(P Level 4 is the slowest and has the smallest footprint. ) 5452 5453 See the $(S_LINK Synopsis, Synopsis) section for example. 5454 5455 Note: 5456 Level 4 stays very practical (being faster and more predictable) 5457 compared to using direct lookup on the `set` itself. 5458 5459 5460 +/ 5461 public auto toTrie(size_t level, Set)(Set set) 5462 if (isCodepointSet!Set) 5463 { 5464 static if (level == 1) 5465 return codepointSetTrie!(21)(set); 5466 else static if (level == 2) 5467 return codepointSetTrie!(10, 11)(set); 5468 else static if (level == 3) 5469 return codepointSetTrie!(8, 5, 8)(set); 5470 else static if (level == 4) 5471 return codepointSetTrie!(6, 4, 4, 7)(set); 5472 else 5473 static assert(false, 5474 "Sorry, toTrie doesn't support levels > 4, use codepointSetTrie directly"); 5475 } 5476 5477 /** 5478 $(P Builds a `Trie` with typically optimal speed-size trade-off 5479 and wraps it into a delegate of the following type: 5480 $(D bool delegate(dchar ch)). ) 5481 5482 $(P Effectively this creates a 'tester' lambda suitable 5483 for algorithms like std.algorithm.find that take unary predicates. ) 5484 5485 See the $(S_LINK Synopsis, Synopsis) section for example. 5486 */ 5487 public auto toDelegate(Set)(Set set) 5488 if (isCodepointSet!Set) 5489 { 5490 // 3 is very small and is almost as fast as 2-level (due to CPU caches?) 5491 auto t = toTrie!3(set); 5492 return (dchar ch) => t[ch]; 5493 } 5494 5495 /** 5496 $(P Opaque wrapper around unsigned built-in integers and 5497 code unit (char/wchar/dchar) types. 5498 Parameter `sz` indicates that the value is confined 5499 to the range of [0, 2^^sz$(RPAREN). With this knowledge it can be 5500 packed more tightly when stored in certain 5501 data-structures like trie. ) 5502 5503 Note: 5504 $(P The $(D BitPacked!(T, sz)) is implicitly convertible to `T` 5505 but not vise-versa. Users have to ensure the value fits in 5506 the range required and use the `cast` 5507 operator to perform the conversion.) 5508 */ 5509 struct BitPacked(T, size_t sz) 5510 if (isIntegral!T || is(T:dchar)) 5511 { 5512 enum bitSize = sz; 5513 T _value; 5514 alias _value this; 5515 } 5516 5517 /* 5518 Depending on the form of the passed argument `bitSizeOf` returns 5519 the amount of bits required to represent a given type 5520 or a return type of a given functor. 5521 */ 5522 template bitSizeOf(Args...) 5523 if (Args.length == 1) 5524 { 5525 import std.traits : ReturnType; 5526 alias T = Args[0]; 5527 static if (__traits(compiles, { size_t val = T.bitSize; })) //(is(typeof(T.bitSize) : size_t)) 5528 { 5529 enum bitSizeOf = T.bitSize; 5530 } 5531 else static if (is(ReturnType!T dummy == BitPacked!(U, bits), U, size_t bits)) 5532 { 5533 enum bitSizeOf = bitSizeOf!(ReturnType!T); 5534 } 5535 else 5536 { 5537 enum bitSizeOf = T.sizeof*8; 5538 } 5539 } 5540 5541 /** 5542 Tests if `T` is some instantiation of $(LREF BitPacked)!(U, x) 5543 and thus suitable for packing. 5544 */ 5545 template isBitPacked(T) 5546 { 5547 static if (is(T dummy == BitPacked!(U, bits), U, size_t bits)) 5548 enum isBitPacked = true; 5549 else 5550 enum isBitPacked = false; 5551 } 5552 5553 /** 5554 Gives the type `U` from $(LREF BitPacked)!(U, x) 5555 or `T` itself for every other type. 5556 */ 5557 template TypeOfBitPacked(T) 5558 { 5559 static if (is(T dummy == BitPacked!(U, bits), U, size_t bits)) 5560 alias TypeOfBitPacked = U; 5561 else 5562 alias TypeOfBitPacked = T; 5563 } 5564 5565 /* 5566 Wrapper, used in definition of custom data structures from `Trie` template. 5567 Applying it to a unary lambda function indicates that the returned value always 5568 fits within `bits` of bits. 5569 */ 5570 struct assumeSize(alias Fn, size_t bits) 5571 { 5572 enum bitSize = bits; 5573 static auto ref opCall(T)(auto ref T arg) 5574 { 5575 return Fn(arg); 5576 } 5577 } 5578 5579 /* 5580 A helper for defining lambda function that yields a slice 5581 of certain bits from an unsigned integral value. 5582 The resulting lambda is wrapped in assumeSize and can be used directly 5583 with `Trie` template. 5584 */ 5585 struct sliceBits(size_t from, size_t to) 5586 { 5587 //for now bypass assumeSize, DMD has trouble inlining it 5588 enum bitSize = to-from; 5589 static auto opCall(T)(T x) 5590 out(result) 5591 { 5592 assert(result < (1 << to-from)); 5593 } 5594 do 5595 { 5596 static assert(from < to); 5597 static if (from == 0) 5598 return x & ((1 << to)-1); 5599 else 5600 return (x >> from) & ((1<<(to-from))-1); 5601 } 5602 } 5603 5604 @safe pure nothrow @nogc uint low_8(uint x) { return x&0xFF; } 5605 @safe pure nothrow @nogc uint midlow_8(uint x){ return (x&0xFF00)>>8; } 5606 alias lo8 = assumeSize!(low_8, 8); 5607 alias mlo8 = assumeSize!(midlow_8, 8); 5608 5609 @safe pure nothrow @nogc unittest 5610 { 5611 static assert(bitSizeOf!lo8 == 8); 5612 static assert(bitSizeOf!(sliceBits!(4, 7)) == 3); 5613 static assert(bitSizeOf!(BitPacked!(uint, 2)) == 2); 5614 } 5615 5616 template Sequence(size_t start, size_t end) 5617 { 5618 static if (start < end) 5619 alias Sequence = AliasSeq!(start, Sequence!(start+1, end)); 5620 else 5621 alias Sequence = AliasSeq!(); 5622 } 5623 5624 //---- TRIE TESTS ---- 5625 @system unittest 5626 { 5627 import std.algorithm.iteration : map; 5628 import std.algorithm.sorting : sort; 5629 import std.array : array; 5630 import std.conv : text, to; 5631 import std.range : iota; 5632 static trieStats(TRIE)(TRIE t) 5633 { 5634 version (std_uni_stats) 5635 { 5636 import std.stdio : writefln, writeln; 5637 writeln("---TRIE FOOTPRINT STATS---"); 5638 static foreach (i; 0 .. t.table.dim) 5639 { 5640 writefln("lvl%s = %s bytes; %s pages" 5641 , i, t.bytes!i, t.pages!i); 5642 } 5643 writefln("TOTAL: %s bytes", t.bytes); 5644 version (none) 5645 { 5646 writeln("INDEX (excluding value level):"); 5647 static foreach (i; 0 .. t.table.dim-1) 5648 writeln(t.table.slice!(i)[0 .. t.table.length!i]); 5649 } 5650 writeln("---------------------------"); 5651 } 5652 } 5653 //@@@BUG link failure, lambdas not found by linker somehow (in case of trie2) 5654 // alias lo8 = assumeSize!(8, function (uint x) { return x&0xFF; }); 5655 // alias next8 = assumeSize!(7, function (uint x) { return (x&0x7F00)>>8; }); 5656 alias Set = CodepointSet; 5657 auto set = Set('A','Z','a','z'); 5658 auto trie = buildTrie!(bool, uint, 256, lo8)(set.byInterval);// simple bool array 5659 for (int a='a'; a<'z';a++) 5660 assert(trie[a]); 5661 for (int a='A'; a<'Z';a++) 5662 assert(trie[a]); 5663 for (int a=0; a<'A'; a++) 5664 assert(!trie[a]); 5665 for (int a ='Z'; a<'a'; a++) 5666 assert(!trie[a]); 5667 trieStats(trie); 5668 5669 auto redundant2 = Set( 5670 1, 18, 256+2, 256+111, 512+1, 512+18, 768+2, 768+111); 5671 auto trie2 = buildTrie!(bool, uint, 1024, mlo8, lo8)(redundant2.byInterval); 5672 trieStats(trie2); 5673 foreach (e; redundant2.byCodepoint) 5674 assert(trie2[e], text(cast(uint) e, " - ", trie2[e])); 5675 foreach (i; 0 .. 1024) 5676 { 5677 assert(trie2[i] == (i in redundant2)); 5678 } 5679 5680 5681 auto redundant3 = Set( 5682 2, 4, 6, 8, 16, 5683 2+16, 4+16, 16+6, 16+8, 16+16, 5684 2+32, 4+32, 32+6, 32+8, 5685 ); 5686 5687 enum max3 = 256; 5688 // sliceBits 5689 auto trie3 = buildTrie!(bool, uint, max3, 5690 sliceBits!(6,8), sliceBits!(4,6), sliceBits!(0,4) 5691 )(redundant3.byInterval); 5692 trieStats(trie3); 5693 foreach (i; 0 .. max3) 5694 assert(trie3[i] == (i in redundant3), text(cast(uint) i)); 5695 5696 auto redundant4 = Set( 5697 10, 64, 64+10, 128, 128+10, 256, 256+10, 512, 5698 1000, 2000, 3000, 4000, 5000, 6000 5699 ); 5700 enum max4 = 2^^16; 5701 auto trie4 = buildTrie!(bool, size_t, max4, 5702 sliceBits!(13, 16), sliceBits!(9, 13), sliceBits!(6, 9) , sliceBits!(0, 6) 5703 )(redundant4.byInterval); 5704 foreach (i; 0 .. max4) 5705 { 5706 if (i in redundant4) 5707 assert(trie4[i], text(cast(uint) i)); 5708 } 5709 trieStats(trie4); 5710 5711 alias mapToS = mapTrieIndex!(useItemAt!(0, char)); 5712 string[] redundantS = ["tea", "start", "orange"]; 5713 redundantS.sort!((a,b) => mapToS(a) < mapToS(b))(); 5714 auto strie = buildTrie!(bool, string, useItemAt!(0, char))(redundantS); 5715 // using first char only 5716 assert(redundantS == ["orange", "start", "tea"]); 5717 assert(strie["test"], text(strie["test"])); 5718 assert(!strie["aea"]); 5719 assert(strie["s"]); 5720 5721 // a bit size test 5722 auto a = array(map!(x => to!ubyte(x))(iota(0, 256))); 5723 auto bt = buildTrie!(bool, ubyte, sliceBits!(7, 8), sliceBits!(5, 7), sliceBits!(0, 5))(a); 5724 trieStats(bt); 5725 foreach (i; 0 .. 256) 5726 assert(bt[cast(ubyte) i]); 5727 } 5728 5729 template useItemAt(size_t idx, T) 5730 if (isIntegral!T || is(T: dchar)) 5731 { 5732 size_t impl(const scope T[] arr){ return arr[idx]; } 5733 alias useItemAt = assumeSize!(impl, 8*T.sizeof); 5734 } 5735 5736 template useLastItem(T) 5737 { 5738 size_t impl(const scope T[] arr){ return arr[$-1]; } 5739 alias useLastItem = assumeSize!(impl, 8*T.sizeof); 5740 } 5741 5742 template fullBitSize(Prefix...) 5743 { 5744 static if (Prefix.length > 0) 5745 enum fullBitSize = bitSizeOf!(Prefix[0])+fullBitSize!(Prefix[1..$]); 5746 else 5747 enum fullBitSize = 0; 5748 } 5749 5750 template idxTypes(Key, size_t fullBits, Prefix...) 5751 { 5752 static if (Prefix.length == 1) 5753 {// the last level is value level, so no index once reduced to 1-level 5754 alias idxTypes = AliasSeq!(); 5755 } 5756 else 5757 { 5758 // Important note on bit packing 5759 // Each level has to hold enough of bits to address the next one 5760 // The bottom level is known to hold full bit width 5761 // thus it's size in pages is full_bit_width - size_of_last_prefix 5762 // Recourse on this notion 5763 alias idxTypes = 5764 AliasSeq!( 5765 idxTypes!(Key, fullBits - bitSizeOf!(Prefix[$-1]), Prefix[0..$-1]), 5766 BitPacked!(typeof(Prefix[$-2](Key.init)), fullBits - bitSizeOf!(Prefix[$-1])) 5767 ); 5768 } 5769 } 5770 5771 //============================================================================ 5772 5773 @safe pure int comparePropertyName(Char1, Char2)(const(Char1)[] a, const(Char2)[] b) 5774 if (is(Char1 : dchar) && is(Char2 : dchar)) 5775 { 5776 import std.algorithm.comparison : cmp; 5777 import std.algorithm.iteration : map, filter; 5778 import std.ascii : toLower; 5779 static bool pred(dchar c) {return !c.isWhite && c != '-' && c != '_';} 5780 return cmp( 5781 a.map!toLower.filter!pred, 5782 b.map!toLower.filter!pred); 5783 } 5784 5785 @safe pure unittest 5786 { 5787 assert(!comparePropertyName("foo-bar", "fooBar")); 5788 } 5789 5790 bool propertyNameLess(Char1, Char2)(const(Char1)[] a, const(Char2)[] b) @safe pure 5791 if (is(Char1 : dchar) && is(Char2 : dchar)) 5792 { 5793 return comparePropertyName(a, b) < 0; 5794 } 5795 5796 //============================================================================ 5797 // Utilities for compression of Unicode code point sets 5798 //============================================================================ 5799 5800 @safe void compressTo(uint val, ref ubyte[] arr) pure nothrow 5801 { 5802 // not optimized as usually done 1 time (and not public interface) 5803 if (val < 128) 5804 arr ~= cast(ubyte) val; 5805 else if (val < (1 << 13)) 5806 { 5807 arr ~= (0b1_00 << 5) | cast(ubyte)(val >> 8); 5808 arr ~= val & 0xFF; 5809 } 5810 else 5811 { 5812 assert(val < (1 << 21)); 5813 arr ~= (0b1_01 << 5) | cast(ubyte)(val >> 16); 5814 arr ~= (val >> 8) & 0xFF; 5815 arr ~= val & 0xFF; 5816 } 5817 } 5818 5819 @safe uint decompressFrom(const(ubyte)[] arr, ref size_t idx) pure 5820 { 5821 import std.exception : enforce; 5822 immutable first = arr[idx++]; 5823 if (!(first & 0x80)) // no top bit -> [0 .. 127] 5824 return first; 5825 immutable extra = ((first >> 5) & 1) + 1; // [1, 2] 5826 uint val = (first & 0x1F); 5827 enforce(idx + extra <= arr.length, "bad code point interval encoding"); 5828 foreach (j; 0 .. extra) 5829 val = (val << 8) | arr[idx+j]; 5830 idx += extra; 5831 return val; 5832 } 5833 5834 5835 package(std) ubyte[] compressIntervals(Range)(Range intervals) 5836 if (isInputRange!Range && isIntegralPair!(ElementType!Range)) 5837 { 5838 ubyte[] storage; 5839 uint base = 0; 5840 // RLE encode 5841 foreach (val; intervals) 5842 { 5843 compressTo(val[0]-base, storage); 5844 base = val[0]; 5845 if (val[1] != lastDchar+1) // till the end of the domain so don't store it 5846 { 5847 compressTo(val[1]-base, storage); 5848 base = val[1]; 5849 } 5850 } 5851 return storage; 5852 } 5853 5854 @safe pure unittest 5855 { 5856 import std.algorithm.comparison : equal; 5857 import std.typecons : tuple; 5858 5859 auto run = [tuple(80, 127), tuple(128, (1 << 10)+128)]; 5860 ubyte[] enc = [cast(ubyte) 80, 47, 1, (0b1_00 << 5) | (1 << 2), 0]; 5861 assert(compressIntervals(run) == enc); 5862 auto run2 = [tuple(0, (1 << 20)+512+1), tuple((1 << 20)+512+4, lastDchar+1)]; 5863 ubyte[] enc2 = [cast(ubyte) 0, (0b1_01 << 5) | (1 << 4), 2, 1, 3]; // odd length-ed 5864 assert(compressIntervals(run2) == enc2); 5865 size_t idx = 0; 5866 assert(decompressFrom(enc, idx) == 80); 5867 assert(decompressFrom(enc, idx) == 47); 5868 assert(decompressFrom(enc, idx) == 1); 5869 assert(decompressFrom(enc, idx) == (1 << 10)); 5870 idx = 0; 5871 assert(decompressFrom(enc2, idx) == 0); 5872 assert(decompressFrom(enc2, idx) == (1 << 20)+512+1); 5873 assert(equal(decompressIntervals(compressIntervals(run)), run)); 5874 assert(equal(decompressIntervals(compressIntervals(run2)), run2)); 5875 } 5876 5877 // Creates a range of `CodepointInterval` that lazily decodes compressed data. 5878 @safe package(std) auto decompressIntervals(const(ubyte)[] data) pure 5879 { 5880 return DecompressedIntervals(data); 5881 } 5882 5883 @safe struct DecompressedIntervals 5884 { 5885 pure: 5886 const(ubyte)[] _stream; 5887 size_t _idx; 5888 CodepointInterval _front; 5889 5890 this(const(ubyte)[] stream) 5891 { 5892 _stream = stream; 5893 popFront(); 5894 } 5895 5896 @property CodepointInterval front() 5897 { 5898 assert(!empty); 5899 return _front; 5900 } 5901 5902 void popFront() 5903 { 5904 if (_idx == _stream.length) 5905 { 5906 _idx = size_t.max; 5907 return; 5908 } 5909 uint base = _front[1]; 5910 _front[0] = base + decompressFrom(_stream, _idx); 5911 if (_idx == _stream.length)// odd length ---> till the end 5912 _front[1] = lastDchar+1; 5913 else 5914 { 5915 base = _front[0]; 5916 _front[1] = base + decompressFrom(_stream, _idx); 5917 } 5918 } 5919 5920 @property bool empty() const 5921 { 5922 return _idx == size_t.max; 5923 } 5924 5925 @property DecompressedIntervals save() { return this; } 5926 } 5927 5928 @safe pure nothrow @nogc unittest 5929 { 5930 static assert(isInputRange!DecompressedIntervals); 5931 static assert(isForwardRange!DecompressedIntervals); 5932 } 5933 5934 //============================================================================ 5935 5936 version (std_uni_bootstrap){} 5937 else 5938 { 5939 5940 // helper for looking up code point sets 5941 ptrdiff_t findUnicodeSet(alias table, C)(const scope C[] name) 5942 { 5943 import std.algorithm.iteration : map; 5944 import std.range : assumeSorted; 5945 auto range = assumeSorted!((a,b) => propertyNameLess(a,b)) 5946 (table.map!"a.name"()); 5947 size_t idx = range.lowerBound(name).length; 5948 if (idx < range.length && comparePropertyName(range[idx], name) == 0) 5949 return idx; 5950 return -1; 5951 } 5952 5953 // another one that loads it 5954 bool loadUnicodeSet(alias table, Set, C)(const scope C[] name, ref Set dest) 5955 { 5956 auto idx = findUnicodeSet!table(name); 5957 if (idx >= 0) 5958 { 5959 dest = Set(asSet(table[idx].compressed)); 5960 return true; 5961 } 5962 return false; 5963 } 5964 5965 bool loadProperty(Set=CodepointSet, C) 5966 (const scope C[] name, ref Set target) pure 5967 { 5968 import std.internal.unicode_tables : uniProps; // generated file 5969 alias ucmp = comparePropertyName; 5970 // conjure cumulative properties by hand 5971 if (ucmp(name, "L") == 0 || ucmp(name, "Letter") == 0) 5972 { 5973 target = asSet(uniProps.Lu); 5974 target |= asSet(uniProps.Ll); 5975 target |= asSet(uniProps.Lt); 5976 target |= asSet(uniProps.Lo); 5977 target |= asSet(uniProps.Lm); 5978 } 5979 else if (ucmp(name,"LC") == 0 || ucmp(name,"Cased Letter")==0) 5980 { 5981 target = asSet(uniProps.Ll); 5982 target |= asSet(uniProps.Lu); 5983 target |= asSet(uniProps.Lt);// Title case 5984 } 5985 else if (ucmp(name, "M") == 0 || ucmp(name, "Mark") == 0) 5986 { 5987 target = asSet(uniProps.Mn); 5988 target |= asSet(uniProps.Mc); 5989 target |= asSet(uniProps.Me); 5990 } 5991 else if (ucmp(name, "N") == 0 || ucmp(name, "Number") == 0) 5992 { 5993 target = asSet(uniProps.Nd); 5994 target |= asSet(uniProps.Nl); 5995 target |= asSet(uniProps.No); 5996 } 5997 else if (ucmp(name, "P") == 0 || ucmp(name, "Punctuation") == 0) 5998 { 5999 target = asSet(uniProps.Pc); 6000 target |= asSet(uniProps.Pd); 6001 target |= asSet(uniProps.Ps); 6002 target |= asSet(uniProps.Pe); 6003 target |= asSet(uniProps.Pi); 6004 target |= asSet(uniProps.Pf); 6005 target |= asSet(uniProps.Po); 6006 } 6007 else if (ucmp(name, "S") == 0 || ucmp(name, "Symbol") == 0) 6008 { 6009 target = asSet(uniProps.Sm); 6010 target |= asSet(uniProps.Sc); 6011 target |= asSet(uniProps.Sk); 6012 target |= asSet(uniProps.So); 6013 } 6014 else if (ucmp(name, "Z") == 0 || ucmp(name, "Separator") == 0) 6015 { 6016 target = asSet(uniProps.Zs); 6017 target |= asSet(uniProps.Zl); 6018 target |= asSet(uniProps.Zp); 6019 } 6020 else if (ucmp(name, "C") == 0 || ucmp(name, "Other") == 0) 6021 { 6022 target = asSet(uniProps.Co); 6023 target |= asSet(uniProps.Lo); 6024 target |= asSet(uniProps.No); 6025 target |= asSet(uniProps.So); 6026 target |= asSet(uniProps.Po); 6027 } 6028 else if (ucmp(name, "graphical") == 0) 6029 { 6030 target = asSet(uniProps.Alphabetic); 6031 6032 target |= asSet(uniProps.Mn); 6033 target |= asSet(uniProps.Mc); 6034 target |= asSet(uniProps.Me); 6035 6036 target |= asSet(uniProps.Nd); 6037 target |= asSet(uniProps.Nl); 6038 target |= asSet(uniProps.No); 6039 6040 target |= asSet(uniProps.Pc); 6041 target |= asSet(uniProps.Pd); 6042 target |= asSet(uniProps.Ps); 6043 target |= asSet(uniProps.Pe); 6044 target |= asSet(uniProps.Pi); 6045 target |= asSet(uniProps.Pf); 6046 target |= asSet(uniProps.Po); 6047 6048 target |= asSet(uniProps.Zs); 6049 6050 target |= asSet(uniProps.Sm); 6051 target |= asSet(uniProps.Sc); 6052 target |= asSet(uniProps.Sk); 6053 target |= asSet(uniProps.So); 6054 } 6055 else if (ucmp(name, "any") == 0) 6056 target = Set.fromIntervals(0, 0x110000); 6057 else if (ucmp(name, "ascii") == 0) 6058 target = Set.fromIntervals(0, 0x80); 6059 else 6060 return loadUnicodeSet!(uniProps.tab)(name, target); 6061 return true; 6062 } 6063 6064 // CTFE-only helper for checking property names at compile-time 6065 @safe bool isPrettyPropertyName(C)(const scope C[] name) 6066 { 6067 import std.algorithm.searching : find; 6068 auto names = [ 6069 "L", "Letter", 6070 "LC", "Cased Letter", 6071 "M", "Mark", 6072 "N", "Number", 6073 "P", "Punctuation", 6074 "S", "Symbol", 6075 "Z", "Separator", 6076 "Graphical", 6077 "any", 6078 "ascii" 6079 ]; 6080 auto x = find!(x => comparePropertyName(x, name) == 0)(names); 6081 return !x.empty; 6082 } 6083 6084 // ditto, CTFE-only, not optimized 6085 @safe private static bool findSetName(alias table, C)(const scope C[] name) 6086 { 6087 return findUnicodeSet!table(name) >= 0; 6088 } 6089 6090 template SetSearcher(alias table, string kind) 6091 { 6092 /// Run-time checked search. 6093 static auto opCall(C)(const scope C[] name) 6094 if (is(C : dchar)) 6095 { 6096 import std.conv : to; 6097 CodepointSet set; 6098 if (loadUnicodeSet!table(name, set)) 6099 return set; 6100 throw new Exception("No unicode set for "~kind~" by name " 6101 ~name.to!string()~" was found."); 6102 } 6103 /// Compile-time checked search. 6104 static @property auto opDispatch(string name)() 6105 { 6106 static if (findSetName!table(name)) 6107 { 6108 CodepointSet set; 6109 loadUnicodeSet!table(name, set); 6110 return set; 6111 } 6112 else 6113 static assert(false, "No unicode set for "~kind~" by name " 6114 ~name~" was found."); 6115 } 6116 } 6117 6118 // Characters that need escaping in string posed as regular expressions 6119 package(std) alias Escapables = AliasSeq!('[', ']', '\\', '^', '$', '.', '|', '?', ',', '-', 6120 ';', ':', '#', '&', '%', '/', '<', '>', '`', '*', '+', '(', ')', '{', '}', '~'); 6121 6122 package(std) CodepointSet memoizeExpr(string expr)() 6123 { 6124 if (__ctfe) 6125 return mixin(expr); 6126 alias T = typeof(mixin(expr)); 6127 static T slot; 6128 static bool initialized; 6129 if (!initialized) 6130 { 6131 slot = mixin(expr); 6132 initialized = true; 6133 } 6134 return slot; 6135 } 6136 6137 //property for \w character class 6138 package(std) @property CodepointSet wordCharacter() @safe 6139 { 6140 return memoizeExpr!("unicode.Alphabetic | unicode.Mn | unicode.Mc 6141 | unicode.Me | unicode.Nd | unicode.Pc")(); 6142 } 6143 6144 //basic stack, just in case it gets used anywhere else then Parser 6145 package(std) struct Stack(T) 6146 { 6147 @safe: 6148 T[] data; 6149 @property bool empty(){ return data.empty; } 6150 6151 @property size_t length(){ return data.length; } 6152 6153 void push(T val){ data ~= val; } 6154 6155 @trusted T pop() 6156 { 6157 assert(!empty); 6158 auto val = data[$ - 1]; 6159 data = data[0 .. $ - 1]; 6160 if (!__ctfe) 6161 cast(void) data.assumeSafeAppend(); 6162 return val; 6163 } 6164 6165 @property ref T top() 6166 { 6167 assert(!empty); 6168 return data[$ - 1]; 6169 } 6170 } 6171 6172 //test if a given string starts with hex number of maxDigit that's a valid codepoint 6173 //returns it's value and skips these maxDigit chars on success, throws on failure 6174 package(std) dchar parseUniHex(Range)(ref Range str, size_t maxDigit) 6175 { 6176 import std.exception : enforce; 6177 //std.conv.parse is both @system and bogus 6178 uint val; 6179 for (int k = 0; k < maxDigit; k++) 6180 { 6181 enforce(!str.empty, "incomplete escape sequence"); 6182 //accepts ascii only, so it's OK to index directly 6183 immutable current = str.front; 6184 if ('0' <= current && current <= '9') 6185 val = val * 16 + current - '0'; 6186 else if ('a' <= current && current <= 'f') 6187 val = val * 16 + current -'a' + 10; 6188 else if ('A' <= current && current <= 'F') 6189 val = val * 16 + current - 'A' + 10; 6190 else 6191 throw new Exception("invalid escape sequence"); 6192 str.popFront(); 6193 } 6194 enforce(val <= 0x10FFFF, "invalid codepoint"); 6195 return val; 6196 } 6197 6198 @safe unittest 6199 { 6200 import std.algorithm.searching : canFind; 6201 import std.exception : collectException; 6202 string[] non_hex = [ "000j", "000z", "FffG", "0Z"]; 6203 string[] hex = [ "01", "ff", "00af", "10FFFF" ]; 6204 int[] value = [ 1, 0xFF, 0xAF, 0x10FFFF ]; 6205 foreach (v; non_hex) 6206 assert(collectException(parseUniHex(v, v.length)).msg 6207 .canFind("invalid escape sequence")); 6208 foreach (i, v; hex) 6209 assert(parseUniHex(v, v.length) == value[i]); 6210 string over = "0011FFFF"; 6211 assert(collectException(parseUniHex(over, over.length)).msg 6212 .canFind("invalid codepoint")); 6213 } 6214 6215 auto caseEnclose(CodepointSet set) 6216 { 6217 auto cased = set & unicode.LC; 6218 foreach (dchar ch; cased.byCodepoint) 6219 { 6220 foreach (c; simpleCaseFoldings(ch)) 6221 set |= c; 6222 } 6223 return set; 6224 } 6225 6226 /+ 6227 fetch codepoint set corresponding to a name (InBlock or binary property) 6228 +/ 6229 CodepointSet getUnicodeSet(const scope char[] name, bool negated, bool casefold) @safe 6230 { 6231 CodepointSet s = unicode(name); 6232 //FIXME: caseEnclose for new uni as Set | CaseEnclose(SET && LC) 6233 if (casefold) 6234 s = caseEnclose(s); 6235 if (negated) 6236 s = s.inverted; 6237 return s; 6238 } 6239 6240 struct UnicodeSetParser(Range) 6241 { 6242 import std.exception : enforce; 6243 import std.typecons : tuple, Tuple; 6244 Range range; 6245 bool casefold_; 6246 6247 @property bool empty(){ return range.empty; } 6248 @property dchar front(){ return range.front; } 6249 void popFront(){ range.popFront(); } 6250 6251 //CodepointSet operations relatively in order of priority 6252 enum Operator:uint { 6253 Open = 0, Negate, Difference, SymDifference, Intersection, Union, None 6254 } 6255 6256 //parse unit of CodepointSet spec, most notably escape sequences and char ranges 6257 //also fetches next set operation 6258 Tuple!(CodepointSet,Operator) parseCharTerm() 6259 { 6260 import std.range : drop; 6261 enum privateUseStart = '\U000F0000', privateUseEnd ='\U000FFFFD'; 6262 enum State{ Start, Char, Escape, CharDash, CharDashEscape, 6263 PotentialTwinSymbolOperator } 6264 Operator op = Operator.None; 6265 dchar last; 6266 CodepointSet set; 6267 State state = State.Start; 6268 6269 void addWithFlags(ref CodepointSet set, uint ch) 6270 { 6271 if (casefold_) 6272 { 6273 auto range = simpleCaseFoldings(ch); 6274 foreach (v; range) 6275 set |= v; 6276 } 6277 else 6278 set |= ch; 6279 } 6280 6281 static Operator twinSymbolOperator(dchar symbol) 6282 { 6283 switch (symbol) 6284 { 6285 case '|': 6286 return Operator.Union; 6287 case '-': 6288 return Operator.Difference; 6289 case '~': 6290 return Operator.SymDifference; 6291 case '&': 6292 return Operator.Intersection; 6293 default: 6294 assert(false); 6295 } 6296 } 6297 6298 L_CharTermLoop: 6299 for (;;) 6300 { 6301 final switch (state) 6302 { 6303 case State.Start: 6304 switch (front) 6305 { 6306 case '|': 6307 case '-': 6308 case '~': 6309 case '&': 6310 state = State.PotentialTwinSymbolOperator; 6311 last = front; 6312 break; 6313 case '[': 6314 op = Operator.Union; 6315 goto case; 6316 case ']': 6317 break L_CharTermLoop; 6318 case '\\': 6319 state = State.Escape; 6320 break; 6321 default: 6322 state = State.Char; 6323 last = front; 6324 } 6325 break; 6326 case State.Char: 6327 // xxx last front xxx 6328 switch (front) 6329 { 6330 case '|': 6331 case '~': 6332 case '&': 6333 // then last is treated as normal char and added as implicit union 6334 state = State.PotentialTwinSymbolOperator; 6335 addWithFlags(set, last); 6336 last = front; 6337 break; 6338 case '-': // still need more info 6339 state = State.CharDash; 6340 break; 6341 case '\\': 6342 set |= last; 6343 state = State.Escape; 6344 break; 6345 case '[': 6346 op = Operator.Union; 6347 goto case; 6348 case ']': 6349 addWithFlags(set, last); 6350 break L_CharTermLoop; 6351 default: 6352 state = State.Char; 6353 addWithFlags(set, last); 6354 last = front; 6355 } 6356 break; 6357 case State.PotentialTwinSymbolOperator: 6358 // xxx last front xxxx 6359 // where last = [|-&~] 6360 if (front == last) 6361 { 6362 op = twinSymbolOperator(last); 6363 popFront();//skip second twin char 6364 break L_CharTermLoop; 6365 } 6366 goto case State.Char; 6367 case State.Escape: 6368 // xxx \ front xxx 6369 switch (front) 6370 { 6371 case 'f': 6372 last = '\f'; 6373 state = State.Char; 6374 break; 6375 case 'n': 6376 last = '\n'; 6377 state = State.Char; 6378 break; 6379 case 'r': 6380 last = '\r'; 6381 state = State.Char; 6382 break; 6383 case 't': 6384 last = '\t'; 6385 state = State.Char; 6386 break; 6387 case 'v': 6388 last = '\v'; 6389 state = State.Char; 6390 break; 6391 case 'c': 6392 last = unicode.parseControlCode(this); 6393 state = State.Char; 6394 break; 6395 foreach (val; Escapables) 6396 { 6397 case val: 6398 } 6399 last = front; 6400 state = State.Char; 6401 break; 6402 case 'p': 6403 set.add(unicode.parsePropertySpec(this, false, casefold_)); 6404 state = State.Start; 6405 continue L_CharTermLoop; //next char already fetched 6406 case 'P': 6407 set.add(unicode.parsePropertySpec(this, true, casefold_)); 6408 state = State.Start; 6409 continue L_CharTermLoop; //next char already fetched 6410 case 'x': 6411 popFront(); 6412 last = parseUniHex(this, 2); 6413 state = State.Char; 6414 continue L_CharTermLoop; 6415 case 'u': 6416 popFront(); 6417 last = parseUniHex(this, 4); 6418 state = State.Char; 6419 continue L_CharTermLoop; 6420 case 'U': 6421 popFront(); 6422 last = parseUniHex(this, 8); 6423 state = State.Char; 6424 continue L_CharTermLoop; 6425 case 'd': 6426 set.add(unicode.Nd); 6427 state = State.Start; 6428 break; 6429 case 'D': 6430 set.add(unicode.Nd.inverted); 6431 state = State.Start; 6432 break; 6433 case 's': 6434 set.add(unicode.White_Space); 6435 state = State.Start; 6436 break; 6437 case 'S': 6438 set.add(unicode.White_Space.inverted); 6439 state = State.Start; 6440 break; 6441 case 'w': 6442 set.add(wordCharacter); 6443 state = State.Start; 6444 break; 6445 case 'W': 6446 set.add(wordCharacter.inverted); 6447 state = State.Start; 6448 break; 6449 default: 6450 if (front >= privateUseStart && front <= privateUseEnd) 6451 enforce(false, "no matching ']' found while parsing character class"); 6452 enforce(false, "invalid escape sequence"); 6453 } 6454 break; 6455 case State.CharDash: 6456 // xxx last - front xxx 6457 switch (front) 6458 { 6459 case '[': 6460 op = Operator.Union; 6461 goto case; 6462 case ']': 6463 //means dash is a single char not an interval specifier 6464 addWithFlags(set, last); 6465 addWithFlags(set, '-'); 6466 break L_CharTermLoop; 6467 case '-'://set Difference again 6468 addWithFlags(set, last); 6469 op = Operator.Difference; 6470 popFront();//skip '-' 6471 break L_CharTermLoop; 6472 case '\\': 6473 state = State.CharDashEscape; 6474 break; 6475 default: 6476 enforce(last <= front, "inverted range"); 6477 if (casefold_) 6478 { 6479 for (uint ch = last; ch <= front; ch++) 6480 addWithFlags(set, ch); 6481 } 6482 else 6483 set.add(last, front + 1); 6484 state = State.Start; 6485 } 6486 break; 6487 case State.CharDashEscape: 6488 //xxx last - \ front xxx 6489 uint end; 6490 switch (front) 6491 { 6492 case 'f': 6493 end = '\f'; 6494 break; 6495 case 'n': 6496 end = '\n'; 6497 break; 6498 case 'r': 6499 end = '\r'; 6500 break; 6501 case 't': 6502 end = '\t'; 6503 break; 6504 case 'v': 6505 end = '\v'; 6506 break; 6507 foreach (val; Escapables) 6508 { 6509 case val: 6510 } 6511 end = front; 6512 break; 6513 case 'c': 6514 end = unicode.parseControlCode(this); 6515 break; 6516 case 'x': 6517 popFront(); 6518 end = parseUniHex(this, 2); 6519 enforce(last <= end,"inverted range"); 6520 set.add(last, end + 1); 6521 state = State.Start; 6522 continue L_CharTermLoop; 6523 case 'u': 6524 popFront(); 6525 end = parseUniHex(this, 4); 6526 enforce(last <= end,"inverted range"); 6527 set.add(last, end + 1); 6528 state = State.Start; 6529 continue L_CharTermLoop; 6530 case 'U': 6531 popFront(); 6532 end = parseUniHex(this, 8); 6533 enforce(last <= end,"inverted range"); 6534 set.add(last, end + 1); 6535 state = State.Start; 6536 continue L_CharTermLoop; 6537 default: 6538 if (front >= privateUseStart && front <= privateUseEnd) 6539 enforce(false, "no matching ']' found while parsing character class"); 6540 enforce(false, "invalid escape sequence"); 6541 } 6542 // Lookahead to check if it's a \T 6543 // where T is sub-pattern terminator in multi-pattern scheme 6544 auto lookahead = range.save.drop(1); 6545 if (end == '\\' && !lookahead.empty) 6546 { 6547 if (lookahead.front >= privateUseStart && lookahead.front <= privateUseEnd) 6548 enforce(false, "no matching ']' found while parsing character class"); 6549 } 6550 enforce(last <= end,"inverted range"); 6551 set.add(last, end + 1); 6552 state = State.Start; 6553 break; 6554 } 6555 popFront(); 6556 enforce(!empty, "unexpected end of CodepointSet"); 6557 } 6558 return tuple(set, op); 6559 } 6560 6561 alias ValStack = Stack!(CodepointSet); 6562 alias OpStack = Stack!(Operator); 6563 6564 CodepointSet parseSet() 6565 { 6566 ValStack vstack; 6567 OpStack opstack; 6568 import std.functional : unaryFun; 6569 enforce(!empty, "unexpected end of input"); 6570 enforce(front == '[', "expected '[' at the start of unicode set"); 6571 // 6572 static bool apply(Operator op, ref ValStack stack) 6573 { 6574 switch (op) 6575 { 6576 case Operator.Negate: 6577 enforce(!stack.empty, "no operand for '^'"); 6578 stack.top = stack.top.inverted; 6579 break; 6580 case Operator.Union: 6581 auto s = stack.pop();//2nd operand 6582 enforce(!stack.empty, "no operand for '||'"); 6583 stack.top.add(s); 6584 break; 6585 case Operator.Difference: 6586 auto s = stack.pop();//2nd operand 6587 enforce(!stack.empty, "no operand for '--'"); 6588 stack.top.sub(s); 6589 break; 6590 case Operator.SymDifference: 6591 auto s = stack.pop();//2nd operand 6592 enforce(!stack.empty, "no operand for '~~'"); 6593 stack.top ~= s; 6594 break; 6595 case Operator.Intersection: 6596 auto s = stack.pop();//2nd operand 6597 enforce(!stack.empty, "no operand for '&&'"); 6598 stack.top.intersect(s); 6599 break; 6600 default: 6601 return false; 6602 } 6603 return true; 6604 } 6605 static bool unrollWhile(alias cond)(ref ValStack vstack, ref OpStack opstack) 6606 { 6607 while (cond(opstack.top)) 6608 { 6609 if (!apply(opstack.pop(),vstack)) 6610 return false;//syntax error 6611 if (opstack.empty) 6612 return false; 6613 } 6614 return true; 6615 } 6616 6617 L_CharsetLoop: 6618 do 6619 { 6620 switch (front) 6621 { 6622 case '[': 6623 opstack.push(Operator.Open); 6624 popFront(); 6625 enforce(!empty, "unexpected end of character class"); 6626 if (front == '^') 6627 { 6628 opstack.push(Operator.Negate); 6629 popFront(); 6630 enforce(!empty, "unexpected end of character class"); 6631 } 6632 else if (front == ']') // []...] is special cased 6633 { 6634 popFront(); 6635 enforce(!empty, "wrong character set"); 6636 auto pair = parseCharTerm(); 6637 pair[0].add(']', ']'+1); 6638 if (pair[1] != Operator.None) 6639 { 6640 if (opstack.top == Operator.Union) 6641 unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack); 6642 opstack.push(pair[1]); 6643 } 6644 vstack.push(pair[0]); 6645 } 6646 break; 6647 case ']': 6648 enforce(unrollWhile!(unaryFun!"a != a.Open")(vstack, opstack), 6649 "character class syntax error"); 6650 enforce(!opstack.empty, "unmatched ']'"); 6651 opstack.pop(); 6652 popFront(); 6653 if (opstack.empty) 6654 break L_CharsetLoop; 6655 auto pair = parseCharTerm(); 6656 if (!pair[0].empty)//not only operator e.g. -- or ~~ 6657 { 6658 vstack.top.add(pair[0]);//apply union 6659 } 6660 if (pair[1] != Operator.None) 6661 { 6662 if (opstack.top == Operator.Union) 6663 unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack); 6664 opstack.push(pair[1]); 6665 } 6666 break; 6667 // 6668 default://yet another pair of term(op)? 6669 auto pair = parseCharTerm(); 6670 if (pair[1] != Operator.None) 6671 { 6672 if (opstack.top == Operator.Union) 6673 unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack); 6674 opstack.push(pair[1]); 6675 } 6676 vstack.push(pair[0]); 6677 } 6678 6679 }while (!empty || !opstack.empty); 6680 while (!opstack.empty) 6681 apply(opstack.pop(),vstack); 6682 assert(vstack.length == 1); 6683 return vstack.top; 6684 } 6685 } 6686 6687 /** 6688 A single entry point to lookup Unicode $(CODEPOINT) sets by name or alias of 6689 a block, script or general category. 6690 6691 It uses well defined standard rules of property name lookup. 6692 This includes fuzzy matching of names, so that 6693 'White_Space', 'white-SpAce' and 'whitespace' are all considered equal 6694 and yield the same set of white space $(CHARACTERS). 6695 */ 6696 @safe public struct unicode 6697 { 6698 import std.exception : enforce; 6699 /** 6700 Performs the lookup of set of $(CODEPOINTS) 6701 with compile-time correctness checking. 6702 This short-cut version combines 3 searches: 6703 across blocks, scripts, and common binary properties. 6704 6705 Note that since scripts and blocks overlap the 6706 usual trick to disambiguate is used - to get a block use 6707 `unicode.InBlockName`, to search a script 6708 use `unicode.ScriptName`. 6709 6710 See_Also: $(LREF block), $(LREF script) 6711 and (not included in this search) $(LREF hangulSyllableType). 6712 */ 6713 6714 static @property auto opDispatch(string name)() pure 6715 { 6716 static if (findAny(name)) 6717 return loadAny(name); 6718 else 6719 static assert(false, "No unicode set by name "~name~" was found."); 6720 } 6721 6722 /// 6723 @safe unittest 6724 { 6725 import std.exception : collectException; 6726 auto ascii = unicode.ASCII; 6727 assert(ascii['A']); 6728 assert(ascii['~']); 6729 assert(!ascii['\u00e0']); 6730 // matching is case-insensitive 6731 assert(ascii == unicode.ascII); 6732 assert(!ascii['à']); 6733 // underscores, '-' and whitespace in names are ignored too 6734 auto latin = unicode.in_latin1_Supplement; 6735 assert(latin['à']); 6736 assert(!latin['$']); 6737 // BTW Latin 1 Supplement is a block, hence "In" prefix 6738 assert(latin == unicode("In Latin 1 Supplement")); 6739 // run-time look up throws if no such set is found 6740 assert(collectException(unicode("InCyrilliac"))); 6741 } 6742 6743 /** 6744 The same lookup across blocks, scripts, or binary properties, 6745 but performed at run-time. 6746 This version is provided for cases where `name` 6747 is not known beforehand; otherwise compile-time 6748 checked $(LREF opDispatch) is typically a better choice. 6749 6750 See the $(S_LINK Unicode properties, table of properties) for available 6751 sets. 6752 */ 6753 static auto opCall(C)(const scope C[] name) 6754 if (is(C : dchar)) 6755 { 6756 return loadAny(name); 6757 } 6758 6759 /** 6760 Narrows down the search for sets of $(CODEPOINTS) to all Unicode blocks. 6761 6762 Note: 6763 Here block names are unambiguous as no scripts are searched 6764 and thus to search use simply `unicode.block.BlockName` notation. 6765 6766 See $(S_LINK Unicode properties, table of properties) for available sets. 6767 See_Also: $(S_LINK Unicode properties, table of properties). 6768 */ 6769 struct block 6770 { 6771 import std.internal.unicode_tables : blocks; // generated file 6772 mixin SetSearcher!(blocks.tab, "block"); 6773 } 6774 6775 /// 6776 @safe unittest 6777 { 6778 // use .block for explicitness 6779 assert(unicode.block.Greek_and_Coptic == unicode.InGreek_and_Coptic); 6780 } 6781 6782 /** 6783 Narrows down the search for sets of $(CODEPOINTS) to all Unicode scripts. 6784 6785 See the $(S_LINK Unicode properties, table of properties) for available 6786 sets. 6787 */ 6788 struct script 6789 { 6790 import std.internal.unicode_tables : scripts; // generated file 6791 mixin SetSearcher!(scripts.tab, "script"); 6792 } 6793 6794 /// 6795 @safe unittest 6796 { 6797 auto arabicScript = unicode.script.arabic; 6798 auto arabicBlock = unicode.block.arabic; 6799 // there is an intersection between script and block 6800 assert(arabicBlock['']); 6801 assert(arabicScript['']); 6802 // but they are different 6803 assert(arabicBlock != arabicScript); 6804 assert(arabicBlock == unicode.inArabic); 6805 assert(arabicScript == unicode.arabic); 6806 } 6807 6808 /** 6809 Fetch a set of $(CODEPOINTS) that have the given hangul syllable type. 6810 6811 Other non-binary properties (once supported) follow the same 6812 notation - `unicode.propertyName.propertyValue` for compile-time 6813 checked access and `unicode.propertyName(propertyValue)` 6814 for run-time checked one. 6815 6816 See the $(S_LINK Unicode properties, table of properties) for available 6817 sets. 6818 */ 6819 struct hangulSyllableType 6820 { 6821 import std.internal.unicode_tables : hangul; // generated file 6822 mixin SetSearcher!(hangul.tab, "hangul syllable type"); 6823 } 6824 6825 /// 6826 @safe unittest 6827 { 6828 // L here is syllable type not Letter as in unicode.L short-cut 6829 auto leadingVowel = unicode.hangulSyllableType("L"); 6830 // check that some leading vowels are present 6831 foreach (vowel; '\u1110'..'\u115F') 6832 assert(leadingVowel[vowel]); 6833 assert(leadingVowel == unicode.hangulSyllableType.L); 6834 } 6835 6836 //parse control code of form \cXXX, c assumed to be the current symbol 6837 static package(std) dchar parseControlCode(Parser)(ref Parser p) 6838 { 6839 with(p) 6840 { 6841 popFront(); 6842 enforce(!empty, "Unfinished escape sequence"); 6843 enforce(('a' <= front && front <= 'z') 6844 || ('A' <= front && front <= 'Z'), 6845 "Only letters are allowed after \\c"); 6846 return front & 0x1f; 6847 } 6848 } 6849 6850 //parse and return a CodepointSet for \p{...Property...} and \P{...Property..}, 6851 //\ - assumed to be processed, p - is current 6852 static package(std) CodepointSet parsePropertySpec(Range)(ref Range p, 6853 bool negated, bool casefold) 6854 { 6855 static import std.ascii; 6856 with(p) 6857 { 6858 enum MAX_PROPERTY = 128; 6859 char[MAX_PROPERTY] result; 6860 uint k = 0; 6861 popFront(); 6862 enforce(!empty, "eof parsing unicode property spec"); 6863 if (front == '{') 6864 { 6865 popFront(); 6866 while (k < MAX_PROPERTY && !empty && front !='}' 6867 && front !=':') 6868 { 6869 if (front != '-' && front != ' ' && front != '_') 6870 result[k++] = cast(char) std.ascii.toLower(front); 6871 popFront(); 6872 } 6873 enforce(k != MAX_PROPERTY, "invalid property name"); 6874 enforce(front == '}', "} expected "); 6875 } 6876 else 6877 {//single char properties e.g.: \pL, \pN ... 6878 enforce(front < 0x80, "invalid property name"); 6879 result[k++] = cast(char) front; 6880 } 6881 auto s = getUnicodeSet(result[0 .. k], negated, casefold); 6882 enforce(!s.empty, "unrecognized unicode property spec"); 6883 popFront(); 6884 return s; 6885 } 6886 } 6887 6888 /** 6889 Parse unicode codepoint set from given `range` using standard regex 6890 syntax '[...]'. The range is advanced skiping over regex set definition. 6891 `casefold` parameter determines if the set should be casefolded - that is 6892 include both lower and upper case versions for any letters in the set. 6893 */ 6894 static CodepointSet parseSet(Range)(ref Range range, bool casefold=false) 6895 if (isInputRange!Range && is(ElementType!Range : dchar)) 6896 { 6897 auto usParser = UnicodeSetParser!Range(range, casefold); 6898 auto set = usParser.parseSet(); 6899 range = usParser.range; 6900 return set; 6901 } 6902 6903 /// 6904 @safe unittest 6905 { 6906 import std.uni : unicode; 6907 string pat = "[a-zA-Z0-9]hello"; 6908 auto set = unicode.parseSet(pat); 6909 // check some of the codepoints 6910 assert(set['a'] && set['A'] && set['9']); 6911 assert(pat == "hello"); 6912 } 6913 6914 private: 6915 alias ucmp = comparePropertyName; 6916 6917 static bool findAny(string name) 6918 { 6919 import std.internal.unicode_tables : blocks, scripts, uniProps; // generated file 6920 return isPrettyPropertyName(name) 6921 || findSetName!(uniProps.tab)(name) || findSetName!(scripts.tab)(name) 6922 || (ucmp(name[0 .. 2],"In") == 0 && findSetName!(blocks.tab)(name[2..$])); 6923 } 6924 6925 static auto loadAny(Set=CodepointSet, C)(const scope C[] name) pure 6926 { 6927 import std.conv : to; 6928 import std.internal.unicode_tables : blocks, scripts; // generated file 6929 Set set; 6930 immutable loaded = loadProperty(name, set) || loadUnicodeSet!(scripts.tab)(name, set) 6931 || (name.length > 2 && ucmp(name[0 .. 2],"In") == 0 6932 && loadUnicodeSet!(blocks.tab)(name[2..$], set)); 6933 if (loaded) 6934 return set; 6935 throw new Exception("No unicode set by name "~name.to!string()~" was found."); 6936 } 6937 6938 // FIXME: re-disable once the compiler is fixed 6939 // Disabled to prevent the mistake of creating instances of this pseudo-struct. 6940 //@disable ~this(); 6941 } 6942 6943 @safe unittest 6944 { 6945 import std.internal.unicode_tables : blocks, uniProps; // generated file 6946 assert(unicode("InHebrew") == asSet(blocks.Hebrew)); 6947 assert(unicode("separator") == (asSet(uniProps.Zs) | asSet(uniProps.Zl) | asSet(uniProps.Zp))); 6948 assert(unicode("In-Kharoshthi") == asSet(blocks.Kharoshthi)); 6949 } 6950 6951 enum EMPTY_CASE_TRIE = ushort.max;// from what gen_uni uses internally 6952 6953 // control - '\r' 6954 enum controlSwitch = ` 6955 case '\u0000':..case '\u0008':case '\u000E':..case '\u001F':case '\u007F':.. 6956 case '\u0084':case '\u0086':..case '\u009F': case '\u0009':..case '\u000C': case '\u0085': 6957 `; 6958 // TODO: redo the most of hangul stuff algorithmically in case of Graphemes too 6959 // kill unrolled switches 6960 6961 private static bool isRegionalIndicator(dchar ch) @safe pure @nogc nothrow 6962 { 6963 return ch >= '\U0001F1E6' && ch <= '\U0001F1FF'; 6964 } 6965 6966 template genericDecodeGrapheme(bool getValue) 6967 { 6968 alias graphemeExtend = graphemeExtendTrie; 6969 alias spacingMark = mcTrie; 6970 static if (getValue) 6971 alias Value = Grapheme; 6972 else 6973 alias Value = void; 6974 6975 Value genericDecodeGrapheme(Input)(ref Input range) 6976 { 6977 import std.internal.unicode_tables : isHangL, isHangT, isHangV; // generated file 6978 enum GraphemeState { 6979 Start, 6980 CR, 6981 RI, 6982 L, 6983 V, 6984 LVT 6985 } 6986 static if (getValue) 6987 Grapheme grapheme; 6988 auto state = GraphemeState.Start; 6989 enum eat = q{ 6990 static if (getValue) 6991 grapheme ~= ch; 6992 range.popFront(); 6993 }; 6994 6995 dchar ch; 6996 assert(!range.empty, "Attempting to decode grapheme from an empty " ~ Input.stringof); 6997 while (!range.empty) 6998 { 6999 ch = range.front; 7000 final switch (state) with(GraphemeState) 7001 { 7002 case Start: 7003 mixin(eat); 7004 if (ch == '\r') 7005 state = CR; 7006 else if (isRegionalIndicator(ch)) 7007 state = RI; 7008 else if (isHangL(ch)) 7009 state = L; 7010 else if (hangLV[ch] || isHangV(ch)) 7011 state = V; 7012 else if (hangLVT[ch]) 7013 state = LVT; 7014 else if (isHangT(ch)) 7015 state = LVT; 7016 else 7017 { 7018 switch (ch) 7019 { 7020 mixin(controlSwitch); 7021 goto L_End; 7022 default: 7023 goto L_End_Extend; 7024 } 7025 } 7026 break; 7027 case CR: 7028 if (ch == '\n') 7029 mixin(eat); 7030 goto L_End_Extend; 7031 case RI: 7032 if (isRegionalIndicator(ch)) 7033 mixin(eat); 7034 else 7035 goto L_End_Extend; 7036 break; 7037 case L: 7038 if (isHangL(ch)) 7039 mixin(eat); 7040 else if (isHangV(ch) || hangLV[ch]) 7041 { 7042 state = V; 7043 mixin(eat); 7044 } 7045 else if (hangLVT[ch]) 7046 { 7047 state = LVT; 7048 mixin(eat); 7049 } 7050 else 7051 goto L_End_Extend; 7052 break; 7053 case V: 7054 if (isHangV(ch)) 7055 mixin(eat); 7056 else if (isHangT(ch)) 7057 { 7058 state = LVT; 7059 mixin(eat); 7060 } 7061 else 7062 goto L_End_Extend; 7063 break; 7064 case LVT: 7065 if (isHangT(ch)) 7066 { 7067 mixin(eat); 7068 } 7069 else 7070 goto L_End_Extend; 7071 break; 7072 } 7073 } 7074 L_End_Extend: 7075 while (!range.empty) 7076 { 7077 ch = range.front; 7078 // extend & spacing marks 7079 if (!graphemeExtend[ch] && !spacingMark[ch]) 7080 break; 7081 mixin(eat); 7082 } 7083 L_End: 7084 static if (getValue) 7085 return grapheme; 7086 } 7087 7088 } 7089 7090 public: // Public API continues 7091 7092 /++ 7093 Computes the length of grapheme cluster starting at `index`. 7094 Both the resulting length and the `index` are measured 7095 in $(S_LINK Code unit, code units). 7096 7097 Params: 7098 C = type that is implicitly convertible to `dchars` 7099 input = array of grapheme clusters 7100 index = starting index into `input[]` 7101 7102 Returns: 7103 length of grapheme cluster 7104 +/ 7105 size_t graphemeStride(C)(const scope C[] input, size_t index) @safe pure 7106 if (is(C : dchar)) 7107 { 7108 auto src = input[index..$]; 7109 auto n = src.length; 7110 genericDecodeGrapheme!(false)(src); 7111 return n - src.length; 7112 } 7113 7114 /// 7115 @safe unittest 7116 { 7117 assert(graphemeStride(" ", 1) == 1); 7118 // A + combing ring above 7119 string city = "A\u030Arhus"; 7120 size_t first = graphemeStride(city, 0); 7121 assert(first == 3); //\u030A has 2 UTF-8 code units 7122 assert(city[0 .. first] == "A\u030A"); 7123 assert(city[first..$] == "rhus"); 7124 } 7125 7126 @safe unittest 7127 { 7128 // Ensure that graphemeStride is usable from CTFE. 7129 enum c1 = graphemeStride("A", 0); 7130 static assert(c1 == 1); 7131 7132 enum c2 = graphemeStride("A\u0301", 0); 7133 static assert(c2 == 3); // \u0301 has 2 UTF-8 code units 7134 } 7135 7136 /++ 7137 Reads one full grapheme cluster from an 7138 $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of dchar `inp`. 7139 7140 For examples see the $(LREF Grapheme) below. 7141 7142 Note: 7143 This function modifies `inp` and thus `inp` 7144 must be an L-value. 7145 +/ 7146 Grapheme decodeGrapheme(Input)(ref Input inp) 7147 if (isInputRange!Input && is(immutable ElementType!Input == immutable dchar)) 7148 { 7149 return genericDecodeGrapheme!true(inp); 7150 } 7151 7152 @system unittest 7153 { 7154 import std.algorithm.comparison : equal; 7155 7156 Grapheme gr; 7157 string s = " \u0020\u0308 "; 7158 gr = decodeGrapheme(s); 7159 assert(gr.length == 1 && gr[0] == ' '); 7160 gr = decodeGrapheme(s); 7161 assert(gr.length == 2 && equal(gr[0 .. 2], " \u0308")); 7162 s = "\u0300\u0308\u1100"; 7163 assert(equal(decodeGrapheme(s)[], "\u0300\u0308")); 7164 assert(equal(decodeGrapheme(s)[], "\u1100")); 7165 s = "\u11A8\u0308\uAC01"; 7166 assert(equal(decodeGrapheme(s)[], "\u11A8\u0308")); 7167 assert(equal(decodeGrapheme(s)[], "\uAC01")); 7168 } 7169 7170 /++ 7171 $(P Iterate a string by $(LREF Grapheme).) 7172 7173 $(P Useful for doing string manipulation that needs to be aware 7174 of graphemes.) 7175 7176 See_Also: 7177 $(LREF byCodePoint) 7178 +/ 7179 auto byGrapheme(Range)(Range range) 7180 if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar)) 7181 { 7182 // TODO: Bidirectional access 7183 static struct Result(R) 7184 { 7185 private R _range; 7186 private Grapheme _front; 7187 7188 bool empty() @property 7189 { 7190 return _front.length == 0; 7191 } 7192 7193 Grapheme front() @property 7194 { 7195 return _front; 7196 } 7197 7198 void popFront() 7199 { 7200 _front = _range.empty ? Grapheme.init : _range.decodeGrapheme(); 7201 } 7202 7203 static if (isForwardRange!R) 7204 { 7205 Result save() @property 7206 { 7207 return Result(_range.save, _front); 7208 } 7209 } 7210 } 7211 7212 auto result = Result!(Range)(range); 7213 result.popFront(); 7214 return result; 7215 } 7216 7217 /// 7218 @safe unittest 7219 { 7220 import std.algorithm.comparison : equal; 7221 import std.range.primitives : walkLength; 7222 import std.range : take, drop; 7223 auto text = "noe\u0308l"; // noël using e + combining diaeresis 7224 assert(text.walkLength == 5); // 5 code points 7225 7226 auto gText = text.byGrapheme; 7227 assert(gText.walkLength == 4); // 4 graphemes 7228 7229 assert(gText.take(3).equal("noe\u0308".byGrapheme)); 7230 assert(gText.drop(3).equal("l".byGrapheme)); 7231 } 7232 7233 // For testing non-forward-range input ranges 7234 version (StdUnittest) 7235 private static struct InputRangeString 7236 { 7237 private string s; 7238 7239 bool empty() @property { return s.empty; } 7240 dchar front() @property { return s.front; } 7241 void popFront() { s.popFront(); } 7242 } 7243 7244 @system unittest 7245 { 7246 import std.algorithm.comparison : equal; 7247 import std.array : array; 7248 import std.range : retro; 7249 import std.range.primitives : walkLength; 7250 assert("".byGrapheme.walkLength == 0); 7251 7252 auto reverse = "le\u0308on"; 7253 assert(reverse.walkLength == 5); 7254 7255 auto gReverse = reverse.byGrapheme; 7256 assert(gReverse.walkLength == 4); 7257 7258 static foreach (text; AliasSeq!("noe\u0308l"c, "noe\u0308l"w, "noe\u0308l"d)) 7259 {{ 7260 assert(text.walkLength == 5); 7261 static assert(isForwardRange!(typeof(text))); 7262 7263 auto gText = text.byGrapheme; 7264 static assert(isForwardRange!(typeof(gText))); 7265 assert(gText.walkLength == 4); 7266 assert(gText.array.retro.equal(gReverse)); 7267 }} 7268 7269 auto nonForwardRange = InputRangeString("noe\u0308l").byGrapheme; 7270 static assert(!isForwardRange!(typeof(nonForwardRange))); 7271 assert(nonForwardRange.walkLength == 4); 7272 } 7273 7274 /++ 7275 $(P Lazily transform a range of $(LREF Grapheme)s to a range of code points.) 7276 7277 $(P Useful for converting the result to a string after doing operations 7278 on graphemes.) 7279 7280 $(P If passed in a range of code points, returns a range with equivalent capabilities.) 7281 +/ 7282 auto byCodePoint(Range)(Range range) 7283 if (isInputRange!Range && is(immutable ElementType!Range == immutable Grapheme)) 7284 { 7285 // TODO: Propagate bidirectional access 7286 static struct Result 7287 { 7288 private Range _range; 7289 private size_t i = 0; 7290 7291 bool empty() @property 7292 { 7293 return _range.empty; 7294 } 7295 7296 dchar front() @property 7297 { 7298 return _range.front[i]; 7299 } 7300 7301 void popFront() 7302 { 7303 ++i; 7304 7305 if (i >= _range.front.length) 7306 { 7307 _range.popFront(); 7308 i = 0; 7309 } 7310 } 7311 7312 static if (isForwardRange!Range) 7313 { 7314 Result save() @property 7315 { 7316 return Result(_range.save, i); 7317 } 7318 } 7319 } 7320 7321 return Result(range); 7322 } 7323 7324 /// Ditto 7325 auto byCodePoint(Range)(Range range) 7326 if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar)) 7327 { 7328 import std.range.primitives : isBidirectionalRange, popBack; 7329 import std.traits : isNarrowString; 7330 static if (isNarrowString!Range) 7331 { 7332 static struct Result 7333 { 7334 private Range _range; 7335 @property bool empty() { return _range.empty; } 7336 @property dchar front(){ return _range.front; } 7337 void popFront(){ _range.popFront; } 7338 @property auto save() { return Result(_range.save); } 7339 @property dchar back(){ return _range.back; } 7340 void popBack(){ _range.popBack; } 7341 } 7342 static assert(isBidirectionalRange!(Result)); 7343 return Result(range); 7344 } 7345 else 7346 return range; 7347 } 7348 7349 /// 7350 @safe unittest 7351 { 7352 import std.array : array; 7353 import std.conv : text; 7354 import std.range : retro; 7355 7356 string s = "noe\u0308l"; // noël 7357 7358 // reverse it and convert the result to a string 7359 string reverse = s.byGrapheme 7360 .array 7361 .retro 7362 .byCodePoint 7363 .text; 7364 7365 assert(reverse == "le\u0308on"); // lëon 7366 } 7367 7368 @system unittest 7369 { 7370 import std.algorithm.comparison : equal; 7371 import std.range.primitives : walkLength; 7372 import std.range : retro; 7373 assert("".byGrapheme.byCodePoint.equal("")); 7374 7375 string text = "noe\u0308l"; 7376 static assert(!__traits(compiles, "noe\u0308l".byCodePoint.length)); 7377 7378 auto gText = InputRangeString(text).byGrapheme; 7379 static assert(!isForwardRange!(typeof(gText))); 7380 7381 auto cpText = gText.byCodePoint; 7382 static assert(!isForwardRange!(typeof(cpText))); 7383 7384 assert(cpText.walkLength == text.walkLength); 7385 7386 auto plainCp = text.byCodePoint; 7387 static assert(isForwardRange!(typeof(plainCp))); 7388 assert(equal(plainCp, text)); 7389 assert(equal(retro(plainCp.save), retro(text.save))); 7390 // Check that we still have length for dstring 7391 assert("абвгд"d.byCodePoint.length == 5); 7392 } 7393 7394 /++ 7395 $(P A structure designed to effectively pack $(CHARACTERS) 7396 of a $(CLUSTER). 7397 ) 7398 7399 $(P `Grapheme` has value semantics so 2 copies of a `Grapheme` 7400 always refer to distinct objects. In most actual scenarios a `Grapheme` 7401 fits on the stack and avoids memory allocation overhead for all but quite 7402 long clusters. 7403 ) 7404 7405 See_Also: $(LREF decodeGrapheme), $(LREF graphemeStride) 7406 +/ 7407 @safe struct Grapheme 7408 { 7409 import std.exception : enforce; 7410 import std.traits : isDynamicArray; 7411 7412 public: 7413 /// Ctor 7414 this(C)(const scope C[] chars...) 7415 if (is(C : dchar)) 7416 { 7417 this ~= chars; 7418 } 7419 7420 ///ditto 7421 this(Input)(Input seq) 7422 if (!isDynamicArray!Input 7423 && isInputRange!Input && is(ElementType!Input : dchar)) 7424 { 7425 this ~= seq; 7426 } 7427 7428 /// Gets a $(CODEPOINT) at the given index in this cluster. 7429 dchar opIndex(size_t index) const @nogc nothrow pure @trusted 7430 { 7431 assert(index < length); 7432 return read24(isBig ? ptr_ : small_.ptr, index); 7433 } 7434 7435 /++ 7436 Writes a $(CODEPOINT) `ch` at given index in this cluster. 7437 7438 Warning: 7439 Use of this facility may invalidate grapheme cluster, 7440 see also $(LREF Grapheme.valid). 7441 +/ 7442 void opIndexAssign(dchar ch, size_t index) @nogc nothrow pure @trusted 7443 { 7444 assert(index < length); 7445 write24(isBig ? ptr_ : small_.ptr, ch, index); 7446 } 7447 7448 /// 7449 @safe unittest 7450 { 7451 auto g = Grapheme("A\u0302"); 7452 assert(g[0] == 'A'); 7453 assert(g.valid); 7454 g[1] = '~'; // ASCII tilda is not a combining mark 7455 assert(g[1] == '~'); 7456 assert(!g.valid); 7457 } 7458 7459 /++ 7460 Random-access range over Grapheme's $(CHARACTERS). 7461 7462 Warning: Invalidates when this Grapheme leaves the scope, 7463 attempts to use it then would lead to memory corruption. 7464 +/ 7465 SliceOverIndexed!Grapheme opSlice(size_t a, size_t b) @nogc nothrow pure return 7466 { 7467 return sliceOverIndexed(a, b, &this); 7468 } 7469 7470 /// ditto 7471 SliceOverIndexed!Grapheme opSlice() @nogc nothrow pure return 7472 { 7473 return sliceOverIndexed(0, length, &this); 7474 } 7475 7476 /// Grapheme cluster length in $(CODEPOINTS). 7477 @property size_t length() const @nogc nothrow pure 7478 { 7479 return isBig ? len_ : slen_ & 0x7F; 7480 } 7481 7482 /++ 7483 Append $(CHARACTER) `ch` to this grapheme. 7484 Warning: 7485 Use of this facility may invalidate grapheme cluster, 7486 see also `valid`. 7487 7488 See_Also: $(LREF Grapheme.valid) 7489 +/ 7490 ref opOpAssign(string op)(dchar ch) @trusted 7491 { 7492 static if (op == "~") 7493 { 7494 import std.internal.memory : enforceRealloc; 7495 if (!isBig) 7496 { 7497 if (slen_ == small_cap) 7498 convertToBig();// & fallthrough to "big" branch 7499 else 7500 { 7501 write24(small_.ptr, ch, smallLength); 7502 slen_++; 7503 return this; 7504 } 7505 } 7506 7507 assert(isBig); 7508 if (len_ == cap_) 7509 { 7510 import core.checkedint : addu, mulu; 7511 bool overflow; 7512 cap_ = addu(cap_, grow, overflow); 7513 auto nelems = mulu(3, addu(cap_, 1, overflow), overflow); 7514 if (overflow) assert(0); 7515 ptr_ = cast(ubyte*) enforceRealloc(ptr_, nelems); 7516 } 7517 write24(ptr_, ch, len_++); 7518 return this; 7519 } 7520 else 7521 static assert(false, "No operation "~op~" defined for Grapheme"); 7522 } 7523 7524 /// 7525 @system unittest 7526 { 7527 import std.algorithm.comparison : equal; 7528 auto g = Grapheme("A"); 7529 assert(g.valid); 7530 g ~= '\u0301'; 7531 assert(g[].equal("A\u0301")); 7532 assert(g.valid); 7533 g ~= "B"; 7534 // not a valid grapheme cluster anymore 7535 assert(!g.valid); 7536 // still could be useful though 7537 assert(g[].equal("A\u0301B")); 7538 } 7539 7540 /// Append all $(CHARACTERS) from the input range `inp` to this Grapheme. 7541 ref opOpAssign(string op, Input)(scope Input inp) 7542 if (isInputRange!Input && is(ElementType!Input : dchar)) 7543 { 7544 static if (op == "~") 7545 { 7546 foreach (dchar ch; inp) 7547 this ~= ch; 7548 return this; 7549 } 7550 else 7551 static assert(false, "No operation "~op~" defined for Grapheme"); 7552 } 7553 7554 /++ 7555 True if this object contains valid extended grapheme cluster. 7556 Decoding primitives of this module always return a valid `Grapheme`. 7557 7558 Appending to and direct manipulation of grapheme's $(CHARACTERS) may 7559 render it no longer valid. Certain applications may chose to use 7560 Grapheme as a "small string" of any $(CODEPOINTS) and ignore this property 7561 entirely. 7562 +/ 7563 @property bool valid()() /*const*/ 7564 { 7565 auto r = this[]; 7566 genericDecodeGrapheme!false(r); 7567 return r.length == 0; 7568 } 7569 7570 this(this) @nogc nothrow pure @trusted 7571 { 7572 import std.internal.memory : enforceMalloc; 7573 if (isBig) 7574 {// dup it 7575 import core.checkedint : addu, mulu; 7576 bool overflow; 7577 auto raw_cap = mulu(3, addu(cap_, 1, overflow), overflow); 7578 if (overflow) assert(0); 7579 7580 auto p = cast(ubyte*) enforceMalloc(raw_cap); 7581 p[0 .. raw_cap] = ptr_[0 .. raw_cap]; 7582 ptr_ = p; 7583 } 7584 } 7585 7586 ~this() @nogc nothrow pure @trusted 7587 { 7588 import core.memory : pureFree; 7589 if (isBig) 7590 { 7591 pureFree(ptr_); 7592 } 7593 } 7594 7595 7596 private: 7597 enum small_bytes = ((ubyte*).sizeof+3*size_t.sizeof-1); 7598 // "out of the blue" grow rate, needs testing 7599 // (though graphemes are typically small < 9) 7600 enum grow = 20; 7601 enum small_cap = small_bytes/3; 7602 enum small_flag = 0x80, small_mask = 0x7F; 7603 // 16 bytes in 32bits, should be enough for the majority of cases 7604 union 7605 { 7606 struct 7607 { 7608 ubyte* ptr_; 7609 size_t cap_; 7610 size_t len_; 7611 size_t padding_; 7612 } 7613 struct 7614 { 7615 ubyte[small_bytes] small_; 7616 ubyte slen_; 7617 } 7618 } 7619 7620 void convertToBig() @nogc nothrow pure @trusted 7621 { 7622 import std.internal.memory : enforceMalloc; 7623 static assert(grow.max / 3 - 1 >= grow); 7624 enum nbytes = 3 * (grow + 1); 7625 size_t k = smallLength; 7626 ubyte* p = cast(ubyte*) enforceMalloc(nbytes); 7627 for (int i=0; i<k; i++) 7628 write24(p, read24(small_.ptr, i), i); 7629 // now we can overwrite small array data 7630 ptr_ = p; 7631 len_ = slen_; 7632 assert(grow > len_); 7633 cap_ = grow; 7634 setBig(); 7635 } 7636 7637 void setBig() @nogc nothrow pure { slen_ |= small_flag; } 7638 7639 @property size_t smallLength() const @nogc nothrow pure 7640 { 7641 return slen_ & small_mask; 7642 } 7643 @property ubyte isBig() const @nogc nothrow pure 7644 { 7645 return slen_ & small_flag; 7646 } 7647 } 7648 7649 static assert(Grapheme.sizeof == size_t.sizeof*4); 7650 7651 7652 @system pure /*nothrow @nogc*/ unittest // TODO: string .front is GC and throw 7653 { 7654 import std.algorithm.comparison : equal; 7655 Grapheme[3] data = [Grapheme("Ю"), Grapheme("У"), Grapheme("З")]; 7656 assert(byGrapheme("ЮУЗ").equal(data[])); 7657 } 7658 7659 /// 7660 @system unittest 7661 { 7662 import std.algorithm.comparison : equal; 7663 import std.algorithm.iteration : filter; 7664 import std.range : isRandomAccessRange; 7665 7666 string bold = "ku\u0308hn"; 7667 7668 // note that decodeGrapheme takes parameter by ref 7669 auto first = decodeGrapheme(bold); 7670 7671 assert(first.length == 1); 7672 assert(first[0] == 'k'); 7673 7674 // the next grapheme is 2 characters long 7675 auto wideOne = decodeGrapheme(bold); 7676 // slicing a grapheme yields a random-access range of dchar 7677 assert(wideOne[].equal("u\u0308")); 7678 assert(wideOne.length == 2); 7679 static assert(isRandomAccessRange!(typeof(wideOne[]))); 7680 7681 // all of the usual range manipulation is possible 7682 assert(wideOne[].filter!isMark().equal("\u0308")); 7683 7684 auto g = Grapheme("A"); 7685 assert(g.valid); 7686 g ~= '\u0301'; 7687 assert(g[].equal("A\u0301")); 7688 assert(g.valid); 7689 g ~= "B"; 7690 // not a valid grapheme cluster anymore 7691 assert(!g.valid); 7692 // still could be useful though 7693 assert(g[].equal("A\u0301B")); 7694 } 7695 7696 @safe unittest 7697 { 7698 auto g = Grapheme("A\u0302"); 7699 assert(g[0] == 'A'); 7700 assert(g.valid); 7701 g[1] = '~'; // ASCII tilda is not a combining mark 7702 assert(g[1] == '~'); 7703 assert(!g.valid); 7704 } 7705 7706 @system unittest 7707 { 7708 import std.algorithm.comparison : equal; 7709 import std.algorithm.iteration : map; 7710 import std.conv : text; 7711 import std.range : iota; 7712 7713 // not valid clusters (but it just a test) 7714 auto g = Grapheme('a', 'b', 'c', 'd', 'e'); 7715 assert(g[0] == 'a'); 7716 assert(g[1] == 'b'); 7717 assert(g[2] == 'c'); 7718 assert(g[3] == 'd'); 7719 assert(g[4] == 'e'); 7720 g[3] = 'Й'; 7721 assert(g[2] == 'c'); 7722 assert(g[3] == 'Й', text(g[3], " vs ", 'Й')); 7723 assert(g[4] == 'e'); 7724 assert(!g.valid); 7725 7726 g ~= 'ц'; 7727 g ~= '~'; 7728 assert(g[0] == 'a'); 7729 assert(g[1] == 'b'); 7730 assert(g[2] == 'c'); 7731 assert(g[3] == 'Й'); 7732 assert(g[4] == 'e'); 7733 assert(g[5] == 'ц'); 7734 assert(g[6] == '~'); 7735 assert(!g.valid); 7736 7737 Grapheme copy = g; 7738 copy[0] = 'X'; 7739 copy[1] = '-'; 7740 assert(g[0] == 'a' && copy[0] == 'X'); 7741 assert(g[1] == 'b' && copy[1] == '-'); 7742 assert(equal(g[2 .. g.length], copy[2 .. copy.length])); 7743 copy = Grapheme("АБВГДЕЁЖЗИКЛМ"); 7744 assert(equal(copy[0 .. 8], "АБВГДЕЁЖ"), text(copy[0 .. 8])); 7745 copy ~= "xyz"; 7746 assert(equal(copy[13 .. 15], "xy"), text(copy[13 .. 15])); 7747 assert(!copy.valid); 7748 7749 Grapheme h; 7750 foreach (dchar v; iota(cast(int)'A', cast(int)'Z'+1).map!"cast(dchar)a"()) 7751 h ~= v; 7752 assert(equal(h[], iota(cast(int)'A', cast(int)'Z'+1))); 7753 } 7754 7755 /++ 7756 $(P Does basic case-insensitive comparison of `r1` and `r2`. 7757 This function uses simpler comparison rule thus achieving better performance 7758 than $(LREF icmp). However keep in mind the warning below.) 7759 7760 Params: 7761 r1 = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of characters 7762 r2 = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of characters 7763 7764 Returns: 7765 An `int` that is 0 if the strings match, 7766 <0 if `r1` is lexicographically "less" than `r2`, 7767 >0 if `r1` is lexicographically "greater" than `r2` 7768 7769 Warning: 7770 This function only handles 1:1 $(CODEPOINT) mapping 7771 and thus is not sufficient for certain alphabets 7772 like German, Greek and few others. 7773 7774 See_Also: 7775 $(LREF icmp) 7776 $(REF cmp, std,algorithm,comparison) 7777 +/ 7778 int sicmp(S1, S2)(scope S1 r1, scope S2 r2) 7779 if (isInputRange!S1 && isSomeChar!(ElementEncodingType!S1) 7780 && isInputRange!S2 && isSomeChar!(ElementEncodingType!S2)) 7781 { 7782 import std.internal.unicode_tables : sTable = simpleCaseTable; // generated file 7783 import std.range.primitives : isInfinite; 7784 import std.utf : decodeFront; 7785 import std.traits : isDynamicArray; 7786 import std.typecons : Yes; 7787 static import std.ascii; 7788 7789 static if ((isDynamicArray!S1 || isRandomAccessRange!S1) 7790 && (isDynamicArray!S2 || isRandomAccessRange!S2) 7791 && !(isInfinite!S1 && isInfinite!S2) 7792 && __traits(compiles, 7793 { 7794 size_t s = size_t.sizeof / 2; 7795 r1 = r1[s .. $]; 7796 r2 = r2[s .. $]; 7797 })) 7798 {{ 7799 // ASCII optimization for dynamic arrays & similar. 7800 size_t i = 0; 7801 static if (isInfinite!S1) 7802 immutable end = r2.length; 7803 else static if (isInfinite!S2) 7804 immutable end = r1.length; 7805 else 7806 immutable end = r1.length > r2.length ? r2.length : r1.length; 7807 for (; i < end; ++i) 7808 { 7809 auto lhs = r1[i]; 7810 auto rhs = r2[i]; 7811 if ((lhs | rhs) >= 0x80) goto NonAsciiPath; 7812 if (lhs == rhs) continue; 7813 auto lowDiff = std.ascii.toLower(lhs) - std.ascii.toLower(rhs); 7814 if (lowDiff) return lowDiff; 7815 } 7816 static if (isInfinite!S1) 7817 return 1; 7818 else static if (isInfinite!S2) 7819 return -1; 7820 else 7821 return (r1.length > r2.length) - (r2.length > r1.length); 7822 7823 NonAsciiPath: 7824 r1 = r1[i .. $]; 7825 r2 = r2[i .. $]; 7826 // Fall through to standard case. 7827 }} 7828 7829 while (!r1.empty) 7830 { 7831 immutable lhs = decodeFront!(Yes.useReplacementDchar)(r1); 7832 if (r2.empty) 7833 return 1; 7834 immutable rhs = decodeFront!(Yes.useReplacementDchar)(r2); 7835 int diff = lhs - rhs; 7836 if (!diff) 7837 continue; 7838 if ((lhs | rhs) < 0x80) 7839 { 7840 immutable d = std.ascii.toLower(lhs) - std.ascii.toLower(rhs); 7841 if (!d) continue; 7842 return d; 7843 } 7844 size_t idx = simpleCaseTrie[lhs]; 7845 size_t idx2 = simpleCaseTrie[rhs]; 7846 // simpleCaseTrie is packed index table 7847 if (idx != EMPTY_CASE_TRIE) 7848 { 7849 if (idx2 != EMPTY_CASE_TRIE) 7850 {// both cased chars 7851 // adjust idx --> start of bucket 7852 idx = idx - sTable[idx].n; 7853 idx2 = idx2 - sTable[idx2].n; 7854 if (idx == idx2)// one bucket, equivalent chars 7855 continue; 7856 else// not the same bucket 7857 diff = sTable[idx].ch - sTable[idx2].ch; 7858 } 7859 else 7860 diff = sTable[idx - sTable[idx].n].ch - rhs; 7861 } 7862 else if (idx2 != EMPTY_CASE_TRIE) 7863 { 7864 diff = lhs - sTable[idx2 - sTable[idx2].n].ch; 7865 } 7866 // one of chars is not cased at all 7867 return diff; 7868 } 7869 return int(r2.empty) - 1; 7870 } 7871 7872 /// 7873 @safe @nogc pure nothrow unittest 7874 { 7875 assert(sicmp("Август", "авгусТ") == 0); 7876 // Greek also works as long as there is no 1:M mapping in sight 7877 assert(sicmp("ΌΎ", "όύ") == 0); 7878 // things like the following won't get matched as equal 7879 // Greek small letter iota with dialytika and tonos 7880 assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0); 7881 7882 // while icmp has no problem with that 7883 assert(icmp("ΐ", "\u03B9\u0308\u0301") == 0); 7884 assert(icmp("ΌΎ", "όύ") == 0); 7885 } 7886 7887 // overloads for the most common cases to reduce compile time 7888 @safe @nogc pure nothrow 7889 { 7890 int sicmp(scope const(char)[] str1, scope const(char)[] str2) 7891 { return sicmp!(const(char)[], const(char)[])(str1, str2); } 7892 7893 int sicmp(scope const(wchar)[] str1, scope const(wchar)[] str2) 7894 { return sicmp!(const(wchar)[], const(wchar)[])(str1, str2); } 7895 7896 int sicmp(scope const(dchar)[] str1, scope const(dchar)[] str2) 7897 { return sicmp!(const(dchar)[], const(dchar)[])(str1, str2); } 7898 } 7899 7900 private int fullCasedCmp(Range)(dchar lhs, dchar rhs, ref Range rtail) 7901 { 7902 import std.algorithm.searching : skipOver; 7903 import std.internal.unicode_tables : fullCaseTable; // generated file 7904 alias fTable = fullCaseTable; 7905 size_t idx = fullCaseTrie[lhs]; 7906 // fullCaseTrie is packed index table 7907 if (idx == EMPTY_CASE_TRIE) 7908 return lhs; 7909 immutable start = idx - fTable[idx].n; 7910 immutable end = fTable[idx].size + start; 7911 assert(fTable[start].entry_len == 1); 7912 for (idx=start; idx<end; idx++) 7913 { 7914 auto entryLen = fTable[idx].entry_len; 7915 if (entryLen == 1) 7916 { 7917 if (fTable[idx].seq[0] == rhs) 7918 { 7919 return 0; 7920 } 7921 } 7922 else 7923 {// OK it's a long chunk, like 'ss' for German 7924 dstring seq = fTable[idx].seq[0 .. entryLen]; 7925 if (rhs == seq[0] 7926 && rtail.skipOver(seq[1..$])) 7927 { 7928 // note that this path modifies rtail 7929 // iff we managed to get there 7930 return 0; 7931 } 7932 } 7933 } 7934 return fTable[start].seq[0]; // new remapped character for accurate diffs 7935 } 7936 7937 /++ 7938 Does case insensitive comparison of `r1` and `r2`. 7939 Follows the rules of full case-folding mapping. 7940 This includes matching as equal german ß with "ss" and 7941 other 1:M $(CODEPOINT) mappings unlike $(LREF sicmp). 7942 The cost of `icmp` being pedantically correct is 7943 slightly worse performance. 7944 7945 Params: 7946 r1 = a forward range of characters 7947 r2 = a forward range of characters 7948 7949 Returns: 7950 An `int` that is 0 if the strings match, 7951 <0 if `str1` is lexicographically "less" than `str2`, 7952 >0 if `str1` is lexicographically "greater" than `str2` 7953 7954 See_Also: 7955 $(LREF sicmp) 7956 $(REF cmp, std,algorithm,comparison) 7957 +/ 7958 int icmp(S1, S2)(S1 r1, S2 r2) 7959 if (isForwardRange!S1 && isSomeChar!(ElementEncodingType!S1) 7960 && isForwardRange!S2 && isSomeChar!(ElementEncodingType!S2)) 7961 { 7962 import std.range.primitives : isInfinite; 7963 import std.traits : isDynamicArray; 7964 import std.utf : byDchar; 7965 static import std.ascii; 7966 7967 static if ((isDynamicArray!S1 || isRandomAccessRange!S1) 7968 && (isDynamicArray!S2 || isRandomAccessRange!S2) 7969 && !(isInfinite!S1 && isInfinite!S2) 7970 && __traits(compiles, 7971 { 7972 size_t s = size_t.max / 2; 7973 r1 = r1[s .. $]; 7974 r2 = r2[s .. $]; 7975 })) 7976 {{ 7977 // ASCII optimization for dynamic arrays & similar. 7978 size_t i = 0; 7979 static if (isInfinite!S1) 7980 immutable end = r2.length; 7981 else static if (isInfinite!S2) 7982 immutable end = r1.length; 7983 else 7984 immutable end = r1.length > r2.length ? r2.length : r1.length; 7985 for (; i < end; ++i) 7986 { 7987 auto lhs = r1[i]; 7988 auto rhs = r2[i]; 7989 if ((lhs | rhs) >= 0x80) goto NonAsciiPath; 7990 if (lhs == rhs) continue; 7991 auto lowDiff = std.ascii.toLower(lhs) - std.ascii.toLower(rhs); 7992 if (lowDiff) return lowDiff; 7993 } 7994 static if (isInfinite!S1) 7995 return 1; 7996 else static if (isInfinite!S2) 7997 return -1; 7998 else 7999 return (r1.length > r2.length) - (r2.length > r1.length); 8000 8001 NonAsciiPath: 8002 r1 = r1[i .. $]; 8003 r2 = r2[i .. $]; 8004 // Fall through to standard case. 8005 }} 8006 8007 auto str1 = r1.byDchar; 8008 auto str2 = r2.byDchar; 8009 8010 for (;;) 8011 { 8012 if (str1.empty) 8013 return str2.empty ? 0 : -1; 8014 immutable lhs = str1.front; 8015 if (str2.empty) 8016 return 1; 8017 immutable rhs = str2.front; 8018 str1.popFront(); 8019 str2.popFront(); 8020 if (!(lhs - rhs)) 8021 continue; 8022 // first try to match lhs to <rhs,right-tail> sequence 8023 immutable cmpLR = fullCasedCmp(lhs, rhs, str2); 8024 if (!cmpLR) 8025 continue; 8026 // then rhs to <lhs,left-tail> sequence 8027 immutable cmpRL = fullCasedCmp(rhs, lhs, str1); 8028 if (!cmpRL) 8029 continue; 8030 // cmpXX contain remapped codepoints 8031 // to obtain stable ordering of icmp 8032 return cmpLR - cmpRL; 8033 } 8034 } 8035 8036 /// 8037 @safe @nogc pure nothrow unittest 8038 { 8039 assert(icmp("Rußland", "Russland") == 0); 8040 assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0); 8041 } 8042 8043 /** 8044 * By using $(REF byUTF, std,utf) and its aliases, GC allocations via auto-decoding 8045 * and thrown exceptions can be avoided, making `icmp` `@safe @nogc nothrow pure`. 8046 */ 8047 @safe @nogc nothrow pure unittest 8048 { 8049 import std.utf : byDchar; 8050 8051 assert(icmp("Rußland".byDchar, "Russland".byDchar) == 0); 8052 assert(icmp("ᾩ -> \u1F70\u03B9".byDchar, "\u1F61\u03B9 -> ᾲ".byDchar) == 0); 8053 } 8054 8055 // test different character types 8056 @safe unittest 8057 { 8058 assert(icmp("Rußland", "Russland") == 0); 8059 assert(icmp("Rußland"w, "Russland") == 0); 8060 assert(icmp("Rußland", "Russland"w) == 0); 8061 assert(icmp("Rußland"w, "Russland"w) == 0); 8062 assert(icmp("Rußland"d, "Russland"w) == 0); 8063 assert(icmp("Rußland"w, "Russland"d) == 0); 8064 } 8065 8066 // overloads for the most common cases to reduce compile time 8067 @safe @nogc pure nothrow 8068 { 8069 int icmp(const(char)[] str1, const(char)[] str2) 8070 { return icmp!(const(char)[], const(char)[])(str1, str2); } 8071 int icmp(const(wchar)[] str1, const(wchar)[] str2) 8072 { return icmp!(const(wchar)[], const(wchar)[])(str1, str2); } 8073 int icmp(const(dchar)[] str1, const(dchar)[] str2) 8074 { return icmp!(const(dchar)[], const(dchar)[])(str1, str2); } 8075 } 8076 8077 @safe unittest 8078 { 8079 import std.algorithm.sorting : sort; 8080 import std.conv : to; 8081 import std.exception : assertCTFEable; 8082 assertCTFEable!( 8083 { 8084 static foreach (cfunc; AliasSeq!(icmp, sicmp)) 8085 {{ 8086 static foreach (S1; AliasSeq!(string, wstring, dstring)) 8087 static foreach (S2; AliasSeq!(string, wstring, dstring)) 8088 { 8089 assert(cfunc("".to!S1(), "".to!S2()) == 0); 8090 assert(cfunc("A".to!S1(), "".to!S2()) > 0); 8091 assert(cfunc("".to!S1(), "0".to!S2()) < 0); 8092 assert(cfunc("abc".to!S1(), "abc".to!S2()) == 0); 8093 assert(cfunc("abcd".to!S1(), "abc".to!S2()) > 0); 8094 assert(cfunc("abc".to!S1(), "abcd".to!S2()) < 0); 8095 assert(cfunc("Abc".to!S1(), "aBc".to!S2()) == 0); 8096 assert(cfunc("авГуст".to!S1(), "АВгУСТ".to!S2()) == 0); 8097 // Check example: 8098 assert(cfunc("Август".to!S1(), "авгусТ".to!S2()) == 0); 8099 assert(cfunc("ΌΎ".to!S1(), "όύ".to!S2()) == 0); 8100 } 8101 // check that the order is properly agnostic to the case 8102 auto strs = [ "Apple", "ORANGE", "orAcle", "amp", "banana"]; 8103 sort!((a,b) => cfunc(a,b) < 0)(strs); 8104 assert(strs == ["amp", "Apple", "banana", "orAcle", "ORANGE"]); 8105 }} 8106 assert(icmp("ßb", "ssa") > 0); 8107 // Check example: 8108 assert(icmp("Russland", "Rußland") == 0); 8109 assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0); 8110 assert(icmp("ΐ"w, "\u03B9\u0308\u0301") == 0); 8111 assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0); 8112 // https://issues.dlang.org/show_bug.cgi?id=11057 8113 assert( icmp("K", "L") < 0 ); 8114 }); 8115 } 8116 8117 // https://issues.dlang.org/show_bug.cgi?id=17372 8118 @safe pure unittest 8119 { 8120 import std.algorithm.iteration : joiner, map; 8121 import std.algorithm.sorting : sort; 8122 import std.array : array; 8123 auto a = [["foo", "bar"], ["baz"]].map!(line => line.joiner(" ")).array.sort!((a, b) => icmp(a, b) < 0); 8124 } 8125 8126 // This is package(std) for the moment to be used as a support tool for std.regex 8127 // It needs a better API 8128 /* 8129 Return a range of all $(CODEPOINTS) that casefold to 8130 and from this `ch`. 8131 */ 8132 package(std) auto simpleCaseFoldings(dchar ch) @safe 8133 { 8134 import std.internal.unicode_tables : simpleCaseTable; // generated file 8135 alias sTable = simpleCaseTable; 8136 static struct Range 8137 { 8138 @safe pure nothrow: 8139 uint idx; //if == uint.max, then read c. 8140 union 8141 { 8142 dchar c; // == 0 - empty range 8143 uint len; 8144 } 8145 @property bool isSmall() const { return idx == uint.max; } 8146 8147 this(dchar ch) 8148 { 8149 idx = uint.max; 8150 c = ch; 8151 } 8152 8153 this(uint start, uint size) 8154 { 8155 idx = start; 8156 len = size; 8157 } 8158 8159 @property dchar front() const 8160 { 8161 assert(!empty); 8162 if (isSmall) 8163 { 8164 return c; 8165 } 8166 auto ch = sTable[idx].ch; 8167 return ch; 8168 } 8169 8170 @property bool empty() const 8171 { 8172 if (isSmall) 8173 { 8174 return c == 0; 8175 } 8176 return len == 0; 8177 } 8178 8179 @property size_t length() const 8180 { 8181 if (isSmall) 8182 { 8183 return c == 0 ? 0 : 1; 8184 } 8185 return len; 8186 } 8187 8188 void popFront() 8189 { 8190 if (isSmall) 8191 c = 0; 8192 else 8193 { 8194 idx++; 8195 len--; 8196 } 8197 } 8198 } 8199 immutable idx = simpleCaseTrie[ch]; 8200 if (idx == EMPTY_CASE_TRIE) 8201 return Range(ch); 8202 auto entry = sTable[idx]; 8203 immutable start = idx - entry.n; 8204 return Range(start, entry.size); 8205 } 8206 8207 @system unittest 8208 { 8209 import std.algorithm.comparison : equal; 8210 import std.algorithm.searching : canFind; 8211 import std.array : array; 8212 import std.exception : assertCTFEable; 8213 assertCTFEable!((){ 8214 auto r = simpleCaseFoldings('Э').array; 8215 assert(r.length == 2); 8216 assert(r.canFind('э') && r.canFind('Э')); 8217 auto sr = simpleCaseFoldings('~'); 8218 assert(sr.equal("~")); 8219 //A with ring above - casefolds to the same bucket as Angstrom sign 8220 sr = simpleCaseFoldings('Å'); 8221 assert(sr.length == 3); 8222 assert(sr.canFind('å') && sr.canFind('Å') && sr.canFind('\u212B')); 8223 }); 8224 } 8225 8226 /++ 8227 $(P Returns the $(S_LINK Combining class, combining class) of `ch`.) 8228 +/ 8229 ubyte combiningClass(dchar ch) @safe pure nothrow @nogc 8230 { 8231 return combiningClassTrie[ch]; 8232 } 8233 8234 /// 8235 @safe unittest 8236 { 8237 // shorten the code 8238 alias CC = combiningClass; 8239 8240 // combining tilda 8241 assert(CC('\u0303') == 230); 8242 // combining ring below 8243 assert(CC('\u0325') == 220); 8244 // the simple consequence is that "tilda" should be 8245 // placed after a "ring below" in a sequence 8246 } 8247 8248 @safe pure nothrow @nogc unittest 8249 { 8250 foreach (ch; 0 .. 0x80) 8251 assert(combiningClass(ch) == 0); 8252 assert(combiningClass('\u05BD') == 22); 8253 assert(combiningClass('\u0300') == 230); 8254 assert(combiningClass('\u0317') == 220); 8255 assert(combiningClass('\u1939') == 222); 8256 } 8257 8258 /// Unicode character decomposition type. 8259 enum UnicodeDecomposition { 8260 /// Canonical decomposition. The result is canonically equivalent sequence. 8261 Canonical, 8262 /** 8263 Compatibility decomposition. The result is compatibility equivalent sequence. 8264 Note: Compatibility decomposition is a $(B lossy) conversion, 8265 typically suitable only for fuzzy matching and internal processing. 8266 */ 8267 Compatibility 8268 } 8269 8270 /** 8271 Shorthand aliases for character decomposition type, passed as a 8272 template parameter to $(LREF decompose). 8273 */ 8274 enum { 8275 Canonical = UnicodeDecomposition.Canonical, 8276 Compatibility = UnicodeDecomposition.Compatibility 8277 } 8278 8279 /++ 8280 Try to canonically compose 2 $(CHARACTERS). 8281 Returns the composed $(CHARACTER) if they do compose and dchar.init otherwise. 8282 8283 The assumption is that `first` comes before `second` in the original text, 8284 usually meaning that the first is a starter. 8285 8286 Note: Hangul syllables are not covered by this function. 8287 See `composeJamo` below. 8288 +/ 8289 public dchar compose(dchar first, dchar second) pure nothrow @safe 8290 { 8291 import std.algorithm.iteration : map; 8292 import std.internal.unicode_comp : compositionTable, composeCntShift, composeIdxMask; 8293 import std.range : assumeSorted; 8294 immutable packed = compositionJumpTrie[first]; 8295 if (packed == ushort.max) 8296 return dchar.init; 8297 // unpack offset and length 8298 immutable idx = packed & composeIdxMask, cnt = packed >> composeCntShift; 8299 // TODO: optimize this micro binary search (no more then 4-5 steps) 8300 auto r = compositionTable[idx .. idx+cnt].map!"a.rhs"().assumeSorted(); 8301 immutable target = r.lowerBound(second).length; 8302 if (target == cnt) 8303 return dchar.init; 8304 immutable entry = compositionTable[idx+target]; 8305 if (entry.rhs != second) 8306 return dchar.init; 8307 return entry.composed; 8308 } 8309 8310 /// 8311 @safe unittest 8312 { 8313 assert(compose('A','\u0308') == '\u00C4'); 8314 assert(compose('A', 'B') == dchar.init); 8315 assert(compose('C', '\u0301') == '\u0106'); 8316 // note that the starter is the first one 8317 // thus the following doesn't compose 8318 assert(compose('\u0308', 'A') == dchar.init); 8319 } 8320 8321 /++ 8322 Returns a full $(S_LINK Canonical decomposition, Canonical) 8323 (by default) or $(S_LINK Compatibility decomposition, Compatibility) 8324 decomposition of $(CHARACTER) `ch`. 8325 If no decomposition is available returns a $(LREF Grapheme) 8326 with the `ch` itself. 8327 8328 Note: 8329 This function also decomposes hangul syllables 8330 as prescribed by the standard. 8331 8332 See_Also: $(LREF decomposeHangul) for a restricted version 8333 that takes into account only hangul syllables but 8334 no other decompositions. 8335 +/ 8336 public Grapheme decompose(UnicodeDecomposition decompType=Canonical)(dchar ch) @safe 8337 { 8338 import std.algorithm.searching : until; 8339 import std.internal.unicode_decomp : decompCompatTable, decompCanonTable; 8340 static if (decompType == Canonical) 8341 { 8342 alias table = decompCanonTable; 8343 alias mapping = canonMappingTrie; 8344 } 8345 else static if (decompType == Compatibility) 8346 { 8347 alias table = decompCompatTable; 8348 alias mapping = compatMappingTrie; 8349 } 8350 immutable idx = mapping[ch]; 8351 if (!idx) // not found, check hangul arithmetic decomposition 8352 return decomposeHangul(ch); 8353 auto decomp = table[idx..$].until(0); 8354 return Grapheme(decomp); 8355 } 8356 8357 /// 8358 @system unittest 8359 { 8360 import std.algorithm.comparison : equal; 8361 8362 assert(compose('A','\u0308') == '\u00C4'); 8363 assert(compose('A', 'B') == dchar.init); 8364 assert(compose('C', '\u0301') == '\u0106'); 8365 // note that the starter is the first one 8366 // thus the following doesn't compose 8367 assert(compose('\u0308', 'A') == dchar.init); 8368 8369 assert(decompose('Ĉ')[].equal("C\u0302")); 8370 assert(decompose('D')[].equal("D")); 8371 assert(decompose('\uD4DC')[].equal("\u1111\u1171\u11B7")); 8372 assert(decompose!Compatibility('¹')[].equal("1")); 8373 } 8374 8375 //---------------------------------------------------------------------------- 8376 // Hangul specific composition/decomposition 8377 enum jamoSBase = 0xAC00; 8378 enum jamoLBase = 0x1100; 8379 enum jamoVBase = 0x1161; 8380 enum jamoTBase = 0x11A7; 8381 enum jamoLCount = 19, jamoVCount = 21, jamoTCount = 28; 8382 enum jamoNCount = jamoVCount * jamoTCount; 8383 enum jamoSCount = jamoLCount * jamoNCount; 8384 8385 // Tests if `ch` is a Hangul leading consonant jamo. 8386 bool isJamoL(dchar ch) pure nothrow @nogc @safe 8387 { 8388 // first cmp rejects ~ 1M code points above leading jamo range 8389 return ch < jamoLBase+jamoLCount && ch >= jamoLBase; 8390 } 8391 8392 // Tests if `ch` is a Hangul vowel jamo. 8393 bool isJamoT(dchar ch) pure nothrow @nogc @safe 8394 { 8395 // first cmp rejects ~ 1M code points above trailing jamo range 8396 // Note: ch == jamoTBase doesn't indicate trailing jamo (TIndex must be > 0) 8397 return ch < jamoTBase+jamoTCount && ch > jamoTBase; 8398 } 8399 8400 // Tests if `ch` is a Hangul trailnig consonant jamo. 8401 bool isJamoV(dchar ch) pure nothrow @nogc @safe 8402 { 8403 // first cmp rejects ~ 1M code points above vowel range 8404 return ch < jamoVBase+jamoVCount && ch >= jamoVBase; 8405 } 8406 8407 int hangulSyllableIndex(dchar ch) pure nothrow @nogc @safe 8408 { 8409 int idxS = cast(int) ch - jamoSBase; 8410 return idxS >= 0 && idxS < jamoSCount ? idxS : -1; 8411 } 8412 8413 // internal helper: compose hangul syllables leaving dchar.init in holes 8414 void hangulRecompose(dchar[] seq) pure nothrow @nogc @safe 8415 { 8416 for (size_t idx = 0; idx + 1 < seq.length; ) 8417 { 8418 if (isJamoL(seq[idx]) && isJamoV(seq[idx+1])) 8419 { 8420 immutable int indexL = seq[idx] - jamoLBase; 8421 immutable int indexV = seq[idx+1] - jamoVBase; 8422 immutable int indexLV = indexL * jamoNCount + indexV * jamoTCount; 8423 if (idx + 2 < seq.length && isJamoT(seq[idx+2])) 8424 { 8425 seq[idx] = jamoSBase + indexLV + seq[idx+2] - jamoTBase; 8426 seq[idx+1] = dchar.init; 8427 seq[idx+2] = dchar.init; 8428 idx += 3; 8429 } 8430 else 8431 { 8432 seq[idx] = jamoSBase + indexLV; 8433 seq[idx+1] = dchar.init; 8434 idx += 2; 8435 } 8436 } 8437 else 8438 idx++; 8439 } 8440 } 8441 8442 //---------------------------------------------------------------------------- 8443 public: 8444 8445 /** 8446 Decomposes a Hangul syllable. If `ch` is not a composed syllable 8447 then this function returns $(LREF Grapheme) containing only `ch` as is. 8448 */ 8449 Grapheme decomposeHangul(dchar ch) @safe 8450 { 8451 immutable idxS = cast(int) ch - jamoSBase; 8452 if (idxS < 0 || idxS >= jamoSCount) return Grapheme(ch); 8453 immutable idxL = idxS / jamoNCount; 8454 immutable idxV = (idxS % jamoNCount) / jamoTCount; 8455 immutable idxT = idxS % jamoTCount; 8456 8457 immutable partL = jamoLBase + idxL; 8458 immutable partV = jamoVBase + idxV; 8459 if (idxT > 0) // there is a trailling consonant (T); <L,V,T> decomposition 8460 return Grapheme(partL, partV, jamoTBase + idxT); 8461 else // <L, V> decomposition 8462 return Grapheme(partL, partV); 8463 } 8464 8465 /// 8466 @system unittest 8467 { 8468 import std.algorithm.comparison : equal; 8469 assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6")); 8470 } 8471 8472 /++ 8473 Try to compose hangul syllable out of a leading consonant (`lead`), 8474 a `vowel` and optional `trailing` consonant jamos. 8475 8476 On success returns the composed LV or LVT hangul syllable. 8477 8478 If any of `lead` and `vowel` are not a valid hangul jamo 8479 of the respective $(CHARACTER) class returns dchar.init. 8480 +/ 8481 dchar composeJamo(dchar lead, dchar vowel, dchar trailing=dchar.init) pure nothrow @nogc @safe 8482 { 8483 if (!isJamoL(lead)) 8484 return dchar.init; 8485 immutable indexL = lead - jamoLBase; 8486 if (!isJamoV(vowel)) 8487 return dchar.init; 8488 immutable indexV = vowel - jamoVBase; 8489 immutable indexLV = indexL * jamoNCount + indexV * jamoTCount; 8490 immutable dchar syllable = jamoSBase + indexLV; 8491 return isJamoT(trailing) ? syllable + (trailing - jamoTBase) : syllable; 8492 } 8493 8494 /// 8495 @safe unittest 8496 { 8497 assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB'); 8498 // leaving out T-vowel, or passing any codepoint 8499 // that is not trailing consonant composes an LV-syllable 8500 assert(composeJamo('\u1111', '\u1171') == '\uD4CC'); 8501 assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC'); 8502 assert(composeJamo('\u1111', 'A') == dchar.init); 8503 assert(composeJamo('A', '\u1171') == dchar.init); 8504 } 8505 8506 @system unittest 8507 { 8508 import std.algorithm.comparison : equal; 8509 import std.conv : text; 8510 8511 static void testDecomp(UnicodeDecomposition T)(dchar ch, string r) 8512 { 8513 Grapheme g = decompose!T(ch); 8514 assert(equal(g[], r), text(g[], " vs ", r)); 8515 } 8516 testDecomp!Canonical('\u1FF4', "\u03C9\u0301\u0345"); 8517 testDecomp!Canonical('\uF907', "\u9F9C"); 8518 testDecomp!Compatibility('\u33FF', "\u0067\u0061\u006C"); 8519 testDecomp!Compatibility('\uA7F9', "\u0153"); 8520 8521 // check examples 8522 assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6")); 8523 assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB'); 8524 assert(composeJamo('\u1111', '\u1171') == '\uD4CC'); // leave out T-vowel 8525 assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC'); 8526 assert(composeJamo('\u1111', 'A') == dchar.init); 8527 assert(composeJamo('A', '\u1171') == dchar.init); 8528 } 8529 8530 /** 8531 Enumeration type for normalization forms, 8532 passed as template parameter for functions like $(LREF normalize). 8533 */ 8534 enum NormalizationForm { 8535 NFC, 8536 NFD, 8537 NFKC, 8538 NFKD 8539 } 8540 8541 8542 enum { 8543 /** 8544 Shorthand aliases from values indicating normalization forms. 8545 */ 8546 NFC = NormalizationForm.NFC, 8547 ///ditto 8548 NFD = NormalizationForm.NFD, 8549 ///ditto 8550 NFKC = NormalizationForm.NFKC, 8551 ///ditto 8552 NFKD = NormalizationForm.NFKD 8553 } 8554 8555 /++ 8556 Returns `input` string normalized to the chosen form. 8557 Form C is used by default. 8558 8559 For more information on normalization forms see 8560 the $(S_LINK Normalization, normalization section). 8561 8562 Note: 8563 In cases where the string in question is already normalized, 8564 it is returned unmodified and no memory allocation happens. 8565 +/ 8566 inout(C)[] normalize(NormalizationForm norm=NFC, C)(inout(C)[] input) 8567 { 8568 import std.algorithm.mutation : SwapStrategy; 8569 import std.algorithm.sorting : sort; 8570 import std.array : appender; 8571 import std.range : zip; 8572 8573 auto anchors = splitNormalized!norm(input); 8574 if (anchors[0] == input.length && anchors[1] == input.length) 8575 return input; 8576 dchar[] decomposed; 8577 decomposed.reserve(31); 8578 ubyte[] ccc; 8579 ccc.reserve(31); 8580 auto app = appender!(C[])(); 8581 do 8582 { 8583 app.put(input[0 .. anchors[0]]); 8584 foreach (dchar ch; input[anchors[0]..anchors[1]]) 8585 static if (norm == NFD || norm == NFC) 8586 { 8587 foreach (dchar c; decompose!Canonical(ch)[]) 8588 decomposed ~= c; 8589 } 8590 else // NFKD & NFKC 8591 { 8592 foreach (dchar c; decompose!Compatibility(ch)[]) 8593 decomposed ~= c; 8594 } 8595 ccc.length = decomposed.length; 8596 size_t firstNonStable = 0; 8597 ubyte lastClazz = 0; 8598 8599 foreach (idx, dchar ch; decomposed) 8600 { 8601 immutable clazz = combiningClass(ch); 8602 ccc[idx] = clazz; 8603 if (clazz == 0 && lastClazz != 0) 8604 { 8605 // found a stable code point after unstable ones 8606 sort!("a[0] < b[0]", SwapStrategy.stable) 8607 (zip(ccc[firstNonStable .. idx], decomposed[firstNonStable .. idx])); 8608 firstNonStable = decomposed.length; 8609 } 8610 else if (clazz != 0 && lastClazz == 0) 8611 { 8612 // found first unstable code point after stable ones 8613 firstNonStable = idx; 8614 } 8615 lastClazz = clazz; 8616 } 8617 sort!("a[0] < b[0]", SwapStrategy.stable) 8618 (zip(ccc[firstNonStable..$], decomposed[firstNonStable..$])); 8619 static if (norm == NFC || norm == NFKC) 8620 { 8621 import std.algorithm.searching : countUntil; 8622 auto first = countUntil(ccc, 0); 8623 if (first >= 0) // no starters?? no recomposition 8624 { 8625 for (;;) 8626 { 8627 immutable second = recompose(first, decomposed, ccc); 8628 if (second == decomposed.length) 8629 break; 8630 first = second; 8631 } 8632 // 2nd pass for hangul syllables 8633 hangulRecompose(decomposed); 8634 } 8635 } 8636 static if (norm == NFD || norm == NFKD) 8637 app.put(decomposed); 8638 else 8639 { 8640 import std.algorithm.mutation : remove; 8641 auto clean = remove!("a == dchar.init", SwapStrategy.stable)(decomposed); 8642 app.put(decomposed[0 .. clean.length]); 8643 } 8644 // reset variables 8645 decomposed.length = 0; 8646 () @trusted { 8647 decomposed.assumeSafeAppend(); 8648 ccc.length = 0; 8649 ccc.assumeSafeAppend(); 8650 } (); 8651 input = input[anchors[1]..$]; 8652 // and move on 8653 anchors = splitNormalized!norm(input); 8654 }while (anchors[0] != input.length); 8655 app.put(input[0 .. anchors[0]]); 8656 return () @trusted inout { return cast(inout(C)[]) app.data; } (); 8657 } 8658 8659 /// 8660 @safe unittest 8661 { 8662 // any encoding works 8663 wstring greet = "Hello world"; 8664 assert(normalize(greet) is greet); // the same exact slice 8665 8666 // An example of a character with all 4 forms being different: 8667 // Greek upsilon with acute and hook symbol (code point 0x03D3) 8668 assert(normalize!NFC("ϓ") == "\u03D3"); 8669 assert(normalize!NFD("ϓ") == "\u03D2\u0301"); 8670 assert(normalize!NFKC("ϓ") == "\u038E"); 8671 assert(normalize!NFKD("ϓ") == "\u03A5\u0301"); 8672 } 8673 8674 @safe unittest 8675 { 8676 import std.conv : text; 8677 8678 assert(normalize!NFD("abc\uF904def") == "abc\u6ED1def", text(normalize!NFD("abc\uF904def"))); 8679 assert(normalize!NFKD("2¹⁰") == "210", normalize!NFKD("2¹⁰")); 8680 assert(normalize!NFD("Äffin") == "A\u0308ffin"); 8681 8682 // check example 8683 8684 // any encoding works 8685 wstring greet = "Hello world"; 8686 assert(normalize(greet) is greet); // the same exact slice 8687 8688 // An example of a character with all 4 forms being different: 8689 // Greek upsilon with acute and hook symbol (code point 0x03D3) 8690 assert(normalize!NFC("ϓ") == "\u03D3"); 8691 assert(normalize!NFD("ϓ") == "\u03D2\u0301"); 8692 assert(normalize!NFKC("ϓ") == "\u038E"); 8693 assert(normalize!NFKD("ϓ") == "\u03A5\u0301"); 8694 } 8695 8696 // canonically recompose given slice of code points, works in-place and mutates data 8697 private size_t recompose(size_t start, dchar[] input, ubyte[] ccc) pure nothrow @safe 8698 { 8699 assert(input.length == ccc.length); 8700 int accumCC = -1;// so that it's out of 0 .. 255 range 8701 // writefln("recomposing %( %04x %)", input); 8702 // first one is always a starter thus we start at i == 1 8703 size_t i = start+1; 8704 for (; ; ) 8705 { 8706 if (i == input.length) 8707 break; 8708 immutable curCC = ccc[i]; 8709 // In any character sequence beginning with a starter S 8710 // a character C is blocked from S if and only if there 8711 // is some character B between S and C, and either B 8712 // is a starter or it has the same or higher combining class as C. 8713 //------------------------ 8714 // Applying to our case: 8715 // S is input[0] 8716 // accumCC is the maximum CCC of characters between C and S, 8717 // as ccc are sorted 8718 // C is input[i] 8719 8720 if (curCC > accumCC) 8721 { 8722 immutable comp = compose(input[start], input[i]); 8723 if (comp != dchar.init) 8724 { 8725 input[start] = comp; 8726 input[i] = dchar.init;// put a sentinel 8727 // current was merged so its CCC shouldn't affect 8728 // composing with the next one 8729 } 8730 else 8731 { 8732 // if it was a starter then accumCC is now 0, end of loop 8733 accumCC = curCC; 8734 if (accumCC == 0) 8735 break; 8736 } 8737 } 8738 else 8739 { 8740 // ditto here 8741 accumCC = curCC; 8742 if (accumCC == 0) 8743 break; 8744 } 8745 i++; 8746 } 8747 return i; 8748 } 8749 8750 // returns tuple of 2 indexes that delimit: 8751 // normalized text, piece that needs normalization and 8752 // the rest of input starting with stable code point 8753 private auto splitNormalized(NormalizationForm norm, C)(const(C)[] input) 8754 { 8755 import std.typecons : tuple; 8756 ubyte lastCC = 0; 8757 8758 foreach (idx, dchar ch; input) 8759 { 8760 static if (norm == NFC) 8761 if (ch < 0x0300) 8762 { 8763 lastCC = 0; 8764 continue; 8765 } 8766 immutable ubyte CC = combiningClass(ch); 8767 if (lastCC > CC && CC != 0) 8768 { 8769 return seekStable!norm(idx, input); 8770 } 8771 8772 if (notAllowedIn!norm(ch)) 8773 { 8774 return seekStable!norm(idx, input); 8775 } 8776 lastCC = CC; 8777 } 8778 return tuple(input.length, input.length); 8779 } 8780 8781 private auto seekStable(NormalizationForm norm, C)(size_t idx, const scope C[] input) 8782 { 8783 import std.typecons : tuple; 8784 import std.utf : codeLength; 8785 8786 auto br = input[0 .. idx]; 8787 size_t region_start = 0;// default 8788 for (;;) 8789 { 8790 if (br.empty)// start is 0 8791 break; 8792 dchar ch = br.back; 8793 if (combiningClass(ch) == 0 && allowedIn!norm(ch)) 8794 { 8795 region_start = br.length - codeLength!C(ch); 8796 break; 8797 } 8798 br.popFront(); 8799 } 8800 ///@@@BUG@@@ can't use find: " find is a nested function and can't be used..." 8801 size_t region_end=input.length;// end is $ by default 8802 foreach (i, dchar ch; input[idx..$]) 8803 { 8804 if (combiningClass(ch) == 0 && allowedIn!norm(ch)) 8805 { 8806 region_end = i+idx; 8807 break; 8808 } 8809 } 8810 // writeln("Region to normalize: ", input[region_start .. region_end]); 8811 return tuple(region_start, region_end); 8812 } 8813 8814 /** 8815 Tests if dchar `ch` is always allowed (Quick_Check=YES) in normalization 8816 form `norm`. 8817 */ 8818 public bool allowedIn(NormalizationForm norm)(dchar ch) 8819 { 8820 return !notAllowedIn!norm(ch); 8821 } 8822 8823 /// 8824 @safe unittest 8825 { 8826 // e.g. Cyrillic is always allowed, so is ASCII 8827 assert(allowedIn!NFC('я')); 8828 assert(allowedIn!NFD('я')); 8829 assert(allowedIn!NFKC('я')); 8830 assert(allowedIn!NFKD('я')); 8831 assert(allowedIn!NFC('Z')); 8832 } 8833 8834 // not user friendly name but more direct 8835 private bool notAllowedIn(NormalizationForm norm)(dchar ch) 8836 { 8837 static if (norm == NFC) 8838 alias qcTrie = nfcQCTrie; 8839 else static if (norm == NFD) 8840 alias qcTrie = nfdQCTrie; 8841 else static if (norm == NFKC) 8842 alias qcTrie = nfkcQCTrie; 8843 else static if (norm == NFKD) 8844 alias qcTrie = nfkdQCTrie; 8845 else 8846 static assert("Unknown normalization form "~norm); 8847 return qcTrie[ch]; 8848 } 8849 8850 @safe unittest 8851 { 8852 assert(allowedIn!NFC('я')); 8853 assert(allowedIn!NFD('я')); 8854 assert(allowedIn!NFKC('я')); 8855 assert(allowedIn!NFKD('я')); 8856 assert(allowedIn!NFC('Z')); 8857 } 8858 8859 } 8860 8861 version (std_uni_bootstrap) 8862 { 8863 // old version used for bootstrapping of gen_uni.d that generates 8864 // up to date optimal versions of all of isXXX functions 8865 @safe pure nothrow @nogc public bool isWhite(dchar c) 8866 { 8867 import std.ascii : isWhite; 8868 return isWhite(c) || 8869 c == lineSep || c == paraSep || 8870 c == '\u0085' || c == '\u00A0' || c == '\u1680' || c == '\u180E' || 8871 (c >= '\u2000' && c <= '\u200A') || 8872 c == '\u202F' || c == '\u205F' || c == '\u3000'; 8873 } 8874 } 8875 else 8876 { 8877 8878 // trusted -> avoid bounds check 8879 @trusted pure nothrow @nogc private 8880 { 8881 import std.internal.unicode_tables; // : toLowerTable, toTitleTable, toUpperTable; // generated file 8882 8883 // hide template instances behind functions 8884 // https://issues.dlang.org/show_bug.cgi?id=13232 8885 ushort toLowerIndex(dchar c) { return toLowerIndexTrie[c]; } 8886 ushort toLowerSimpleIndex(dchar c) { return toLowerSimpleIndexTrie[c]; } 8887 dchar toLowerTab(size_t idx) { return toLowerTable[idx]; } 8888 8889 ushort toTitleIndex(dchar c) { return toTitleIndexTrie[c]; } 8890 ushort toTitleSimpleIndex(dchar c) { return toTitleSimpleIndexTrie[c]; } 8891 dchar toTitleTab(size_t idx) { return toTitleTable[idx]; } 8892 8893 ushort toUpperIndex(dchar c) { return toUpperIndexTrie[c]; } 8894 ushort toUpperSimpleIndex(dchar c) { return toUpperSimpleIndexTrie[c]; } 8895 dchar toUpperTab(size_t idx) { return toUpperTable[idx]; } 8896 } 8897 8898 public: 8899 8900 /++ 8901 Whether or not `c` is a Unicode whitespace $(CHARACTER). 8902 (general Unicode category: Part of C0(tab, vertical tab, form feed, 8903 carriage return, and linefeed characters), Zs, Zl, Zp, and NEL(U+0085)) 8904 +/ 8905 @safe pure nothrow @nogc 8906 public bool isWhite(dchar c) 8907 { 8908 import std.internal.unicode_tables : isWhiteGen; // generated file 8909 return isWhiteGen(c); // call pregenerated binary search 8910 } 8911 8912 /++ 8913 Return whether `c` is a Unicode lowercase $(CHARACTER). 8914 +/ 8915 @safe pure nothrow @nogc 8916 bool isLower(dchar c) 8917 { 8918 import std.ascii : isLower, isASCII; 8919 if (isASCII(c)) 8920 return isLower(c); 8921 return lowerCaseTrie[c]; 8922 } 8923 8924 @safe unittest 8925 { 8926 import std.ascii : isLower; 8927 foreach (v; 0 .. 0x80) 8928 assert(isLower(v) == .isLower(v)); 8929 assert(.isLower('я')); 8930 assert(.isLower('й')); 8931 assert(!.isLower('Ж')); 8932 // Greek HETA 8933 assert(!.isLower('\u0370')); 8934 assert(.isLower('\u0371')); 8935 assert(!.isLower('\u039C')); // capital MU 8936 assert(.isLower('\u03B2')); // beta 8937 // from extended Greek 8938 assert(!.isLower('\u1F18')); 8939 assert(.isLower('\u1F00')); 8940 foreach (v; unicode.lowerCase.byCodepoint) 8941 assert(.isLower(v) && !isUpper(v)); 8942 } 8943 8944 8945 /++ 8946 Return whether `c` is a Unicode uppercase $(CHARACTER). 8947 +/ 8948 @safe pure nothrow @nogc 8949 bool isUpper(dchar c) 8950 { 8951 import std.ascii : isUpper, isASCII; 8952 if (isASCII(c)) 8953 return isUpper(c); 8954 return upperCaseTrie[c]; 8955 } 8956 8957 @safe unittest 8958 { 8959 import std.ascii : isLower; 8960 foreach (v; 0 .. 0x80) 8961 assert(isLower(v) == .isLower(v)); 8962 assert(!isUpper('й')); 8963 assert(isUpper('Ж')); 8964 // Greek HETA 8965 assert(isUpper('\u0370')); 8966 assert(!isUpper('\u0371')); 8967 assert(isUpper('\u039C')); // capital MU 8968 assert(!isUpper('\u03B2')); // beta 8969 // from extended Greek 8970 assert(!isUpper('\u1F00')); 8971 assert(isUpper('\u1F18')); 8972 foreach (v; unicode.upperCase.byCodepoint) 8973 assert(isUpper(v) && !.isLower(v)); 8974 } 8975 8976 8977 //TODO: Hidden for now, needs better API. 8978 //Other transforms could use better API as well, but this one is a new primitive. 8979 @safe pure nothrow @nogc 8980 private dchar toTitlecase(dchar c) 8981 { 8982 // optimize ASCII case 8983 if (c < 0xAA) 8984 { 8985 if (c < 'a') 8986 return c; 8987 if (c <= 'z') 8988 return c - 32; 8989 return c; 8990 } 8991 size_t idx = toTitleSimpleIndex(c); 8992 if (idx != ushort.max) 8993 { 8994 return toTitleTab(idx); 8995 } 8996 return c; 8997 } 8998 8999 private alias UpperTriple = AliasSeq!(toUpperIndex, MAX_SIMPLE_UPPER, toUpperTab); 9000 private alias LowerTriple = AliasSeq!(toLowerIndex, MAX_SIMPLE_LOWER, toLowerTab); 9001 9002 // generic toUpper/toLower on whole string, creates new or returns as is 9003 private ElementEncodingType!S[] toCase(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, S)(S s) 9004 if (isSomeString!S || (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S))) 9005 { 9006 import std.array : appender, array; 9007 import std.ascii : isASCII; 9008 import std.utf : byDchar, codeLength; 9009 9010 alias C = ElementEncodingType!S; 9011 9012 auto r = s.byDchar; 9013 for (size_t i; !r.empty; i += r.front.codeLength!C , r.popFront()) 9014 { 9015 auto cOuter = r.front; 9016 ushort idx = indexFn(cOuter); 9017 if (idx == ushort.max) 9018 continue; 9019 auto result = appender!(C[])(); 9020 result.reserve(s.length); 9021 result.put(s[0 .. i]); 9022 foreach (dchar c; s[i .. $].byDchar) 9023 { 9024 if (c.isASCII) 9025 { 9026 result.put(asciiConvert(c)); 9027 } 9028 else 9029 { 9030 idx = indexFn(c); 9031 if (idx == ushort.max) 9032 result.put(c); 9033 else if (idx < maxIdx) 9034 { 9035 c = tableFn(idx); 9036 result.put(c); 9037 } 9038 else 9039 { 9040 auto val = tableFn(idx); 9041 // unpack length + codepoint 9042 immutable uint len = val >> 24; 9043 result.put(cast(dchar)(val & 0xFF_FFFF)); 9044 foreach (j; idx+1 .. idx+len) 9045 result.put(tableFn(j)); 9046 } 9047 } 9048 } 9049 return result.data; 9050 } 9051 9052 static if (isSomeString!S) 9053 return s; 9054 else 9055 return s.array; 9056 } 9057 9058 // https://issues.dlang.org/show_bug.cgi?id=12428 9059 @safe unittest 9060 { 9061 import std.array : replicate; 9062 auto s = "abcdefghij".replicate(300); 9063 s = s[0 .. 10]; 9064 9065 toUpper(s); 9066 9067 assert(s == "abcdefghij"); 9068 } 9069 9070 // https://issues.dlang.org/show_bug.cgi?id=18993 9071 @safe unittest 9072 { 9073 static assert(`몬스터/A`.toLower.length == `몬스터/a`.toLower.length); 9074 } 9075 9076 9077 // generic toUpper/toLower on whole range, returns range 9078 private auto toCaser(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, Range)(Range str) 9079 // Accept range of dchar's 9080 if (isInputRange!Range && 9081 isSomeChar!(ElementEncodingType!Range) && 9082 ElementEncodingType!Range.sizeof == dchar.sizeof) 9083 { 9084 static struct ToCaserImpl 9085 { 9086 @property bool empty() 9087 { 9088 return !nLeft && r.empty; 9089 } 9090 9091 @property auto front() 9092 { 9093 import std.ascii : isASCII; 9094 9095 if (!nLeft) 9096 { 9097 dchar c = r.front; 9098 if (c.isASCII) 9099 { 9100 buf[0] = asciiConvert(c); 9101 nLeft = 1; 9102 } 9103 else 9104 { 9105 const idx = indexFn(c); 9106 if (idx == ushort.max) 9107 { 9108 buf[0] = c; 9109 nLeft = 1; 9110 } 9111 else if (idx < maxIdx) 9112 { 9113 buf[0] = tableFn(idx); 9114 nLeft = 1; 9115 } 9116 else 9117 { 9118 immutable val = tableFn(idx); 9119 // unpack length + codepoint 9120 nLeft = val >> 24; 9121 if (nLeft == 0) 9122 nLeft = 1; 9123 assert(nLeft <= buf.length); 9124 buf[nLeft - 1] = cast(dchar)(val & 0xFF_FFFF); 9125 foreach (j; 1 .. nLeft) 9126 buf[nLeft - j - 1] = tableFn(idx + j); 9127 } 9128 } 9129 } 9130 return buf[nLeft - 1]; 9131 } 9132 9133 void popFront() 9134 { 9135 if (!nLeft) 9136 front; 9137 assert(nLeft); 9138 --nLeft; 9139 if (!nLeft) 9140 r.popFront(); 9141 } 9142 9143 static if (isForwardRange!Range) 9144 { 9145 @property auto save() 9146 { 9147 auto ret = this; 9148 ret.r = r.save; 9149 return ret; 9150 } 9151 } 9152 9153 private: 9154 Range r; 9155 uint nLeft; 9156 dchar[3] buf = void; 9157 } 9158 9159 return ToCaserImpl(str); 9160 } 9161 9162 /********************* 9163 * Convert an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) 9164 * or a string to upper or lower case. 9165 * 9166 * Does not allocate memory. 9167 * Characters in UTF-8 or UTF-16 format that cannot be decoded 9168 * are treated as $(REF replacementDchar, std,utf). 9169 * 9170 * Params: 9171 * str = string or range of characters 9172 * 9173 * Returns: 9174 * an input range of `dchar`s 9175 * 9176 * See_Also: 9177 * $(LREF toUpper), $(LREF toLower) 9178 */ 9179 9180 auto asLowerCase(Range)(Range str) 9181 if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) && 9182 !isConvertibleToString!Range) 9183 { 9184 static if (ElementEncodingType!Range.sizeof < dchar.sizeof) 9185 { 9186 import std.utf : byDchar; 9187 9188 // Decode first 9189 return asLowerCase(str.byDchar); 9190 } 9191 else 9192 { 9193 static import std.ascii; 9194 return toCaser!(LowerTriple, std.ascii.toLower)(str); 9195 } 9196 } 9197 9198 /// ditto 9199 auto asUpperCase(Range)(Range str) 9200 if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) && 9201 !isConvertibleToString!Range) 9202 { 9203 static if (ElementEncodingType!Range.sizeof < dchar.sizeof) 9204 { 9205 import std.utf : byDchar; 9206 9207 // Decode first 9208 return asUpperCase(str.byDchar); 9209 } 9210 else 9211 { 9212 static import std.ascii; 9213 return toCaser!(UpperTriple, std.ascii.toUpper)(str); 9214 } 9215 } 9216 9217 /// 9218 @safe pure unittest 9219 { 9220 import std.algorithm.comparison : equal; 9221 9222 assert("hEllo".asUpperCase.equal("HELLO")); 9223 } 9224 9225 // explicitly undocumented 9226 auto asLowerCase(Range)(auto ref Range str) 9227 if (isConvertibleToString!Range) 9228 { 9229 import std.traits : StringTypeOf; 9230 return asLowerCase!(StringTypeOf!Range)(str); 9231 } 9232 9233 // explicitly undocumented 9234 auto asUpperCase(Range)(auto ref Range str) 9235 if (isConvertibleToString!Range) 9236 { 9237 import std.traits : StringTypeOf; 9238 return asUpperCase!(StringTypeOf!Range)(str); 9239 } 9240 9241 @safe unittest 9242 { 9243 static struct TestAliasedString 9244 { 9245 string get() @safe @nogc pure nothrow { return _s; } 9246 alias get this; 9247 @disable this(this); 9248 string _s; 9249 } 9250 9251 static bool testAliasedString(alias func, Args...)(string s, Args args) 9252 { 9253 import std.algorithm.comparison : equal; 9254 auto a = func(TestAliasedString(s), args); 9255 auto b = func(s, args); 9256 static if (is(typeof(equal(a, b)))) 9257 { 9258 // For ranges, compare contents instead of object identity. 9259 return equal(a, b); 9260 } 9261 else 9262 { 9263 return a == b; 9264 } 9265 } 9266 assert(testAliasedString!asLowerCase("hEllo")); 9267 assert(testAliasedString!asUpperCase("hEllo")); 9268 assert(testAliasedString!asCapitalized("hEllo")); 9269 } 9270 9271 @safe unittest 9272 { 9273 import std.array : array; 9274 9275 auto a = "HELLo".asLowerCase; 9276 auto savea = a.save; 9277 auto s = a.array; 9278 assert(s == "hello"); 9279 s = savea.array; 9280 assert(s == "hello"); 9281 9282 string[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"]; 9283 string[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"]; 9284 9285 foreach (i, slwr; lower) 9286 { 9287 import std.utf : byChar; 9288 9289 auto sx = slwr.asUpperCase.byChar.array; 9290 assert(sx == toUpper(slwr)); 9291 auto sy = upper[i].asLowerCase.byChar.array; 9292 assert(sy == toLower(upper[i])); 9293 } 9294 9295 // Not necessary to call r.front 9296 for (auto r = lower[3].asUpperCase; !r.empty; r.popFront()) 9297 { 9298 } 9299 9300 import std.algorithm.comparison : equal; 9301 9302 "HELLo"w.asLowerCase.equal("hello"d); 9303 "HELLo"w.asUpperCase.equal("HELLO"d); 9304 "HELLo"d.asLowerCase.equal("hello"d); 9305 "HELLo"d.asUpperCase.equal("HELLO"d); 9306 9307 import std.utf : byChar; 9308 assert(toLower("\u1Fe2") == asLowerCase("\u1Fe2").byChar.array); 9309 } 9310 9311 // generic capitalizer on whole range, returns range 9312 private auto toCapitalizer(alias indexFnUpper, uint maxIdxUpper, alias tableFnUpper, 9313 Range)(Range str) 9314 // Accept range of dchar's 9315 if (isInputRange!Range && 9316 isSomeChar!(ElementEncodingType!Range) && 9317 ElementEncodingType!Range.sizeof == dchar.sizeof) 9318 { 9319 static struct ToCapitalizerImpl 9320 { 9321 @property bool empty() 9322 { 9323 return lower ? lwr.empty : !nLeft && r.empty; 9324 } 9325 9326 @property auto front() 9327 { 9328 if (lower) 9329 return lwr.front; 9330 9331 if (!nLeft) 9332 { 9333 immutable dchar c = r.front; 9334 const idx = indexFnUpper(c); 9335 if (idx == ushort.max) 9336 { 9337 buf[0] = c; 9338 nLeft = 1; 9339 } 9340 else if (idx < maxIdxUpper) 9341 { 9342 buf[0] = tableFnUpper(idx); 9343 nLeft = 1; 9344 } 9345 else 9346 { 9347 immutable val = tableFnUpper(idx); 9348 // unpack length + codepoint 9349 nLeft = val >> 24; 9350 if (nLeft == 0) 9351 nLeft = 1; 9352 assert(nLeft <= buf.length); 9353 buf[nLeft - 1] = cast(dchar)(val & 0xFF_FFFF); 9354 foreach (j; 1 .. nLeft) 9355 buf[nLeft - j - 1] = tableFnUpper(idx + j); 9356 } 9357 } 9358 return buf[nLeft - 1]; 9359 } 9360 9361 void popFront() 9362 { 9363 if (lower) 9364 lwr.popFront(); 9365 else 9366 { 9367 if (!nLeft) 9368 front; 9369 assert(nLeft); 9370 --nLeft; 9371 if (!nLeft) 9372 { 9373 r.popFront(); 9374 lwr = r.asLowerCase(); 9375 lower = true; 9376 } 9377 } 9378 } 9379 9380 static if (isForwardRange!Range) 9381 { 9382 @property auto save() 9383 { 9384 auto ret = this; 9385 ret.r = r.save; 9386 ret.lwr = lwr.save; 9387 return ret; 9388 } 9389 } 9390 9391 private: 9392 Range r; 9393 typeof(r.asLowerCase) lwr; // range representing the lower case rest of string 9394 bool lower = false; // false for first character, true for rest of string 9395 dchar[3] buf = void; 9396 uint nLeft = 0; 9397 } 9398 9399 return ToCapitalizerImpl(str); 9400 } 9401 9402 /********************* 9403 * Capitalize an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) 9404 * or string, meaning convert the first 9405 * character to upper case and subsequent characters to lower case. 9406 * 9407 * Does not allocate memory. 9408 * Characters in UTF-8 or UTF-16 format that cannot be decoded 9409 * are treated as $(REF replacementDchar, std,utf). 9410 * 9411 * Params: 9412 * str = string or range of characters 9413 * 9414 * Returns: 9415 * an InputRange of dchars 9416 * 9417 * See_Also: 9418 * $(LREF toUpper), $(LREF toLower) 9419 * $(LREF asUpperCase), $(LREF asLowerCase) 9420 */ 9421 9422 auto asCapitalized(Range)(Range str) 9423 if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) && 9424 !isConvertibleToString!Range) 9425 { 9426 static if (ElementEncodingType!Range.sizeof < dchar.sizeof) 9427 { 9428 import std.utf : byDchar; 9429 9430 // Decode first 9431 return toCapitalizer!UpperTriple(str.byDchar); 9432 } 9433 else 9434 { 9435 return toCapitalizer!UpperTriple(str); 9436 } 9437 } 9438 9439 /// 9440 @safe pure unittest 9441 { 9442 import std.algorithm.comparison : equal; 9443 9444 assert("hEllo".asCapitalized.equal("Hello")); 9445 } 9446 9447 auto asCapitalized(Range)(auto ref Range str) 9448 if (isConvertibleToString!Range) 9449 { 9450 import std.traits : StringTypeOf; 9451 return asCapitalized!(StringTypeOf!Range)(str); 9452 } 9453 9454 @safe pure nothrow @nogc unittest 9455 { 9456 auto r = "hEllo".asCapitalized(); 9457 assert(r.front == 'H'); 9458 } 9459 9460 @safe unittest 9461 { 9462 import std.array : array; 9463 9464 auto a = "hELLo".asCapitalized; 9465 auto savea = a.save; 9466 auto s = a.array; 9467 assert(s == "Hello"); 9468 s = savea.array; 9469 assert(s == "Hello"); 9470 9471 string[2][] cases = 9472 [ 9473 ["", ""], 9474 ["h", "H"], 9475 ["H", "H"], 9476 ["3", "3"], 9477 ["123", "123"], 9478 ["h123A", "H123a"], 9479 ["феж", "Феж"], 9480 ["\u1Fe2", "\u03a5\u0308\u0300"], 9481 ]; 9482 9483 foreach (i; 0 .. cases.length) 9484 { 9485 import std.utf : byChar; 9486 9487 auto r = cases[i][0].asCapitalized.byChar.array; 9488 auto result = cases[i][1]; 9489 assert(r == result); 9490 } 9491 9492 // Don't call r.front 9493 for (auto r = "\u1Fe2".asCapitalized; !r.empty; r.popFront()) 9494 { 9495 } 9496 9497 import std.algorithm.comparison : equal; 9498 9499 "HELLo"w.asCapitalized.equal("Hello"d); 9500 "hElLO"w.asCapitalized.equal("Hello"d); 9501 "hello"d.asCapitalized.equal("Hello"d); 9502 "HELLO"d.asCapitalized.equal("Hello"d); 9503 9504 import std.utf : byChar; 9505 assert(asCapitalized("\u0130").byChar.array == asUpperCase("\u0130").byChar.array); 9506 } 9507 9508 // TODO: helper, I wish std.utf was more flexible (and stright) 9509 private size_t encodeTo(scope char[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc 9510 { 9511 if (c <= 0x7F) 9512 { 9513 buf[idx] = cast(char) c; 9514 idx++; 9515 } 9516 else if (c <= 0x7FF) 9517 { 9518 buf[idx] = cast(char)(0xC0 | (c >> 6)); 9519 buf[idx+1] = cast(char)(0x80 | (c & 0x3F)); 9520 idx += 2; 9521 } 9522 else if (c <= 0xFFFF) 9523 { 9524 buf[idx] = cast(char)(0xE0 | (c >> 12)); 9525 buf[idx+1] = cast(char)(0x80 | ((c >> 6) & 0x3F)); 9526 buf[idx+2] = cast(char)(0x80 | (c & 0x3F)); 9527 idx += 3; 9528 } 9529 else if (c <= 0x10FFFF) 9530 { 9531 buf[idx] = cast(char)(0xF0 | (c >> 18)); 9532 buf[idx+1] = cast(char)(0x80 | ((c >> 12) & 0x3F)); 9533 buf[idx+2] = cast(char)(0x80 | ((c >> 6) & 0x3F)); 9534 buf[idx+3] = cast(char)(0x80 | (c & 0x3F)); 9535 idx += 4; 9536 } 9537 else 9538 assert(0); 9539 return idx; 9540 } 9541 9542 @safe unittest 9543 { 9544 char[] s = "abcd".dup; 9545 size_t i = 0; 9546 i = encodeTo(s, i, 'X'); 9547 assert(s == "Xbcd"); 9548 9549 i = encodeTo(s, i, cast(dchar)'\u00A9'); 9550 assert(s == "X\xC2\xA9d"); 9551 } 9552 9553 // TODO: helper, I wish std.utf was more flexible (and stright) 9554 private size_t encodeTo(scope wchar[] buf, size_t idx, dchar c) @trusted pure 9555 { 9556 import std.utf : UTFException; 9557 if (c <= 0xFFFF) 9558 { 9559 if (0xD800 <= c && c <= 0xDFFF) 9560 throw (new UTFException("Encoding an isolated surrogate code point in UTF-16")).setSequence(c); 9561 buf[idx] = cast(wchar) c; 9562 idx++; 9563 } 9564 else if (c <= 0x10FFFF) 9565 { 9566 buf[idx] = cast(wchar)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800); 9567 buf[idx+1] = cast(wchar)(((c - 0x10000) & 0x3FF) + 0xDC00); 9568 idx += 2; 9569 } 9570 else 9571 assert(0); 9572 return idx; 9573 } 9574 9575 private size_t encodeTo(scope dchar[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc 9576 { 9577 buf[idx] = c; 9578 idx++; 9579 return idx; 9580 } 9581 9582 private void toCaseInPlace(alias indexFn, uint maxIdx, alias tableFn, C)(ref C[] s) @trusted pure 9583 if (is(C == char) || is(C == wchar) || is(C == dchar)) 9584 { 9585 import std.utf : decode, codeLength; 9586 size_t curIdx = 0; 9587 size_t destIdx = 0; 9588 alias slowToCase = toCaseInPlaceAlloc!(indexFn, maxIdx, tableFn); 9589 size_t lastUnchanged = 0; 9590 // in-buffer move of bytes to a new start index 9591 // the trick is that it may not need to copy at all 9592 static size_t moveTo(C[] str, size_t dest, size_t from, size_t to) 9593 { 9594 // Interestingly we may just bump pointer for a while 9595 // then have to copy if a re-cased char was smaller the original 9596 // later we may regain pace with char that got bigger 9597 // In the end it sometimes flip-flops between the 2 cases below 9598 if (dest == from) 9599 return to; 9600 // got to copy 9601 foreach (C c; str[from .. to]) 9602 str[dest++] = c; 9603 return dest; 9604 } 9605 while (curIdx != s.length) 9606 { 9607 size_t startIdx = curIdx; 9608 immutable ch = decode(s, curIdx); 9609 // TODO: special case for ASCII 9610 immutable caseIndex = indexFn(ch); 9611 if (caseIndex == ushort.max) // unchanged, skip over 9612 { 9613 continue; 9614 } 9615 else if (caseIndex < maxIdx) // 1:1 codepoint mapping 9616 { 9617 // previous cased chars had the same length as uncased ones 9618 // thus can just adjust pointer 9619 destIdx = moveTo(s, destIdx, lastUnchanged, startIdx); 9620 lastUnchanged = curIdx; 9621 immutable cased = tableFn(caseIndex); 9622 immutable casedLen = codeLength!C(cased); 9623 if (casedLen + destIdx > curIdx) // no place to fit cased char 9624 { 9625 // switch to slow codepath, where we allocate 9626 return slowToCase(s, startIdx, destIdx); 9627 } 9628 else 9629 { 9630 destIdx = encodeTo(s, destIdx, cased); 9631 } 9632 } 9633 else // 1:m codepoint mapping, slow codepath 9634 { 9635 destIdx = moveTo(s, destIdx, lastUnchanged, startIdx); 9636 lastUnchanged = curIdx; 9637 return slowToCase(s, startIdx, destIdx); 9638 } 9639 assert(destIdx <= curIdx); 9640 } 9641 if (lastUnchanged != s.length) 9642 { 9643 destIdx = moveTo(s, destIdx, lastUnchanged, s.length); 9644 } 9645 s = s[0 .. destIdx]; 9646 } 9647 9648 // helper to precalculate size of case-converted string 9649 private template toCaseLength(alias indexFn, uint maxIdx, alias tableFn) 9650 { 9651 size_t toCaseLength(C)(const scope C[] str) 9652 { 9653 import std.utf : decode, codeLength; 9654 size_t codeLen = 0; 9655 size_t lastNonTrivial = 0; 9656 size_t curIdx = 0; 9657 while (curIdx != str.length) 9658 { 9659 immutable startIdx = curIdx; 9660 immutable ch = decode(str, curIdx); 9661 immutable ushort caseIndex = indexFn(ch); 9662 if (caseIndex == ushort.max) 9663 continue; 9664 else if (caseIndex < maxIdx) 9665 { 9666 codeLen += startIdx - lastNonTrivial; 9667 lastNonTrivial = curIdx; 9668 immutable cased = tableFn(caseIndex); 9669 codeLen += codeLength!C(cased); 9670 } 9671 else 9672 { 9673 codeLen += startIdx - lastNonTrivial; 9674 lastNonTrivial = curIdx; 9675 immutable val = tableFn(caseIndex); 9676 immutable len = val >> 24; 9677 immutable dchar cased = val & 0xFF_FFFF; 9678 codeLen += codeLength!C(cased); 9679 foreach (j; caseIndex+1 .. caseIndex+len) 9680 codeLen += codeLength!C(tableFn(j)); 9681 } 9682 } 9683 if (lastNonTrivial != str.length) 9684 codeLen += str.length - lastNonTrivial; 9685 return codeLen; 9686 } 9687 } 9688 9689 @safe unittest 9690 { 9691 alias toLowerLength = toCaseLength!(LowerTriple); 9692 assert(toLowerLength("abcd") == 4); 9693 assert(toLowerLength("аБВгд456") == 10+3); 9694 } 9695 9696 // slower code path that preallocates and then copies 9697 // case-converted stuf to the new string 9698 private template toCaseInPlaceAlloc(alias indexFn, uint maxIdx, alias tableFn) 9699 { 9700 void toCaseInPlaceAlloc(C)(ref C[] s, size_t curIdx, 9701 size_t destIdx) @trusted pure 9702 if (is(C == char) || is(C == wchar) || is(C == dchar)) 9703 { 9704 import std.utf : decode; 9705 alias caseLength = toCaseLength!(indexFn, maxIdx, tableFn); 9706 auto trueLength = destIdx + caseLength(s[curIdx..$]); 9707 C[] ns = new C[trueLength]; 9708 ns[0 .. destIdx] = s[0 .. destIdx]; 9709 size_t lastUnchanged = curIdx; 9710 while (curIdx != s.length) 9711 { 9712 immutable startIdx = curIdx; // start of current codepoint 9713 immutable ch = decode(s, curIdx); 9714 immutable caseIndex = indexFn(ch); 9715 if (caseIndex == ushort.max) // skip over 9716 { 9717 continue; 9718 } 9719 else if (caseIndex < maxIdx) // 1:1 codepoint mapping 9720 { 9721 immutable cased = tableFn(caseIndex); 9722 auto toCopy = startIdx - lastUnchanged; 9723 ns[destIdx .. destIdx+toCopy] = s[lastUnchanged .. startIdx]; 9724 lastUnchanged = curIdx; 9725 destIdx += toCopy; 9726 destIdx = encodeTo(ns, destIdx, cased); 9727 } 9728 else // 1:m codepoint mapping, slow codepath 9729 { 9730 auto toCopy = startIdx - lastUnchanged; 9731 ns[destIdx .. destIdx+toCopy] = s[lastUnchanged .. startIdx]; 9732 lastUnchanged = curIdx; 9733 destIdx += toCopy; 9734 auto val = tableFn(caseIndex); 9735 // unpack length + codepoint 9736 immutable uint len = val >> 24; 9737 destIdx = encodeTo(ns, destIdx, cast(dchar)(val & 0xFF_FFFF)); 9738 foreach (j; caseIndex+1 .. caseIndex+len) 9739 destIdx = encodeTo(ns, destIdx, tableFn(j)); 9740 } 9741 } 9742 if (lastUnchanged != s.length) 9743 { 9744 auto toCopy = s.length - lastUnchanged; 9745 ns[destIdx .. destIdx+toCopy] = s[lastUnchanged..$]; 9746 destIdx += toCopy; 9747 } 9748 assert(ns.length == destIdx); 9749 s = ns; 9750 } 9751 } 9752 9753 /++ 9754 Converts `s` to lowercase (by performing Unicode lowercase mapping) in place. 9755 For a few characters string length may increase after the transformation, 9756 in such a case the function reallocates exactly once. 9757 If `s` does not have any uppercase characters, then `s` is unaltered. 9758 +/ 9759 void toLowerInPlace(C)(ref C[] s) @trusted pure 9760 if (is(C == char) || is(C == wchar) || is(C == dchar)) 9761 { 9762 toCaseInPlace!(LowerTriple)(s); 9763 } 9764 // overloads for the most common cases to reduce compile time 9765 @safe pure /*TODO nothrow*/ 9766 { 9767 void toLowerInPlace(ref char[] s) 9768 { toLowerInPlace!char(s); } 9769 void toLowerInPlace(ref wchar[] s) 9770 { toLowerInPlace!wchar(s); } 9771 void toLowerInPlace(ref dchar[] s) 9772 { toLowerInPlace!dchar(s); } 9773 } 9774 9775 /++ 9776 Converts `s` to uppercase (by performing Unicode uppercase mapping) in place. 9777 For a few characters string length may increase after the transformation, 9778 in such a case the function reallocates exactly once. 9779 If `s` does not have any lowercase characters, then `s` is unaltered. 9780 +/ 9781 void toUpperInPlace(C)(ref C[] s) @trusted pure 9782 if (is(C == char) || is(C == wchar) || is(C == dchar)) 9783 { 9784 toCaseInPlace!(UpperTriple)(s); 9785 } 9786 // overloads for the most common cases to reduce compile time/code size 9787 @safe pure /*TODO nothrow*/ 9788 { 9789 void toUpperInPlace(ref char[] s) 9790 { toUpperInPlace!char(s); } 9791 void toUpperInPlace(ref wchar[] s) 9792 { toUpperInPlace!wchar(s); } 9793 void toUpperInPlace(ref dchar[] s) 9794 { toUpperInPlace!dchar(s); } 9795 } 9796 9797 /++ 9798 If `c` is a Unicode uppercase $(CHARACTER), then its lowercase equivalent 9799 is returned. Otherwise `c` is returned. 9800 9801 Warning: certain alphabets like German and Greek have no 1:1 9802 upper-lower mapping. Use overload of toLower which takes full string instead. 9803 +/ 9804 @safe pure nothrow @nogc 9805 dchar toLower(dchar c) 9806 { 9807 // optimize ASCII case 9808 if (c < 0xAA) 9809 { 9810 if (c < 'A') 9811 return c; 9812 if (c <= 'Z') 9813 return c + 32; 9814 return c; 9815 } 9816 size_t idx = toLowerSimpleIndex(c); 9817 if (idx != ushort.max) 9818 { 9819 return toLowerTab(idx); 9820 } 9821 return c; 9822 } 9823 9824 /++ 9825 Creates a new array which is identical to `s` except that all of its 9826 characters are converted to lowercase (by preforming Unicode lowercase mapping). 9827 If none of `s` characters were affected, then `s` itself is returned if `s` is a 9828 `string`-like type. 9829 9830 Params: 9831 s = A $(REF_ALTTEXT random access range, isRandomAccessRange, std,range,primitives) 9832 of characters 9833 Returns: 9834 An array with the same element type as `s`. 9835 +/ 9836 ElementEncodingType!S[] toLower(S)(S s) 9837 if (isSomeString!S || (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S))) 9838 { 9839 static import std.ascii; 9840 9841 static if (isSomeString!S) 9842 return () @trusted { return toCase!(LowerTriple, std.ascii.toLower)(s); } (); 9843 else 9844 return toCase!(LowerTriple, std.ascii.toLower)(s); 9845 } 9846 9847 // overloads for the most common cases to reduce compile time 9848 @safe pure /*TODO nothrow*/ 9849 { 9850 string toLower(string s) 9851 { return toLower!string(s); } 9852 wstring toLower(wstring s) 9853 { return toLower!wstring(s); } 9854 dstring toLower(dstring s) 9855 { return toLower!dstring(s); } 9856 9857 @safe unittest 9858 { 9859 // https://issues.dlang.org/show_bug.cgi?id=16663 9860 9861 static struct String 9862 { 9863 string data; 9864 alias data this; 9865 } 9866 9867 void foo() 9868 { 9869 auto u = toLower(String("")); 9870 } 9871 } 9872 } 9873 9874 9875 @safe unittest 9876 { 9877 static import std.ascii; 9878 import std.format : format; 9879 foreach (ch; 0 .. 0x80) 9880 assert(std.ascii.toLower(ch) == toLower(ch)); 9881 assert(toLower('Я') == 'я'); 9882 assert(toLower('Δ') == 'δ'); 9883 foreach (ch; unicode.upperCase.byCodepoint) 9884 { 9885 dchar low = ch.toLower(); 9886 assert(low == ch || isLower(low), format("%s -> %s", ch, low)); 9887 } 9888 assert(toLower("АЯ") == "ая"); 9889 9890 assert("\u1E9E".toLower == "\u00df"); 9891 assert("\u00df".toUpper == "SS"); 9892 } 9893 9894 // https://issues.dlang.org/show_bug.cgi?id=9629 9895 @safe unittest 9896 { 9897 wchar[] test = "hello þ world"w.dup; 9898 auto piece = test[6 .. 7]; 9899 toUpperInPlace(piece); 9900 assert(test == "hello Þ world"); 9901 } 9902 9903 9904 @safe unittest 9905 { 9906 import std.algorithm.comparison : cmp; 9907 string s1 = "FoL"; 9908 string s2 = toLower(s1); 9909 assert(cmp(s2, "fol") == 0, s2); 9910 assert(s2 != s1); 9911 9912 char[] s3 = s1.dup; 9913 toLowerInPlace(s3); 9914 assert(s3 == s2); 9915 9916 s1 = "A\u0100B\u0101d"; 9917 s2 = toLower(s1); 9918 s3 = s1.dup; 9919 assert(cmp(s2, "a\u0101b\u0101d") == 0); 9920 assert(s2 !is s1); 9921 toLowerInPlace(s3); 9922 assert(s3 == s2); 9923 9924 s1 = "A\u0460B\u0461d"; 9925 s2 = toLower(s1); 9926 s3 = s1.dup; 9927 assert(cmp(s2, "a\u0461b\u0461d") == 0); 9928 assert(s2 !is s1); 9929 toLowerInPlace(s3); 9930 assert(s3 == s2); 9931 9932 s1 = "\u0130"; 9933 s2 = toLower(s1); 9934 s3 = s1.dup; 9935 assert(s2 == "i\u0307"); 9936 assert(s2 !is s1); 9937 toLowerInPlace(s3); 9938 assert(s3 == s2); 9939 9940 // Test on wchar and dchar strings. 9941 assert(toLower("Some String"w) == "some string"w); 9942 assert(toLower("Some String"d) == "some string"d); 9943 9944 // https://issues.dlang.org/show_bug.cgi?id=12455 9945 dchar c = 'İ'; // '\U0130' LATIN CAPITAL LETTER I WITH DOT ABOVE 9946 assert(isUpper(c)); 9947 assert(toLower(c) == 'i'); 9948 // extends on https://issues.dlang.org/show_bug.cgi?id=12455 report 9949 // check simple-case toUpper too 9950 c = '\u1f87'; 9951 assert(isLower(c)); 9952 assert(toUpper(c) == '\u1F8F'); 9953 } 9954 9955 @safe pure unittest 9956 { 9957 import std.algorithm.comparison : cmp, equal; 9958 import std.utf : byCodeUnit; 9959 auto r1 = "FoL".byCodeUnit; 9960 assert(r1.toLower.cmp("fol") == 0); 9961 auto r2 = "A\u0460B\u0461d".byCodeUnit; 9962 assert(r2.toLower.cmp("a\u0461b\u0461d") == 0); 9963 } 9964 9965 /++ 9966 If `c` is a Unicode lowercase $(CHARACTER), then its uppercase equivalent 9967 is returned. Otherwise `c` is returned. 9968 9969 Warning: 9970 Certain alphabets like German and Greek have no 1:1 9971 upper-lower mapping. Use overload of toUpper which takes full string instead. 9972 9973 toUpper can be used as an argument to $(REF map, std,algorithm,iteration) 9974 to produce an algorithm that can convert a range of characters to upper case 9975 without allocating memory. 9976 A string can then be produced by using $(REF copy, std,algorithm,mutation) 9977 to send it to an $(REF appender, std,array). 9978 +/ 9979 @safe pure nothrow @nogc 9980 dchar toUpper(dchar c) 9981 { 9982 // optimize ASCII case 9983 if (c < 0xAA) 9984 { 9985 if (c < 'a') 9986 return c; 9987 if (c <= 'z') 9988 return c - 32; 9989 return c; 9990 } 9991 size_t idx = toUpperSimpleIndex(c); 9992 if (idx != ushort.max) 9993 { 9994 return toUpperTab(idx); 9995 } 9996 return c; 9997 } 9998 9999 /// 10000 @safe unittest 10001 { 10002 import std.algorithm.iteration : map; 10003 import std.algorithm.mutation : copy; 10004 import std.array : appender; 10005 10006 auto abuf = appender!(char[])(); 10007 "hello".map!toUpper.copy(abuf); 10008 assert(abuf.data == "HELLO"); 10009 } 10010 10011 @safe unittest 10012 { 10013 static import std.ascii; 10014 import std.format : format; 10015 foreach (ch; 0 .. 0x80) 10016 assert(std.ascii.toUpper(ch) == toUpper(ch)); 10017 assert(toUpper('я') == 'Я'); 10018 assert(toUpper('δ') == 'Δ'); 10019 auto title = unicode.Titlecase_Letter; 10020 foreach (ch; unicode.lowerCase.byCodepoint) 10021 { 10022 dchar up = ch.toUpper(); 10023 assert(up == ch || isUpper(up) || title[up], 10024 format("%x -> %x", ch, up)); 10025 } 10026 } 10027 10028 /++ 10029 Allocates a new array which is identical to `s` except that all of its 10030 characters are converted to uppercase (by preforming Unicode uppercase mapping). 10031 If none of `s` characters were affected, then `s` itself is returned if `s` 10032 is a `string`-like type. 10033 10034 Params: 10035 s = A $(REF_ALTTEXT random access range, isRandomAccessRange, std,range,primitives) 10036 of characters 10037 Returns: 10038 An new array with the same element type as `s`. 10039 +/ 10040 ElementEncodingType!S[] toUpper(S)(S s) 10041 if (isSomeString!S || (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S))) 10042 { 10043 static import std.ascii; 10044 10045 static if (isSomeString!S) 10046 return () @trusted { return toCase!(UpperTriple, std.ascii.toUpper)(s); } (); 10047 else 10048 return toCase!(UpperTriple, std.ascii.toUpper)(s); 10049 } 10050 10051 // overloads for the most common cases to reduce compile time 10052 @safe pure /*TODO nothrow*/ 10053 { 10054 string toUpper(string s) 10055 { return toUpper!string(s); } 10056 wstring toUpper(wstring s) 10057 { return toUpper!wstring(s); } 10058 dstring toUpper(dstring s) 10059 { return toUpper!dstring(s); } 10060 10061 @safe unittest 10062 { 10063 // https://issues.dlang.org/show_bug.cgi?id=16663 10064 10065 static struct String 10066 { 10067 string data; 10068 alias data this; 10069 } 10070 10071 void foo() 10072 { 10073 auto u = toUpper(String("")); 10074 } 10075 } 10076 } 10077 10078 @safe unittest 10079 { 10080 import std.algorithm.comparison : cmp; 10081 10082 string s1 = "FoL"; 10083 string s2; 10084 char[] s3; 10085 10086 s2 = toUpper(s1); 10087 s3 = s1.dup; toUpperInPlace(s3); 10088 assert(s3 == s2, s3); 10089 assert(cmp(s2, "FOL") == 0); 10090 assert(s2 !is s1); 10091 10092 s1 = "a\u0100B\u0101d"; 10093 s2 = toUpper(s1); 10094 s3 = s1.dup; toUpperInPlace(s3); 10095 assert(s3 == s2); 10096 assert(cmp(s2, "A\u0100B\u0100D") == 0); 10097 assert(s2 !is s1); 10098 10099 s1 = "a\u0460B\u0461d"; 10100 s2 = toUpper(s1); 10101 s3 = s1.dup; toUpperInPlace(s3); 10102 assert(s3 == s2); 10103 assert(cmp(s2, "A\u0460B\u0460D") == 0); 10104 assert(s2 !is s1); 10105 } 10106 10107 @system unittest 10108 { 10109 static void doTest(C)(const(C)[] s, const(C)[] trueUp, const(C)[] trueLow) 10110 { 10111 import std.format : format; 10112 string diff = "src: %( %x %)\nres: %( %x %)\ntru: %( %x %)"; 10113 auto low = s.toLower() , up = s.toUpper(); 10114 auto lowInp = s.dup, upInp = s.dup; 10115 lowInp.toLowerInPlace(); 10116 upInp.toUpperInPlace(); 10117 assert(low == trueLow, format(diff, low, trueLow)); 10118 assert(up == trueUp, format(diff, up, trueUp)); 10119 assert(lowInp == trueLow, 10120 format(diff, cast(ubyte[]) s, cast(ubyte[]) lowInp, cast(ubyte[]) trueLow)); 10121 assert(upInp == trueUp, 10122 format(diff, cast(ubyte[]) s, cast(ubyte[]) upInp, cast(ubyte[]) trueUp)); 10123 } 10124 static foreach (S; AliasSeq!(dstring, wstring, string)) 10125 {{ 10126 10127 S easy = "123"; 10128 S good = "abCФеж"; 10129 S awful = "\u0131\u023f\u2126"; 10130 S wicked = "\u0130\u1FE2"; 10131 auto options = [easy, good, awful, wicked]; 10132 S[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"]; 10133 S[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"]; 10134 10135 foreach (val; [easy, good]) 10136 { 10137 auto e = val.dup; 10138 auto g = e; 10139 e.toUpperInPlace(); 10140 assert(e is g); 10141 e.toLowerInPlace(); 10142 assert(e is g); 10143 } 10144 foreach (i, v; options) 10145 { 10146 doTest(v, upper[i], lower[i]); 10147 } 10148 10149 // a few combinatorial runs 10150 foreach (i; 0 .. options.length) 10151 foreach (j; i .. options.length) 10152 foreach (k; j .. options.length) 10153 { 10154 auto sample = options[i] ~ options[j] ~ options[k]; 10155 auto sample2 = options[k] ~ options[j] ~ options[i]; 10156 doTest(sample, upper[i] ~ upper[j] ~ upper[k], 10157 lower[i] ~ lower[j] ~ lower[k]); 10158 doTest(sample2, upper[k] ~ upper[j] ~ upper[i], 10159 lower[k] ~ lower[j] ~ lower[i]); 10160 } 10161 }} 10162 } 10163 10164 // test random access ranges 10165 @safe pure unittest 10166 { 10167 import std.algorithm.comparison : cmp; 10168 import std.utf : byCodeUnit; 10169 auto s1 = "FoL".byCodeUnit; 10170 assert(s1.toUpper.cmp("FOL") == 0); 10171 auto s2 = "a\u0460B\u0461d".byCodeUnit; 10172 assert(s2.toUpper.cmp("A\u0460B\u0460D") == 0); 10173 } 10174 10175 /++ 10176 Returns whether `c` is a Unicode alphabetic $(CHARACTER) 10177 (general Unicode category: Alphabetic). 10178 +/ 10179 @safe pure nothrow @nogc 10180 bool isAlpha(dchar c) 10181 { 10182 // optimization 10183 if (c < 0xAA) 10184 { 10185 size_t x = c - 'A'; 10186 if (x <= 'Z' - 'A') 10187 return true; 10188 else 10189 { 10190 x = c - 'a'; 10191 if (x <= 'z'-'a') 10192 return true; 10193 } 10194 return false; 10195 } 10196 10197 return alphaTrie[c]; 10198 } 10199 10200 @safe unittest 10201 { 10202 auto alpha = unicode("Alphabetic"); 10203 foreach (ch; alpha.byCodepoint) 10204 assert(isAlpha(ch)); 10205 foreach (ch; 0 .. 0x4000) 10206 assert((ch in alpha) == isAlpha(ch)); 10207 } 10208 10209 10210 /++ 10211 Returns whether `c` is a Unicode mark 10212 (general Unicode category: Mn, Me, Mc). 10213 +/ 10214 @safe pure nothrow @nogc 10215 bool isMark(dchar c) 10216 { 10217 return markTrie[c]; 10218 } 10219 10220 @safe unittest 10221 { 10222 auto mark = unicode("Mark"); 10223 foreach (ch; mark.byCodepoint) 10224 assert(isMark(ch)); 10225 foreach (ch; 0 .. 0x4000) 10226 assert((ch in mark) == isMark(ch)); 10227 } 10228 10229 /++ 10230 Returns whether `c` is a Unicode numerical $(CHARACTER) 10231 (general Unicode category: Nd, Nl, No). 10232 +/ 10233 @safe pure nothrow @nogc 10234 bool isNumber(dchar c) 10235 { 10236 // optimization for ascii case 10237 if (c <= 0x7F) 10238 { 10239 return c >= '0' && c <= '9'; 10240 } 10241 else 10242 { 10243 return numberTrie[c]; 10244 } 10245 } 10246 10247 @safe unittest 10248 { 10249 auto n = unicode("N"); 10250 foreach (ch; n.byCodepoint) 10251 assert(isNumber(ch)); 10252 foreach (ch; 0 .. 0x4000) 10253 assert((ch in n) == isNumber(ch)); 10254 } 10255 10256 /++ 10257 Returns whether `c` is a Unicode alphabetic $(CHARACTER) or number. 10258 (general Unicode category: Alphabetic, Nd, Nl, No). 10259 10260 Params: 10261 c = any Unicode character 10262 Returns: 10263 `true` if the character is in the Alphabetic, Nd, Nl, or No Unicode 10264 categories 10265 +/ 10266 @safe pure nothrow @nogc 10267 bool isAlphaNum(dchar c) 10268 { 10269 static import std.ascii; 10270 10271 // optimization for ascii case 10272 if (std.ascii.isASCII(c)) 10273 { 10274 return std.ascii.isAlphaNum(c); 10275 } 10276 else 10277 { 10278 return isAlpha(c) || isNumber(c); 10279 } 10280 } 10281 10282 @safe unittest 10283 { 10284 auto n = unicode("N"); 10285 auto alpha = unicode("Alphabetic"); 10286 10287 foreach (ch; n.byCodepoint) 10288 assert(isAlphaNum(ch)); 10289 10290 foreach (ch; alpha.byCodepoint) 10291 assert(isAlphaNum(ch)); 10292 10293 foreach (ch; 0 .. 0x4000) 10294 { 10295 assert(((ch in n) || (ch in alpha)) == isAlphaNum(ch)); 10296 } 10297 } 10298 10299 /++ 10300 Returns whether `c` is a Unicode punctuation $(CHARACTER) 10301 (general Unicode category: Pd, Ps, Pe, Pc, Po, Pi, Pf). 10302 +/ 10303 @safe pure nothrow @nogc 10304 bool isPunctuation(dchar c) 10305 { 10306 static import std.ascii; 10307 10308 // optimization for ascii case 10309 if (c <= 0x7F) 10310 { 10311 return std.ascii.isPunctuation(c); 10312 } 10313 else 10314 { 10315 return punctuationTrie[c]; 10316 } 10317 } 10318 10319 @safe unittest 10320 { 10321 assert(isPunctuation('\u0021')); 10322 assert(isPunctuation('\u0028')); 10323 assert(isPunctuation('\u0029')); 10324 assert(isPunctuation('\u002D')); 10325 assert(isPunctuation('\u005F')); 10326 assert(isPunctuation('\u00AB')); 10327 assert(isPunctuation('\u00BB')); 10328 foreach (ch; unicode("P").byCodepoint) 10329 assert(isPunctuation(ch)); 10330 } 10331 10332 /++ 10333 Returns whether `c` is a Unicode symbol $(CHARACTER) 10334 (general Unicode category: Sm, Sc, Sk, So). 10335 +/ 10336 @safe pure nothrow @nogc 10337 bool isSymbol(dchar c) 10338 { 10339 return symbolTrie[c]; 10340 } 10341 10342 @safe unittest 10343 { 10344 import std.format : format; 10345 assert(isSymbol('\u0024')); 10346 assert(isSymbol('\u002B')); 10347 assert(isSymbol('\u005E')); 10348 assert(isSymbol('\u00A6')); 10349 foreach (ch; unicode("S").byCodepoint) 10350 assert(isSymbol(ch), format("%04x", ch)); 10351 } 10352 10353 /++ 10354 Returns whether `c` is a Unicode space $(CHARACTER) 10355 (general Unicode category: Zs) 10356 Note: This doesn't include '\n', '\r', \t' and other non-space $(CHARACTER). 10357 For commonly used less strict semantics see $(LREF isWhite). 10358 +/ 10359 @safe pure nothrow @nogc 10360 bool isSpace(dchar c) 10361 { 10362 import std.internal.unicode_tables : isSpaceGen; // generated file 10363 return isSpaceGen(c); 10364 } 10365 10366 @safe unittest 10367 { 10368 assert(isSpace('\u0020')); 10369 auto space = unicode.Zs; 10370 foreach (ch; space.byCodepoint) 10371 assert(isSpace(ch)); 10372 foreach (ch; 0 .. 0x1000) 10373 assert(isSpace(ch) == space[ch]); 10374 } 10375 10376 10377 /++ 10378 Returns whether `c` is a Unicode graphical $(CHARACTER) 10379 (general Unicode category: L, M, N, P, S, Zs). 10380 10381 +/ 10382 @safe pure nothrow @nogc 10383 bool isGraphical(dchar c) 10384 { 10385 return graphicalTrie[c]; 10386 } 10387 10388 10389 @safe unittest 10390 { 10391 auto set = unicode("Graphical"); 10392 import std.format : format; 10393 foreach (ch; set.byCodepoint) 10394 assert(isGraphical(ch), format("%4x", ch)); 10395 foreach (ch; 0 .. 0x4000) 10396 assert((ch in set) == isGraphical(ch)); 10397 } 10398 10399 10400 /++ 10401 Returns whether `c` is a Unicode control $(CHARACTER) 10402 (general Unicode category: Cc). 10403 +/ 10404 @safe pure nothrow @nogc 10405 bool isControl(dchar c) 10406 { 10407 import std.internal.unicode_tables : isControlGen; // generated file 10408 return isControlGen(c); 10409 } 10410 10411 @safe unittest 10412 { 10413 assert(isControl('\u0000')); 10414 assert(isControl('\u0081')); 10415 assert(!isControl('\u0100')); 10416 auto cc = unicode.Cc; 10417 foreach (ch; cc.byCodepoint) 10418 assert(isControl(ch)); 10419 foreach (ch; 0 .. 0x1000) 10420 assert(isControl(ch) == cc[ch]); 10421 } 10422 10423 10424 /++ 10425 Returns whether `c` is a Unicode formatting $(CHARACTER) 10426 (general Unicode category: Cf). 10427 +/ 10428 @safe pure nothrow @nogc 10429 bool isFormat(dchar c) 10430 { 10431 import std.internal.unicode_tables : isFormatGen; // generated file 10432 return isFormatGen(c); 10433 } 10434 10435 10436 @safe unittest 10437 { 10438 assert(isFormat('\u00AD')); 10439 foreach (ch; unicode("Format").byCodepoint) 10440 assert(isFormat(ch)); 10441 } 10442 10443 // code points for private use, surrogates are not likely to change in near feature 10444 // if need be they can be generated from unicode data as well 10445 10446 /++ 10447 Returns whether `c` is a Unicode Private Use $(CODEPOINT) 10448 (general Unicode category: Co). 10449 +/ 10450 @safe pure nothrow @nogc 10451 bool isPrivateUse(dchar c) 10452 { 10453 return (0x00_E000 <= c && c <= 0x00_F8FF) 10454 || (0x0F_0000 <= c && c <= 0x0F_FFFD) 10455 || (0x10_0000 <= c && c <= 0x10_FFFD); 10456 } 10457 10458 /++ 10459 Returns whether `c` is a Unicode surrogate $(CODEPOINT) 10460 (general Unicode category: Cs). 10461 +/ 10462 @safe pure nothrow @nogc 10463 bool isSurrogate(dchar c) 10464 { 10465 return (0xD800 <= c && c <= 0xDFFF); 10466 } 10467 10468 /++ 10469 Returns whether `c` is a Unicode high surrogate (lead surrogate). 10470 +/ 10471 @safe pure nothrow @nogc 10472 bool isSurrogateHi(dchar c) 10473 { 10474 return (0xD800 <= c && c <= 0xDBFF); 10475 } 10476 10477 /++ 10478 Returns whether `c` is a Unicode low surrogate (trail surrogate). 10479 +/ 10480 @safe pure nothrow @nogc 10481 bool isSurrogateLo(dchar c) 10482 { 10483 return (0xDC00 <= c && c <= 0xDFFF); 10484 } 10485 10486 /++ 10487 Returns whether `c` is a Unicode non-character i.e. 10488 a $(CODEPOINT) with no assigned abstract character. 10489 (general Unicode category: Cn) 10490 +/ 10491 @safe pure nothrow @nogc 10492 bool isNonCharacter(dchar c) 10493 { 10494 return nonCharacterTrie[c]; 10495 } 10496 10497 @safe unittest 10498 { 10499 auto set = unicode("Cn"); 10500 foreach (ch; set.byCodepoint) 10501 assert(isNonCharacter(ch)); 10502 } 10503 10504 private: 10505 // load static data from pre-generated tables into usable datastructures 10506 10507 10508 @safe auto asSet(const (ubyte)[] compressed) pure 10509 { 10510 return CodepointSet.fromIntervals(decompressIntervals(compressed)); 10511 } 10512 10513 @safe pure nothrow auto asTrie(T...)(const scope TrieEntry!T e) 10514 { 10515 return const(CodepointTrie!T)(e.offsets, e.sizes, e.data); 10516 } 10517 10518 @safe pure nothrow @nogc @property 10519 { 10520 import std.internal.unicode_tables; // generated file 10521 10522 // It's important to use auto return here, so that the compiler 10523 // only runs semantic on the return type if the function gets 10524 // used. Also these are functions rather than templates to not 10525 // increase the object size of the caller. 10526 auto lowerCaseTrie() { static immutable res = asTrie(lowerCaseTrieEntries); return res; } 10527 auto upperCaseTrie() { static immutable res = asTrie(upperCaseTrieEntries); return res; } 10528 auto simpleCaseTrie() { static immutable res = asTrie(simpleCaseTrieEntries); return res; } 10529 auto fullCaseTrie() { static immutable res = asTrie(fullCaseTrieEntries); return res; } 10530 auto alphaTrie() { static immutable res = asTrie(alphaTrieEntries); return res; } 10531 auto markTrie() { static immutable res = asTrie(markTrieEntries); return res; } 10532 auto numberTrie() { static immutable res = asTrie(numberTrieEntries); return res; } 10533 auto punctuationTrie() { static immutable res = asTrie(punctuationTrieEntries); return res; } 10534 auto symbolTrie() { static immutable res = asTrie(symbolTrieEntries); return res; } 10535 auto graphicalTrie() { static immutable res = asTrie(graphicalTrieEntries); return res; } 10536 auto nonCharacterTrie() { static immutable res = asTrie(nonCharacterTrieEntries); return res; } 10537 10538 //normalization quick-check tables 10539 auto nfcQCTrie() 10540 { 10541 import std.internal.unicode_norm : nfcQCTrieEntries; 10542 static immutable res = asTrie(nfcQCTrieEntries); 10543 return res; 10544 } 10545 10546 auto nfdQCTrie() 10547 { 10548 import std.internal.unicode_norm : nfdQCTrieEntries; 10549 static immutable res = asTrie(nfdQCTrieEntries); 10550 return res; 10551 } 10552 10553 auto nfkcQCTrie() 10554 { 10555 import std.internal.unicode_norm : nfkcQCTrieEntries; 10556 static immutable res = asTrie(nfkcQCTrieEntries); 10557 return res; 10558 } 10559 10560 auto nfkdQCTrie() 10561 { 10562 import std.internal.unicode_norm : nfkdQCTrieEntries; 10563 static immutable res = asTrie(nfkdQCTrieEntries); 10564 return res; 10565 } 10566 10567 //grapheme breaking algorithm tables 10568 auto mcTrie() 10569 { 10570 import std.internal.unicode_grapheme : mcTrieEntries; 10571 static immutable res = asTrie(mcTrieEntries); 10572 return res; 10573 } 10574 10575 auto graphemeExtendTrie() 10576 { 10577 import std.internal.unicode_grapheme : graphemeExtendTrieEntries; 10578 static immutable res = asTrie(graphemeExtendTrieEntries); 10579 return res; 10580 } 10581 10582 auto hangLV() 10583 { 10584 import std.internal.unicode_grapheme : hangulLVTrieEntries; 10585 static immutable res = asTrie(hangulLVTrieEntries); 10586 return res; 10587 } 10588 10589 auto hangLVT() 10590 { 10591 import std.internal.unicode_grapheme : hangulLVTTrieEntries; 10592 static immutable res = asTrie(hangulLVTTrieEntries); 10593 return res; 10594 } 10595 10596 // tables below are used for composition/decomposition 10597 auto combiningClassTrie() 10598 { 10599 import std.internal.unicode_comp : combiningClassTrieEntries; 10600 static immutable res = asTrie(combiningClassTrieEntries); 10601 return res; 10602 } 10603 10604 auto compatMappingTrie() 10605 { 10606 import std.internal.unicode_decomp : compatMappingTrieEntries; 10607 static immutable res = asTrie(compatMappingTrieEntries); 10608 return res; 10609 } 10610 10611 auto canonMappingTrie() 10612 { 10613 import std.internal.unicode_decomp : canonMappingTrieEntries; 10614 static immutable res = asTrie(canonMappingTrieEntries); 10615 return res; 10616 } 10617 10618 auto compositionJumpTrie() 10619 { 10620 import std.internal.unicode_comp : compositionJumpTrieEntries; 10621 static immutable res = asTrie(compositionJumpTrieEntries); 10622 return res; 10623 } 10624 10625 //case conversion tables 10626 auto toUpperIndexTrie() { static immutable res = asTrie(toUpperIndexTrieEntries); return res; } 10627 auto toLowerIndexTrie() { static immutable res = asTrie(toLowerIndexTrieEntries); return res; } 10628 auto toTitleIndexTrie() { static immutable res = asTrie(toTitleIndexTrieEntries); return res; } 10629 //simple case conversion tables 10630 auto toUpperSimpleIndexTrie() { static immutable res = asTrie(toUpperSimpleIndexTrieEntries); return res; } 10631 auto toLowerSimpleIndexTrie() { static immutable res = asTrie(toLowerSimpleIndexTrieEntries); return res; } 10632 auto toTitleSimpleIndexTrie() { static immutable res = asTrie(toTitleSimpleIndexTrieEntries); return res; } 10633 10634 } 10635 10636 }// version (!std_uni_bootstrap)