1 // Written in the D programming language. 2 // Regular Expressions. 3 4 /** 5 * $(RED Deprecated. It will be removed in February 2012. 6 * Please use $(LINK2 std_regex.html, std.regex) instead.) 7 * 8 * $(LINK2 http://www.digitalmars.com/ctg/regular.html, Regular 9 * expressions) are a powerful method of string pattern matching. The 10 * regular expression language used in this library is the same as 11 * that commonly used, however, some of the very advanced forms may 12 * behave slightly differently. The standard observed is the $(WEB 13 * www.ecma-international.org/publications/standards/Ecma-262.htm, 14 * ECMA standard) for regular expressions. 15 * 16 * undead.regexp is designed to work only with valid UTF strings as input. 17 * To validate untrusted input, use std.utf.validate(). 18 * 19 * In the following guide, $(I pattern)[] refers to a 20 * $(LINK2 http://www.digitalmars.com/ctg/regular.html, regular expression). 21 * The $(I attributes)[] refers to 22 * a string controlling the interpretation 23 * of the regular expression. 24 * It consists of a sequence of one or more 25 * of the following characters: 26 * 27 * <table border=1 cellspacing=0 cellpadding=5> 28 * <caption>Attribute Characters</caption> 29 * $(TR $(TH Attribute) $(TH Action)) 30 * <tr> 31 * $(TD $(B g)) 32 * $(TD global; repeat over the whole input string) 33 * </tr> 34 * <tr> 35 * $(TD $(B i)) 36 * $(TD case insensitive) 37 * </tr> 38 * <tr> 39 * $(TD $(B m)) 40 * $(TD treat as multiple lines separated by newlines) 41 * </tr> 42 * </table> 43 * 44 * The $(I format)[] string has the formatting characters: 45 * 46 * <table border=1 cellspacing=0 cellpadding=5> 47 * <caption>Formatting Characters</caption> 48 * $(TR $(TH Format) $(TH Replaced With)) 49 * $(TR 50 * $(TD $(B $$)) $(TD $) 51 * ) 52 * $(TR 53 * $(TD $(B $&)) $(TD The matched substring.) 54 * ) 55 * $(TR 56 * $(TD $(B $`)) $(TD The portion of string that precedes the matched substring.) 57 * ) 58 * $(TR 59 * $(TD $(B $')) $(TD The portion of string that follows the matched substring.) 60 * ) 61 * $(TR 62 * $(TD $(B $(DOLLAR))$(I n)) $(TD The $(I n)th capture, where $(I n) 63 * is a single digit 1-9 64 * and $$(I n) is not followed by a decimal digit.) 65 * ) 66 * $(TR 67 * $(TD $(B $(DOLLAR))$(I nn)) $(TD The $(I nn)th capture, where $(I nn) 68 * is a two-digit decimal 69 * number 01-99. 70 * If $(I nn)th capture is undefined or more than the number 71 * of parenthesized subexpressions, use the empty 72 * string instead.) 73 * ) 74 * </table> 75 * 76 * Any other $ are left as is. 77 * 78 * References: 79 * $(LINK2 http://en.wikipedia.org/wiki/Regular_expressions, Wikipedia) 80 * Macros: 81 * WIKI = StdRegexp 82 * DOLLAR = $ 83 * 84 * Copyright: Copyright Digital Mars 2000 - 2011. 85 * License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>. 86 * Authors: $(WEB digitalmars.com, Walter Bright) 87 * Source: $(PHOBOSSRC std/_regexp.d) 88 */ 89 /* Copyright Digital Mars 2000 - 2011. 90 * Distributed under the Boost Software License, Version 1.0. 91 * (See accompanying file LICENSE_1_0.txt or copy at 92 * http://www.boost.org/LICENSE_1_0.txt) 93 */ 94 95 /* 96 Escape sequences: 97 98 \nnn starts out a 1, 2 or 3 digit octal sequence, 99 where n is an octal digit. If nnn is larger than 100 0377, then the 3rd digit is not part of the sequence 101 and is not consumed. 102 For maximal portability, use exactly 3 digits. 103 104 \xXX starts out a 1 or 2 digit hex sequence. X 105 is a hex character. If the first character after the \x 106 is not a hex character, the value of the sequence is 'x' 107 and the XX are not consumed. 108 For maximal portability, use exactly 2 digits. 109 110 \uUUUU is a unicode sequence. There are exactly 111 4 hex characters after the \u, if any are not, then 112 the value of the sequence is 'u', and the UUUU are not 113 consumed. 114 115 Character classes: 116 117 [a-b], where a is greater than b, will produce 118 an error. 119 120 References: 121 122 http://www.unicode.org/unicode/reports/tr18/ 123 */ 124 125 module undead.regexp; 126 127 //pragma(msg, "Notice: As of Phobos 2.055, std.regexp has been deprecated. " ~ 128 // "It will be removed in February 2012. Please use std.regex instead."); 129 130 //debug = regexp; // uncomment to turn on debugging printf's 131 132 private 133 { 134 import core.stdc.stdio; 135 import core.stdc.stdlib; 136 import core.stdc.string; 137 import std.array; 138 import std.stdio; 139 import std.string; 140 import std.ascii; 141 import std.outbuffer; 142 import std.bitmanip; 143 import std.utf; 144 import std.algorithm; 145 import std.array; 146 import std.traits; 147 } 148 149 //deprecated: 150 151 /** Regular expression to extract an _email address. 152 * References: 153 * $(LINK2 http://www.regular-expressions.info/email.html, How to Find or Validate an Email Address)$(BR) 154 * $(LINK2 http://tools.ietf.org/html/rfc2822#section-3.4.1, RFC 2822 Internet Message Format) 155 */ 156 string email = 157 r"[a-zA-Z]([.]?([[a-zA-Z0-9_]-]+)*)?@([[a-zA-Z0-9_]\-_]+\.)+[a-zA-Z]{2,6}"; 158 159 /** Regular expression to extract a _url */ 160 string url = r"(([h|H][t|T]|[f|F])[t|T][p|P]([s|S]?)\:\/\/|~/|/)?([\w]+:\w+@)?(([a-zA-Z]{1}([\w\-]+\.)+([\w]{2,5}))(:[\d]{1,5})?)?((/?\w+/)+|/?)(\w+\.[\w]{3,4})?([,]\w+)*((\?\w+=\w+)?(&\w+=\w+)*([,]\w*)*)?"; 161 162 /************************************ 163 * One of these gets thrown on compilation errors 164 */ 165 166 class RegExpException : Exception 167 { 168 this(string msg) 169 { 170 super(msg); 171 } 172 } 173 174 struct regmatch_t 175 { 176 sizediff_t rm_so; // index of start of match 177 sizediff_t rm_eo; // index past end of match 178 } 179 180 private alias char rchar; // so we can make a wchar version 181 182 /****************************************************** 183 * Search string for matches with regular expression 184 * pattern with attributes. 185 * Replace each match with string generated from format. 186 * Params: 187 * s = String to search. 188 * pattern = Regular expression pattern. 189 * format = Replacement string format. 190 * attributes = Regular expression attributes. 191 * Returns: 192 * the resulting string 193 * Example: 194 * Replace the letters 'a' with the letters 'ZZ'. 195 * --- 196 * s = "Strap a rocket engine on a chicken." 197 * sub(s, "a", "ZZ") // result: StrZZp a rocket engine on a chicken. 198 * sub(s, "a", "ZZ", "g") // result: StrZZp ZZ rocket engine on ZZ chicken. 199 * --- 200 * The replacement format can reference the matches using 201 * the $&, $$, $', $`, $0 .. $99 notation: 202 * --- 203 * sub(s, "[ar]", "[$&]", "g") // result: St[r][a]p [a] [r]ocket engine on [a] chi 204 * --- 205 */ 206 207 string sub(string s, string pattern, string format, string attributes = null) 208 { 209 auto r = new RegExp(pattern, attributes); 210 auto result = r.replace(s, format); 211 r.destroy(); 212 return result; 213 } 214 215 unittest 216 { 217 debug(regexp) printf("regexp.sub.unittest\n"); 218 219 string r = sub("hello", "ll", "ss"); 220 assert(r == "hesso"); 221 } 222 223 /******************************************************* 224 * Search string for matches with regular expression 225 * pattern with attributes. 226 * Pass each match to delegate dg. 227 * Replace each match with the return value from dg. 228 * Params: 229 * s = String to search. 230 * pattern = Regular expression pattern. 231 * dg = Delegate 232 * attributes = Regular expression attributes. 233 * Returns: the resulting string. 234 * Example: 235 * Capitalize the letters 'a' and 'r': 236 * --- 237 * s = "Strap a rocket engine on a chicken."; 238 * sub(s, "[ar]", 239 * delegate char[] (RegExp m) 240 * { 241 * return toUpper(m[0]); 242 * }, 243 * "g"); // result: StRAp A Rocket engine on A chicken. 244 * --- 245 */ 246 247 string sub(string s, string pattern, string delegate(RegExp) dg, string attributes = null) 248 { 249 auto r = new RegExp(pattern, attributes); 250 251 string result = s; 252 size_t lastindex = 0; 253 size_t offset = 0; 254 255 while (r.test(s, lastindex)) 256 { 257 auto so = r.pmatch[0].rm_so; 258 auto eo = r.pmatch[0].rm_eo; 259 260 string replacement = dg(r); 261 262 // Optimize by using std.string.replace if possible - Dave Fladebo 263 string slice = result[offset + so .. offset + eo]; 264 if (r.attributes & RegExp.REA.global && // global, so replace all 265 !(r.attributes & RegExp.REA.ignoreCase) && // not ignoring case 266 !(r.attributes & RegExp.REA.multiline) && // not multiline 267 pattern == slice) // simple pattern (exact match, no special characters) 268 { 269 debug(regexp) 270 printf("result: %.*s, pattern: %.*s, slice: %.*s, replacement: %.*s\n", 271 result.length, result.ptr, 272 pattern.length, pattern.ptr, 273 slice.length, slice.ptr, 274 replacement.length, replacement.ptr); 275 result = replace(result,slice,replacement); 276 break; 277 } 278 279 result = replaceSlice(result, result[offset + so .. offset + eo], replacement); 280 281 if (r.attributes & RegExp.REA.global) 282 { 283 offset += replacement.length - (eo - so); 284 285 if (lastindex == eo) 286 lastindex++; // always consume some source 287 else 288 lastindex = eo; 289 } 290 else 291 break; 292 } 293 r.destroy(); 294 295 return result; 296 } 297 298 unittest 299 { 300 debug(regexp) printf("regexp.sub.unittest\n"); 301 302 string foo(RegExp r) { return "ss"; } 303 304 auto r = sub("hello", "ll", delegate string(RegExp r) { return "ss"; }); 305 assert(r == "hesso"); 306 307 r = sub("hello", "l", delegate string(RegExp r) { return "l"; }, "g"); 308 assert(r == "hello"); 309 310 auto s = sub("Strap a rocket engine on a chicken.", 311 "[ar]", 312 delegate string (RegExp m) 313 { 314 return std..string.toUpper(m[0]); 315 }, 316 "g"); 317 assert(s == "StRAp A Rocket engine on A chicken."); 318 } 319 320 321 /************************************************* 322 * Search $(D_PARAM s[]) for first match with $(D_PARAM pattern). 323 * Params: 324 * s = String to search. 325 * pattern = Regular expression pattern. 326 * Returns: 327 * index into s[] of match if found, -1 if no match. 328 * Example: 329 * --- 330 * auto s = "abcabcabab"; 331 * find(s, RegExp("b")); // match, returns 1 332 * find(s, RegExp("f")); // no match, returns -1 333 * --- 334 */ 335 336 sizediff_t find(string s, RegExp pattern) 337 { 338 return pattern.test(s) 339 ? pattern.pmatch[0].rm_so 340 : -1; 341 } 342 343 unittest 344 { 345 debug(regexp) printf("regexp.find.unittest\n"); 346 347 auto i = find("xabcy", RegExp("abc")); 348 assert(i == 1); 349 i = find("cba", RegExp("abc")); 350 assert(i == -1); 351 } 352 353 /** 354 Returns: 355 356 Same as $(D_PARAM find(s, RegExp(pattern, attributes))). 357 358 WARNING: 359 360 This function is scheduled for deprecation due to unnecessary 361 ambiguity with the homonym function in std.string. Instead of 362 $(D_PARAM undead.regexp.find(s, p, a)), you may want to use $(D_PARAM 363 find(s, RegExp(p, a))). 364 */ 365 366 sizediff_t 367 find(string s, string pattern, string attributes = null) 368 { 369 auto r = new RegExp(pattern, attributes); 370 scope(exit) r.destroy(); 371 return r.test(s) ? r.pmatch[0].rm_so : -1; 372 } 373 374 unittest 375 { 376 debug(regexp) printf("regexp.find.unittest\n"); 377 378 auto i = find("xabcy", "abc"); 379 assert(i == 1); 380 i = find("cba", "abc"); 381 assert(i == -1); 382 } 383 384 /************************************************* 385 * Search $(D_PARAM s[]) for last match with $(D_PARAM pattern). 386 * Params: 387 * s = String to search. 388 * pattern = Regular expression pattern. 389 * Returns: 390 * index into s[] of match if found, -1 if no match. 391 * Example: 392 * --- 393 * auto s = "abcabcabab"; 394 * rfind(s, RegExp("b")); // match, returns 9 395 * rfind(s, RegExp("f")); // no match, returns -1 396 * --- 397 */ 398 399 sizediff_t rfind(string s, RegExp pattern) 400 { 401 sizediff_t i = -1, lastindex = 0; 402 403 while (pattern.test(s, lastindex)) 404 { 405 auto eo = pattern.pmatch[0].rm_eo; 406 i = pattern.pmatch[0].rm_so; 407 if (lastindex == eo) 408 lastindex++; // always consume some source 409 else 410 lastindex = eo; 411 } 412 return i; 413 } 414 415 unittest 416 { 417 sizediff_t i; 418 419 debug(regexp) printf("regexp.rfind.unittest\n"); 420 i = rfind("abcdefcdef", RegExp("c")); 421 assert(i == 6); 422 i = rfind("abcdefcdef", RegExp("cd")); 423 assert(i == 6); 424 i = rfind("abcdefcdef", RegExp("x")); 425 assert(i == -1); 426 i = rfind("abcdefcdef", RegExp("xy")); 427 assert(i == -1); 428 i = rfind("abcdefcdef", RegExp("")); 429 assert(i == 10); 430 } 431 432 /************************************************* 433 Returns: 434 435 Same as $(D_PARAM rfind(s, RegExp(pattern, attributes))). 436 437 WARNING: 438 439 This function is scheduled for deprecation due to unnecessary 440 ambiguity with the homonym function in std.string. Instead of 441 $(D_PARAM undead.regexp.rfind(s, p, a)), you may want to use $(D_PARAM 442 rfind(s, RegExp(p, a))). 443 */ 444 445 sizediff_t 446 rfind(string s, string pattern, string attributes = null) 447 { 448 typeof(return) i = -1, lastindex = 0; 449 450 auto r = new RegExp(pattern, attributes); 451 while (r.test(s, lastindex)) 452 { 453 auto eo = r.pmatch[0].rm_eo; 454 i = r.pmatch[0].rm_so; 455 if (lastindex == eo) 456 lastindex++; // always consume some source 457 else 458 lastindex = eo; 459 } 460 r.destroy(); 461 return i; 462 } 463 464 unittest 465 { 466 sizediff_t i; 467 468 debug(regexp) printf("regexp.rfind.unittest\n"); 469 i = rfind("abcdefcdef", "c"); 470 assert(i == 6); 471 i = rfind("abcdefcdef", "cd"); 472 assert(i == 6); 473 i = rfind("abcdefcdef", "x"); 474 assert(i == -1); 475 i = rfind("abcdefcdef", "xy"); 476 assert(i == -1); 477 i = rfind("abcdefcdef", ""); 478 assert(i == 10); 479 } 480 481 482 /******************************************** 483 * Split s[] into an array of strings, using the regular 484 * expression $(D_PARAM pattern) as the separator. 485 * Params: 486 * s = String to search. 487 * pattern = Regular expression pattern. 488 * Returns: 489 * array of slices into s[] 490 * Example: 491 * --- 492 * foreach (s; split("abcabcabab", RegExp("C.", "i"))) 493 * { 494 * writefln("s = '%s'", s); 495 * } 496 * // Prints: 497 * // s = 'ab' 498 * // s = 'b' 499 * // s = 'bab' 500 * --- 501 */ 502 503 string[] split(string s, RegExp pattern) 504 { 505 return pattern.split(s); 506 } 507 508 unittest 509 { 510 debug(regexp) printf("regexp.split.unittest()\n"); 511 string[] result; 512 513 result = split("ab", RegExp("a*")); 514 assert(result.length == 2); 515 assert(result[0] == ""); 516 assert(result[1] == "b"); 517 518 foreach (i, s; split("abcabcabab", RegExp("C.", "i"))) 519 { 520 //writefln("s[%d] = '%s'", i, s); 521 if (i == 0) assert(s == "ab"); 522 else if (i == 1) assert(s == "b"); 523 else if (i == 2) assert(s == "bab"); 524 else assert(0); 525 } 526 } 527 528 /******************************************** 529 Returns: 530 Same as $(D_PARAM split(s, RegExp(pattern, attributes))). 531 532 WARNING: 533 534 This function is scheduled for deprecation due to unnecessary 535 ambiguity with the homonym function in std.string. Instead of 536 $(D_PARAM undead.regexp.split(s, p, a)), you may want to use $(D_PARAM 537 split(s, RegExp(p, a))). 538 */ 539 540 string[] split(string s, string pattern, string attributes = null) 541 { 542 auto r = new RegExp(pattern, attributes); 543 auto result = r.split(s); 544 r.destroy(); 545 return result; 546 } 547 548 unittest 549 { 550 debug(regexp) printf("regexp.split.unittest()\n"); 551 string[] result; 552 553 result = split("ab", "a*"); 554 assert(result.length == 2); 555 assert(result[0] == ""); 556 assert(result[1] == "b"); 557 558 foreach (i, s; split("abcabcabab", "C.", "i")) 559 { 560 //writefln("s[%d] = '%s'", i, s.length, s.ptr); 561 if (i == 0) assert(s == "ab"); 562 else if (i == 1) assert(s == "b"); 563 else if (i == 2) assert(s == "bab"); 564 else assert(0); 565 } 566 } 567 568 /**************************************************** 569 * Search s[] for first match with pattern[] with attributes[]. 570 * Params: 571 * s = String to search. 572 * pattern = Regular expression pattern. 573 * attributes = Regular expression attributes. 574 * Returns: 575 * corresponding RegExp if found, null if not. 576 * Example: 577 * --- 578 * import std.stdio; 579 * import undead.regexp; 580 * 581 * void main() 582 * { 583 * if (auto m = undead.regexp.search("abcdef", "c")) 584 * { 585 * writefln("%s[%s]%s", m.pre, m[0], m.post); 586 * } 587 * } 588 * // Prints: 589 * // ab[c]def 590 * --- 591 */ 592 593 RegExp search(string s, string pattern, string attributes = null) 594 { 595 auto r = new RegExp(pattern, attributes); 596 if (!r.test(s)) 597 { 598 r.destroy(); 599 r = null; 600 assert(r is null); 601 } 602 return r; 603 } 604 605 unittest 606 { 607 debug(regexp) printf("regexp.string.unittest()\n"); 608 609 if (auto m = undead.regexp.search("abcdef", "c()")) 610 { 611 auto result = std..string.format("%s[%s]%s", m.pre, m[0], m.post); 612 assert(result == "ab[c]def"); 613 assert(m[1] == null); 614 assert(m[2] == null); 615 } 616 else 617 assert(0); 618 619 if (auto n = undead.regexp.search("abcdef", "g")) 620 { 621 assert(0); 622 } 623 } 624 625 /* ********************************* RegExp ******************************** */ 626 627 /***************************** 628 * RegExp is a class to handle regular expressions. 629 * 630 * It is the core foundation for adding powerful string pattern matching 631 * capabilities to programs like grep, text editors, awk, sed, etc. 632 */ 633 class RegExp 634 { 635 /***** 636 * Construct a RegExp object. Compile pattern 637 * with <i>attributes</i> into 638 * an internal form for fast execution. 639 * Params: 640 * pattern = regular expression 641 * attributes = _attributes 642 * Throws: RegExpException if there are any compilation errors. 643 * Example: 644 * Declare two variables and assign to them a RegExp object: 645 * --- 646 * auto r = new RegExp("pattern"); 647 * auto s = new RegExp(r"p[1-5]\s*"); 648 * --- 649 */ 650 public this(string pattern, string attributes = null) 651 { 652 pmatch = (&gmatch)[0 .. 1]; 653 compile(pattern, attributes); 654 } 655 656 /***** 657 * Generate instance of RegExp. 658 * Params: 659 * pattern = regular expression 660 * attributes = _attributes 661 * Throws: RegExpException if there are any compilation errors. 662 * Example: 663 * Declare two variables and assign to them a RegExp object: 664 * --- 665 * auto r = RegExp("pattern"); 666 * auto s = RegExp(r"p[1-5]\s*"); 667 * --- 668 */ 669 public static RegExp opCall(string pattern, string attributes = null) 670 { 671 return new RegExp(pattern, attributes); 672 } 673 674 unittest 675 { 676 debug(regexp) printf("regexp.opCall.unittest()\n"); 677 auto r1 = RegExp("hello", "m"); 678 string msg; 679 try 680 { 681 auto r2 = RegExp("hello", "q"); 682 assert(0); 683 } 684 catch (RegExpException ree) 685 { 686 msg = ree.toString(); 687 //writefln("message: %s", ree); 688 } 689 assert(std.algorithm.countUntil(msg, "unrecognized attribute") >= 0); 690 } 691 692 /************************************ 693 * Set up for start of foreach loop. 694 * Returns: 695 * search() returns instance of RegExp set up to _search string[]. 696 * Example: 697 * --- 698 * import std.stdio; 699 * import undead.regexp; 700 * 701 * void main() 702 * { 703 * foreach(m; RegExp("ab").search("abcabcabab")) 704 * { 705 * writefln("%s[%s]%s", m.pre, m[0], m.post); 706 * } 707 * } 708 * // Prints: 709 * // [ab]cabcabab 710 * // abc[ab]cabab 711 * // abcabc[ab]ab 712 * // abcabcab[ab] 713 * --- 714 */ 715 716 public RegExp search(string string) 717 { 718 input = string; 719 pmatch[0].rm_eo = 0; 720 return this; 721 } 722 723 /** ditto */ 724 public int opApply(scope int delegate(ref RegExp) dg) 725 { 726 int result; 727 RegExp r = this; 728 729 while (test()) 730 { 731 result = dg(r); 732 if (result) 733 break; 734 } 735 736 return result; 737 } 738 739 unittest 740 { 741 debug(regexp) printf("regexp.search.unittest()\n"); 742 743 int i; 744 foreach(m; RegExp("ab").search("abcabcabab")) 745 { 746 auto s = std..string.format("%s[%s]%s", m.pre, m[0], m.post); 747 if (i == 0) assert(s == "[ab]cabcabab"); 748 else if (i == 1) assert(s == "abc[ab]cabab"); 749 else if (i == 2) assert(s == "abcabc[ab]ab"); 750 else if (i == 3) assert(s == "abcabcab[ab]"); 751 else assert(0); 752 i++; 753 } 754 } 755 756 /****************** 757 * Retrieve match n. 758 * 759 * n==0 means the matched substring, n>0 means the 760 * n'th parenthesized subexpression. 761 * if n is larger than the number of parenthesized subexpressions, 762 * null is returned. 763 */ 764 public string opIndex(size_t n) 765 { 766 if (n >= pmatch.length) 767 return null; 768 else 769 { 770 auto rm_so = pmatch[n].rm_so; 771 auto rm_eo = pmatch[n].rm_eo; 772 if (rm_so == rm_eo) 773 return null; 774 return input[rm_so .. rm_eo]; 775 } 776 } 777 778 /** 779 Same as $(D_PARAM opIndex(n)). 780 781 WARNING: 782 783 Scheduled for deprecation due to confusion with overloaded 784 $(D_PARAM match(string)). Instead of $(D_PARAM regex.match(n)) 785 you may want to use $(D_PARAM regex[n]). 786 */ 787 public string match(size_t n) 788 { 789 return this[n]; 790 } 791 792 /******************* 793 * Return the slice of the input that precedes the matched substring. 794 */ 795 public @property string pre() 796 { 797 return input[0 .. pmatch[0].rm_so]; 798 } 799 800 /******************* 801 * Return the slice of the input that follows the matched substring. 802 */ 803 public @property string post() 804 { 805 return input[pmatch[0].rm_eo .. $]; 806 } 807 808 uint re_nsub; // number of parenthesized subexpression matches 809 regmatch_t[] pmatch; // array [re_nsub + 1] 810 811 string input; // the string to search 812 813 // per instance: 814 815 string pattern; // source text of the regular expression 816 817 string flags; // source text of the attributes parameter 818 819 int errors; 820 821 uint attributes; 822 823 enum REA 824 { 825 global = 1, // has the g attribute 826 ignoreCase = 2, // has the i attribute 827 multiline = 4, // if treat as multiple lines separated 828 // by newlines, or as a single line 829 dotmatchlf = 8, // if . matches \n 830 } 831 832 833 private: 834 size_t src; // current source index in input[] 835 size_t src_start; // starting index for match in input[] 836 size_t p; // position of parser in pattern[] 837 regmatch_t gmatch; // match for the entire regular expression 838 // (serves as storage for pmatch[0]) 839 840 const(ubyte)[] program; // pattern[] compiled into regular expression program 841 OutBuffer buf; 842 843 844 845 846 /******************************************/ 847 848 // Opcodes 849 850 enum : ubyte 851 { 852 REend, // end of program 853 REchar, // single character 854 REichar, // single character, case insensitive 855 REdchar, // single UCS character 856 REidchar, // single wide character, case insensitive 857 REanychar, // any character 858 REanystar, // ".*" 859 REstring, // string of characters 860 REistring, // string of characters, case insensitive 861 REtestbit, // any in bitmap, non-consuming 862 REbit, // any in the bit map 863 REnotbit, // any not in the bit map 864 RErange, // any in the string 865 REnotrange, // any not in the string 866 REor, // a | b 867 REplus, // 1 or more 868 REstar, // 0 or more 869 REquest, // 0 or 1 870 REnm, // n..m 871 REnmq, // n..m, non-greedy version 872 REbol, // beginning of line 873 REeol, // end of line 874 REparen, // parenthesized subexpression 875 REgoto, // goto offset 876 877 REwordboundary, 878 REnotwordboundary, 879 REdigit, 880 REnotdigit, 881 REspace, 882 REnotspace, 883 REword, 884 REnotword, 885 REbackref, 886 }; 887 888 // BUG: should this include '$'? 889 private int isword(dchar c) { return isAlphaNum(c) || c == '_'; } 890 891 private uint inf = ~0u; 892 893 /* ******************************** 894 * Throws RegExpException on error 895 */ 896 897 public void compile(string pattern, string attributes) 898 { 899 //printf("RegExp.compile('%.*s', '%.*s')\n", pattern.length, pattern.ptr, attributes.length, attributes.ptr); 900 901 this.attributes = 0; 902 foreach (rchar c; attributes) 903 { REA att; 904 905 switch (c) 906 { 907 case 'g': att = REA.global; break; 908 case 'i': att = REA.ignoreCase; break; 909 case 'm': att = REA.multiline; break; 910 default: 911 error("unrecognized attribute"); 912 return; 913 } 914 if (this.attributes & att) 915 { error("redundant attribute"); 916 return; 917 } 918 this.attributes |= att; 919 } 920 921 input = null; 922 923 this.pattern = pattern; 924 this.flags = attributes; 925 926 uint oldre_nsub = re_nsub; 927 re_nsub = 0; 928 errors = 0; 929 930 buf = new OutBuffer(); 931 buf.reserve(pattern.length * 8); 932 p = 0; 933 parseRegexp(); 934 if (p < pattern.length) 935 { error("unmatched ')'"); 936 } 937 // @@@ SKIPPING OPTIMIZATION SOLVES BUG 941 @@@ 938 //optimize(); 939 program = buf.data; 940 buf.data = null; 941 buf.destroy(); 942 943 if (re_nsub > oldre_nsub) 944 { 945 if (pmatch.ptr is &gmatch) 946 pmatch = null; 947 pmatch.length = re_nsub + 1; 948 } 949 pmatch[0].rm_so = 0; 950 pmatch[0].rm_eo = 0; 951 } 952 953 /******************************************** 954 * Split s[] into an array of strings, using the regular 955 * expression as the separator. 956 * Returns: 957 * array of slices into s[] 958 */ 959 960 public string[] split(string s) 961 { 962 debug(regexp) printf("regexp.split()\n"); 963 964 string[] result; 965 966 if (s.length) 967 { 968 sizediff_t p, q; 969 for (q = p; q != s.length;) 970 { 971 if (test(s, q)) 972 { 973 q = pmatch[0].rm_so; 974 auto e = pmatch[0].rm_eo; 975 if (e != p) 976 { 977 result ~= s[p .. q]; 978 for (size_t i = 1; i < pmatch.length; i++) 979 { 980 auto so = pmatch[i].rm_so; 981 auto eo = pmatch[i].rm_eo; 982 if (so == eo) 983 { so = 0; // -1 gives array bounds error 984 eo = 0; 985 } 986 result ~= s[so .. eo]; 987 } 988 q = p = e; 989 continue; 990 } 991 } 992 q++; 993 } 994 result ~= s[p .. s.length]; 995 } 996 else if (!test(s)) 997 result ~= s; 998 return result; 999 } 1000 1001 unittest 1002 { 1003 debug(regexp) printf("regexp.split.unittest()\n"); 1004 1005 auto r = new RegExp("a*?", null); 1006 string[] result; 1007 string j; 1008 int i; 1009 1010 result = r.split("ab"); 1011 1012 assert(result.length == 2); 1013 i = (result[0] == "a"); 1014 assert(i == 1); 1015 i = (result[1] == "b"); 1016 assert(i == 1); 1017 1018 r = new RegExp("a*", null); 1019 result = r.split("ab"); 1020 assert(result.length == 2); 1021 i = (result[0] == ""); 1022 assert(i == 1); 1023 i = (result[1] == "b"); 1024 assert(i == 1); 1025 1026 r = new RegExp("<(\\/)?([^<>]+)>", null); 1027 result = r.split("a<b>font</b>bar<TAG>hello</TAG>"); 1028 1029 debug(regexp) 1030 { 1031 for (i = 0; i < result.length; i++) 1032 printf("result[%d] = '%.*s'\n", i, result[i].length, result[i].ptr); 1033 } 1034 1035 j = join(result, ","); 1036 //printf("j = '%.*s'\n", j.length, j.ptr); 1037 i = (j == "a,,b,font,/,b,bar,,TAG,hello,/,TAG,"); 1038 assert(i == 1); 1039 1040 r = new RegExp("a[bc]", null); 1041 result = r.match("123ab"); 1042 j = join(result, ","); 1043 i = (j == "ab"); 1044 assert(i == 1); 1045 1046 result = r.match("ac"); 1047 j = join(result, ","); 1048 i = (j == "ac"); 1049 assert(i == 1); 1050 } 1051 1052 /************************************************* 1053 * Search string[] for match with regular expression. 1054 * Returns: 1055 * index of match if successful, -1 if not found 1056 */ 1057 1058 public sizediff_t find(string string) 1059 { 1060 if (test(string)) 1061 return pmatch[0].rm_so; 1062 else 1063 return -1; // no match 1064 } 1065 1066 //deprecated alias find search; 1067 1068 unittest 1069 { 1070 debug(regexp) printf("regexp.find.unittest()\n"); 1071 1072 RegExp r = new RegExp("abc", null); 1073 auto i = r.find("xabcy"); 1074 assert(i == 1); 1075 i = r.find("cba"); 1076 assert(i == -1); 1077 } 1078 1079 1080 /************************************************* 1081 * Search s[] for match. 1082 * Returns: 1083 * If global attribute, return same value as exec(s). 1084 * If not global attribute, return array of all matches. 1085 */ 1086 1087 public string[] match(string s) 1088 { 1089 string[] result; 1090 1091 if (attributes & REA.global) 1092 { 1093 sizediff_t lastindex = 0; 1094 1095 while (test(s, lastindex)) 1096 { 1097 auto eo = pmatch[0].rm_eo; 1098 1099 result ~= input[pmatch[0].rm_so .. eo]; 1100 if (lastindex == eo) 1101 lastindex++; // always consume some source 1102 else 1103 lastindex = eo; 1104 } 1105 } 1106 else 1107 { 1108 result = exec(s); 1109 } 1110 return result; 1111 } 1112 1113 unittest 1114 { 1115 debug(regexp) printf("regexp.match.unittest()\n"); 1116 1117 int i; 1118 string[] result; 1119 string j; 1120 RegExp r; 1121 1122 r = new RegExp("a[bc]", null); 1123 result = r.match("1ab2ac3"); 1124 j = join(result, ","); 1125 i = (j == "ab"); 1126 assert(i == 1); 1127 1128 r = new RegExp("a[bc]", "g"); 1129 result = r.match("1ab2ac3"); 1130 j = join(result, ","); 1131 i = (j == "ab,ac"); 1132 assert(i == 1); 1133 } 1134 1135 1136 /************************************************* 1137 * Find regular expression matches in s[]. Replace those matches 1138 * with a new string composed of format[] merged with the result of the 1139 * matches. 1140 * If global, replace all matches. Otherwise, replace first match. 1141 * Returns: the new string 1142 */ 1143 1144 public string replace(string s, string format) 1145 { 1146 debug(regexp) printf("string = %.*s, format = %.*s\n", s.length, s.ptr, format.length, format.ptr); 1147 1148 string result = s; 1149 sizediff_t lastindex = 0; 1150 size_t offset = 0; 1151 1152 for (;;) 1153 { 1154 if (!test(s, lastindex)) 1155 break; 1156 1157 auto so = pmatch[0].rm_so; 1158 auto eo = pmatch[0].rm_eo; 1159 1160 string replacement = replace(format); 1161 1162 // Optimize by using replace if possible - Dave Fladebo 1163 string slice = result[offset + so .. offset + eo]; 1164 if (attributes & REA.global && // global, so replace all 1165 !(attributes & REA.ignoreCase) && // not ignoring case 1166 !(attributes & REA.multiline) && // not multiline 1167 pattern == slice && // simple pattern (exact match, no special characters) 1168 format == replacement) // simple format, not $ formats 1169 { 1170 debug(regexp) 1171 { 1172 auto sss = result[offset + so .. offset + eo]; 1173 printf("pattern: %.*s, slice: %.*s, format: %.*s, replacement: %.*s\n", 1174 pattern.length, pattern.ptr, sss.length, sss.ptr, format.length, format.ptr, replacement.length, replacement.ptr); 1175 } 1176 result = std.array.replace(result,slice,replacement); 1177 break; 1178 } 1179 1180 result = replaceSlice(result, result[offset + so .. offset + eo], replacement); 1181 1182 if (attributes & REA.global) 1183 { 1184 offset += replacement.length - (eo - so); 1185 1186 if (lastindex == eo) 1187 lastindex++; // always consume some source 1188 else 1189 lastindex = eo; 1190 } 1191 else 1192 break; 1193 } 1194 1195 return result; 1196 } 1197 1198 unittest 1199 { 1200 debug(regexp) printf("regexp.replace.unittest()\n"); 1201 1202 int i; 1203 string result; 1204 RegExp r; 1205 1206 r = new RegExp("a[bc]", "g"); 1207 result = r.replace("1ab2ac3", "x$&y"); 1208 i = (result == "1xaby2xacy3"); 1209 assert(i == 1); 1210 1211 r = new RegExp("ab", "g"); 1212 result = r.replace("1ab2ac3", "xy"); 1213 i = (result == "1xy2ac3"); 1214 assert(i == 1); 1215 } 1216 1217 1218 /************************************************* 1219 * Search string[] for match. 1220 * Returns: 1221 * array of slices into string[] representing matches 1222 */ 1223 1224 public string[] exec(string s) 1225 { 1226 debug(regexp) printf("regexp.exec(string = '%.*s')\n", s.length, s.ptr); 1227 input = s; 1228 pmatch[0].rm_so = 0; 1229 pmatch[0].rm_eo = 0; 1230 return exec(); 1231 } 1232 1233 /************************************************* 1234 * Pick up where last exec(string) or exec() left off, 1235 * searching string[] for next match. 1236 * Returns: 1237 * array of slices into string[] representing matches 1238 */ 1239 1240 public string[] exec() 1241 { 1242 if (!test()) 1243 return null; 1244 1245 auto result = new string[pmatch.length]; 1246 for (int i = 0; i < pmatch.length; i++) 1247 { 1248 if (pmatch[i].rm_so == pmatch[i].rm_eo) 1249 result[i] = null; 1250 else 1251 result[i] = input[pmatch[i].rm_so .. pmatch[i].rm_eo]; 1252 } 1253 1254 return result; 1255 } 1256 1257 /************************************************ 1258 * Search s[] for match. 1259 * Returns: 0 for no match, !=0 for match 1260 * Example: 1261 --- 1262 import std.stdio; 1263 import undead.regexp; 1264 import std.string; 1265 1266 int grep(int delegate(char[]) pred, char[][] list) 1267 { 1268 int count; 1269 foreach (s; list) 1270 { if (pred(s)) 1271 ++count; 1272 } 1273 return count; 1274 } 1275 1276 void main() 1277 { 1278 auto x = grep(&RegExp("[Ff]oo").test, 1279 std.string.split("mary had a foo lamb")); 1280 writefln(x); 1281 } 1282 --- 1283 * which prints: 1 1284 */ 1285 //@@@ 1286 public bool test(string s) 1287 { 1288 return test(s, 0 /*pmatch[0].rm_eo*/) != 0; 1289 } 1290 1291 /************************************************ 1292 * Pick up where last test(string) or test() left off, and search again. 1293 * Returns: 0 for no match, !=0 for match 1294 */ 1295 1296 public int test() 1297 { 1298 return test(input, pmatch[0].rm_eo); 1299 } 1300 1301 /************************************************ 1302 * Test s[] starting at startindex against regular expression. 1303 * Returns: 0 for no match, !=0 for match 1304 */ 1305 1306 public int test(string s, size_t startindex) 1307 { 1308 char firstc; 1309 1310 input = s; 1311 debug (regexp) printf("RegExp.test(input[] = '%.*s', startindex = %zd)\n", input.length, input.ptr, startindex); 1312 pmatch[0].rm_so = 0; 1313 pmatch[0].rm_eo = 0; 1314 if (startindex < 0 || startindex > input.length) 1315 { 1316 return 0; // fail 1317 } 1318 //debug(regexp) printProgram(program); 1319 1320 // First character optimization 1321 firstc = 0; 1322 if (program[0] == REchar) 1323 { 1324 firstc = program[1]; 1325 if (attributes & REA.ignoreCase && isAlpha(firstc)) 1326 firstc = 0; 1327 } 1328 1329 for (auto si = startindex; ; si++) 1330 { 1331 if (firstc) 1332 { 1333 if (si == input.length) 1334 break; // no match 1335 if (input[si] != firstc) 1336 { 1337 si++; 1338 if (!chr(si, firstc)) // if first character not found 1339 break; // no match 1340 } 1341 } 1342 for (size_t i = 0; i < re_nsub + 1; i++) 1343 { 1344 pmatch[i].rm_so = -1; 1345 pmatch[i].rm_eo = -1; 1346 } 1347 src_start = src = si; 1348 if (trymatch(0, program.length)) 1349 { 1350 pmatch[0].rm_so = si; 1351 pmatch[0].rm_eo = src; 1352 //debug(regexp) printf("start = %d, end = %d\n", gmatch.rm_so, gmatch.rm_eo); 1353 return 1; 1354 } 1355 // If possible match must start at beginning, we are done 1356 if (program[0] == REbol || program[0] == REanystar) 1357 { 1358 if (attributes & REA.multiline) 1359 { 1360 // Scan for the next \n 1361 if (!chr(si, '\n')) 1362 break; // no match if '\n' not found 1363 } 1364 else 1365 break; 1366 } 1367 if (si == input.length) 1368 break; 1369 debug(regexp) 1370 { 1371 auto sss = input[si + 1 .. input.length]; 1372 printf("Starting new try: '%.*s'\n", sss.length, sss.ptr); 1373 } 1374 } 1375 return 0; // no match 1376 } 1377 1378 /** 1379 Returns whether string $(D_PARAM s) matches $(D_PARAM this). 1380 */ 1381 alias test opEquals; 1382 // bool opEquals(string s) 1383 // { 1384 // return test(s); 1385 // } 1386 1387 unittest 1388 { 1389 assert("abc" == RegExp(".b.")); 1390 assert("abc" != RegExp(".b..")); 1391 } 1392 1393 int chr(ref size_t si, rchar c) 1394 { 1395 for (; si < input.length; si++) 1396 { 1397 if (input[si] == c) 1398 return 1; 1399 } 1400 return 0; 1401 } 1402 1403 1404 void printProgram(const(ubyte)[] prog) 1405 { 1406 //debug(regexp) 1407 { 1408 size_t len; 1409 uint n; 1410 uint m; 1411 ushort *pu; 1412 uint *puint; 1413 char[] str; 1414 1415 printf("printProgram()\n"); 1416 for (size_t pc = 0; pc < prog.length; ) 1417 { 1418 printf("%3d: ", cast(int)pc); 1419 1420 //printf("prog[pc] = %d, REchar = %d, REnmq = %d\n", prog[pc], REchar, REnmq); 1421 switch (prog[pc]) 1422 { 1423 case REchar: 1424 printf("\tREchar '%c'\n", prog[pc + 1]); 1425 pc += 1 + char.sizeof; 1426 break; 1427 1428 case REichar: 1429 printf("\tREichar '%c'\n", prog[pc + 1]); 1430 pc += 1 + char.sizeof; 1431 break; 1432 1433 case REdchar: 1434 printf("\tREdchar '%c'\n", *cast(dchar *)&prog[pc + 1]); 1435 pc += 1 + dchar.sizeof; 1436 break; 1437 1438 case REidchar: 1439 printf("\tREidchar '%c'\n", *cast(dchar *)&prog[pc + 1]); 1440 pc += 1 + dchar.sizeof; 1441 break; 1442 1443 case REanychar: 1444 printf("\tREanychar\n"); 1445 pc++; 1446 break; 1447 1448 case REstring: 1449 len = *cast(size_t *)&prog[pc + 1]; 1450 str = (cast(char*)&prog[pc + 1 + size_t.sizeof])[0 .. len]; 1451 printf("\tREstring x%x, '%.*s'\n", cast(int)len, cast(int)str.length, str.ptr); 1452 pc += 1 + size_t.sizeof + len * rchar.sizeof; 1453 break; 1454 1455 case REistring: 1456 len = *cast(size_t *)&prog[pc + 1]; 1457 str = (cast(char*)&prog[pc + 1 + size_t.sizeof])[0 .. len]; 1458 printf("\tREistring x%x, '%.*s'\n", cast(int)len, cast(int)str.length, str.ptr); 1459 pc += 1 + size_t.sizeof + len * rchar.sizeof; 1460 break; 1461 1462 case REtestbit: 1463 pu = cast(ushort *)&prog[pc + 1]; 1464 printf("\tREtestbit %d, %d\n", pu[0], pu[1]); 1465 len = pu[1]; 1466 pc += 1 + 2 * ushort.sizeof + len; 1467 break; 1468 1469 case REbit: 1470 pu = cast(ushort *)&prog[pc + 1]; 1471 len = pu[1]; 1472 printf("\tREbit cmax=%02x, len=%d:", pu[0], cast(int)len); 1473 for (n = 0; n < len; n++) 1474 printf(" %02x", prog[pc + 1 + 2 * ushort.sizeof + n]); 1475 printf("\n"); 1476 pc += 1 + 2 * ushort.sizeof + len; 1477 break; 1478 1479 case REnotbit: 1480 pu = cast(ushort *)&prog[pc + 1]; 1481 printf("\tREnotbit %d, %d\n", pu[0], pu[1]); 1482 len = pu[1]; 1483 pc += 1 + 2 * ushort.sizeof + len; 1484 break; 1485 1486 case RErange: 1487 len = *cast(uint *)&prog[pc + 1]; 1488 printf("\tRErange %d\n", cast(int)len); 1489 // BUG: REAignoreCase? 1490 pc += 1 + uint.sizeof + len; 1491 break; 1492 1493 case REnotrange: 1494 len = *cast(uint *)&prog[pc + 1]; 1495 printf("\tREnotrange %d\n", cast(int)len); 1496 // BUG: REAignoreCase? 1497 pc += 1 + uint.sizeof + len; 1498 break; 1499 1500 case REbol: 1501 printf("\tREbol\n"); 1502 pc++; 1503 break; 1504 1505 case REeol: 1506 printf("\tREeol\n"); 1507 pc++; 1508 break; 1509 1510 case REor: 1511 len = *cast(uint *)&prog[pc + 1]; 1512 printf("\tREor %d, pc=>%d\n", cast(int)len, cast(int)(pc + 1 + uint.sizeof + len)); 1513 pc += 1 + uint.sizeof; 1514 break; 1515 1516 case REgoto: 1517 len = *cast(uint *)&prog[pc + 1]; 1518 printf("\tREgoto %d, pc=>%d\n", cast(int)len, cast(int)(pc + 1 + uint.sizeof + len)); 1519 pc += 1 + uint.sizeof; 1520 break; 1521 1522 case REanystar: 1523 printf("\tREanystar\n"); 1524 pc++; 1525 break; 1526 1527 case REnm: 1528 case REnmq: 1529 // len, n, m, () 1530 puint = cast(uint *)&prog[pc + 1]; 1531 len = puint[0]; 1532 n = puint[1]; 1533 m = puint[2]; 1534 printf("\tREnm%s len=%d, n=%u, m=%u, pc=>%d\n", 1535 (prog[pc] == REnmq) ? "q".ptr : " ".ptr, 1536 cast(int)len, n, m, cast(int)(pc + 1 + uint.sizeof * 3 + len)); 1537 pc += 1 + uint.sizeof * 3; 1538 break; 1539 1540 case REparen: 1541 // len, n, () 1542 puint = cast(uint *)&prog[pc + 1]; 1543 len = puint[0]; 1544 n = puint[1]; 1545 printf("\tREparen len=%d n=%d, pc=>%d\n", cast(int)len, n, cast(int)(pc + 1 + uint.sizeof * 2 + len)); 1546 pc += 1 + uint.sizeof * 2; 1547 break; 1548 1549 case REend: 1550 printf("\tREend\n"); 1551 return; 1552 1553 case REwordboundary: 1554 printf("\tREwordboundary\n"); 1555 pc++; 1556 break; 1557 1558 case REnotwordboundary: 1559 printf("\tREnotwordboundary\n"); 1560 pc++; 1561 break; 1562 1563 case REdigit: 1564 printf("\tREdigit\n"); 1565 pc++; 1566 break; 1567 1568 case REnotdigit: 1569 printf("\tREnotdigit\n"); 1570 pc++; 1571 break; 1572 1573 case REspace: 1574 printf("\tREspace\n"); 1575 pc++; 1576 break; 1577 1578 case REnotspace: 1579 printf("\tREnotspace\n"); 1580 pc++; 1581 break; 1582 1583 case REword: 1584 printf("\tREword\n"); 1585 pc++; 1586 break; 1587 1588 case REnotword: 1589 printf("\tREnotword\n"); 1590 pc++; 1591 break; 1592 1593 case REbackref: 1594 printf("\tREbackref %d\n", prog[1]); 1595 pc += 2; 1596 break; 1597 1598 default: 1599 assert(0); 1600 } 1601 } 1602 } 1603 } 1604 1605 1606 /************************************************** 1607 * Match input against a section of the program[]. 1608 * Returns: 1609 * 1 if successful match 1610 * 0 no match 1611 */ 1612 1613 int trymatch(size_t pc, size_t pcend) 1614 { 1615 size_t len; 1616 size_t n; 1617 size_t m; 1618 size_t count; 1619 size_t pop; 1620 size_t ss; 1621 regmatch_t *psave; 1622 size_t c1; 1623 size_t c2; 1624 ushort* pu; 1625 uint* puint; 1626 1627 debug(regexp) 1628 { 1629 auto sss = input[src .. input.length]; 1630 printf("RegExp.trymatch(pc = %zd, src = '%.*s', pcend = %zd)\n", pc, sss.length, sss.ptr, pcend); 1631 } 1632 auto srcsave = src; 1633 psave = null; 1634 for (;;) 1635 { 1636 if (pc == pcend) // if done matching 1637 { debug(regex) printf("\tprogend\n"); 1638 return 1; 1639 } 1640 1641 //printf("\top = %d\n", program[pc]); 1642 switch (program[pc]) 1643 { 1644 case REchar: 1645 if (src == input.length) 1646 goto Lnomatch; 1647 debug(regexp) printf("\tREchar '%c', src = '%c'\n", program[pc + 1], input[src]); 1648 if (program[pc + 1] != input[src]) 1649 goto Lnomatch; 1650 src++; 1651 pc += 1 + char.sizeof; 1652 break; 1653 1654 case REichar: 1655 if (src == input.length) 1656 goto Lnomatch; 1657 debug(regexp) printf("\tREichar '%c', src = '%c'\n", program[pc + 1], input[src]); 1658 c1 = program[pc + 1]; 1659 c2 = input[src]; 1660 if (c1 != c2) 1661 { 1662 if (isLower(cast(rchar)c2)) 1663 c2 = std.ascii.toUpper(cast(rchar)c2); 1664 else 1665 goto Lnomatch; 1666 if (c1 != c2) 1667 goto Lnomatch; 1668 } 1669 src++; 1670 pc += 1 + char.sizeof; 1671 break; 1672 1673 case REdchar: 1674 debug(regexp) printf("\tREdchar '%c', src = '%c'\n", *(cast(dchar *)&program[pc + 1]), input[src]); 1675 if (src == input.length) 1676 goto Lnomatch; 1677 if (*(cast(dchar *)&program[pc + 1]) != input[src]) 1678 goto Lnomatch; 1679 src++; 1680 pc += 1 + dchar.sizeof; 1681 break; 1682 1683 case REidchar: 1684 debug(regexp) printf("\tREidchar '%c', src = '%c'\n", *(cast(dchar *)&program[pc + 1]), input[src]); 1685 if (src == input.length) 1686 goto Lnomatch; 1687 c1 = *(cast(dchar *)&program[pc + 1]); 1688 c2 = input[src]; 1689 if (c1 != c2) 1690 { 1691 if (isLower(cast(rchar)c2)) 1692 c2 = std.ascii.toUpper(cast(rchar)c2); 1693 else 1694 goto Lnomatch; 1695 if (c1 != c2) 1696 goto Lnomatch; 1697 } 1698 src++; 1699 pc += 1 + dchar.sizeof; 1700 break; 1701 1702 case REanychar: 1703 debug(regexp) printf("\tREanychar\n"); 1704 if (src == input.length) 1705 goto Lnomatch; 1706 if (!(attributes & REA.dotmatchlf) && input[src] == cast(rchar)'\n') 1707 goto Lnomatch; 1708 src += std.utf.stride(input, src); 1709 //src++; 1710 pc++; 1711 break; 1712 1713 case REstring: 1714 len = *cast(size_t *)&program[pc + 1]; 1715 debug(regexp) 1716 { 1717 auto sss2 = (&program[pc + 1 + size_t.sizeof])[0 .. len]; 1718 printf("\tREstring x%x, '%.*s'\n", len, sss2.length, sss2.ptr); 1719 } 1720 if (src + len > input.length) 1721 goto Lnomatch; 1722 if (memcmp(&program[pc + 1 + size_t.sizeof], &input[src], len * rchar.sizeof)) 1723 goto Lnomatch; 1724 src += len; 1725 pc += 1 + size_t.sizeof + len * rchar.sizeof; 1726 break; 1727 1728 case REistring: 1729 len = *cast(size_t *)&program[pc + 1]; 1730 debug(regexp) 1731 { 1732 auto sss2 = (&program[pc + 1 + size_t.sizeof])[0 .. len]; 1733 printf("\tREistring x%x, '%.*s'\n", len, sss2.length, sss2.ptr); 1734 } 1735 if (src + len > input.length) 1736 goto Lnomatch; 1737 if (icmp((cast(char*)&program[pc + 1 + size_t.sizeof])[0..len], 1738 input[src .. src + len])) 1739 goto Lnomatch; 1740 src += len; 1741 pc += 1 + size_t.sizeof + len * rchar.sizeof; 1742 break; 1743 1744 case REtestbit: 1745 pu = (cast(ushort *)&program[pc + 1]); 1746 if (src == input.length) 1747 goto Lnomatch; 1748 debug(regexp) printf("\tREtestbit %d, %d, '%c', x%02x\n", 1749 pu[0], pu[1], input[src], input[src]); 1750 len = pu[1]; 1751 c1 = input[src]; 1752 //printf("[x%02x]=x%02x, x%02x\n", c1 >> 3, ((&program[pc + 1 + 4])[c1 >> 3] ), (1 << (c1 & 7))); 1753 if (c1 <= pu[0] && 1754 !((&(program[pc + 1 + 4]))[c1 >> 3] & (1 << (c1 & 7)))) 1755 goto Lnomatch; 1756 pc += 1 + 2 * ushort.sizeof + len; 1757 break; 1758 1759 case REbit: 1760 pu = (cast(ushort *)&program[pc + 1]); 1761 if (src == input.length) 1762 goto Lnomatch; 1763 debug(regexp) printf("\tREbit %d, %d, '%c'\n", 1764 pu[0], pu[1], input[src]); 1765 len = pu[1]; 1766 c1 = input[src]; 1767 if (c1 > pu[0]) 1768 goto Lnomatch; 1769 if (!((&program[pc + 1 + 4])[c1 >> 3] & (1 << (c1 & 7)))) 1770 goto Lnomatch; 1771 src++; 1772 pc += 1 + 2 * ushort.sizeof + len; 1773 break; 1774 1775 case REnotbit: 1776 pu = (cast(ushort *)&program[pc + 1]); 1777 if (src == input.length) 1778 goto Lnomatch; 1779 debug(regexp) printf("\tREnotbit %d, %d, '%c'\n", 1780 pu[0], pu[1], input[src]); 1781 len = pu[1]; 1782 c1 = input[src]; 1783 if (c1 <= pu[0] && 1784 ((&program[pc + 1 + 4])[c1 >> 3] & (1 << (c1 & 7)))) 1785 goto Lnomatch; 1786 src++; 1787 pc += 1 + 2 * ushort.sizeof + len; 1788 break; 1789 1790 case RErange: 1791 len = *cast(uint *)&program[pc + 1]; 1792 debug(regexp) printf("\tRErange %d\n", len); 1793 if (src == input.length) 1794 goto Lnomatch; 1795 // BUG: REA.ignoreCase? 1796 if (memchr(cast(char*)&program[pc + 1 + uint.sizeof], input[src], len) == null) 1797 goto Lnomatch; 1798 src++; 1799 pc += 1 + uint.sizeof + len; 1800 break; 1801 1802 case REnotrange: 1803 len = *cast(uint *)&program[pc + 1]; 1804 debug(regexp) printf("\tREnotrange %d\n", len); 1805 if (src == input.length) 1806 goto Lnomatch; 1807 // BUG: REA.ignoreCase? 1808 if (memchr(cast(char*)&program[pc + 1 + uint.sizeof], input[src], len) != null) 1809 goto Lnomatch; 1810 src++; 1811 pc += 1 + uint.sizeof + len; 1812 break; 1813 1814 case REbol: 1815 debug(regexp) printf("\tREbol\n"); 1816 if (src == 0) 1817 { 1818 } 1819 else if (attributes & REA.multiline) 1820 { 1821 if (input[src - 1] != '\n') 1822 goto Lnomatch; 1823 } 1824 else 1825 goto Lnomatch; 1826 pc++; 1827 break; 1828 1829 case REeol: 1830 debug(regexp) printf("\tREeol\n"); 1831 if (src == input.length) 1832 { 1833 } 1834 else if (attributes & REA.multiline && input[src] == '\n') 1835 src++; 1836 else 1837 goto Lnomatch; 1838 pc++; 1839 break; 1840 1841 case REor: 1842 len = (cast(uint *)&program[pc + 1])[0]; 1843 debug(regexp) printf("\tREor %d\n", len); 1844 pop = pc + 1 + uint.sizeof; 1845 ss = src; 1846 if (trymatch(pop, pcend)) 1847 { 1848 if (pcend != program.length) 1849 { 1850 auto s = src; 1851 if (trymatch(pcend, program.length)) 1852 { debug(regexp) printf("\tfirst operand matched\n"); 1853 src = s; 1854 return 1; 1855 } 1856 else 1857 { 1858 // If second branch doesn't match to end, take first anyway 1859 src = ss; 1860 if (!trymatch(pop + len, program.length)) 1861 { 1862 debug(regexp) printf("\tfirst operand matched\n"); 1863 src = s; 1864 return 1; 1865 } 1866 } 1867 src = ss; 1868 } 1869 else 1870 { debug(regexp) printf("\tfirst operand matched\n"); 1871 return 1; 1872 } 1873 } 1874 pc = pop + len; // proceed with 2nd branch 1875 break; 1876 1877 case REgoto: 1878 debug(regexp) printf("\tREgoto\n"); 1879 len = (cast(uint *)&program[pc + 1])[0]; 1880 pc += 1 + uint.sizeof + len; 1881 break; 1882 1883 case REanystar: 1884 debug(regexp) printf("\tREanystar\n"); 1885 pc++; 1886 for (;;) 1887 { 1888 auto s1 = src; 1889 if (src == input.length) 1890 break; 1891 if (!(attributes & REA.dotmatchlf) && input[src] == '\n') 1892 break; 1893 src++; 1894 auto s2 = src; 1895 1896 // If no match after consumption, but it 1897 // did match before, then no match 1898 if (!trymatch(pc, program.length)) 1899 { 1900 src = s1; 1901 // BUG: should we save/restore pmatch[]? 1902 if (trymatch(pc, program.length)) 1903 { 1904 src = s1; // no match 1905 break; 1906 } 1907 } 1908 src = s2; 1909 } 1910 break; 1911 1912 case REnm: 1913 case REnmq: 1914 // len, n, m, () 1915 puint = cast(uint *)&program[pc + 1]; 1916 len = puint[0]; 1917 n = puint[1]; 1918 m = puint[2]; 1919 debug(regexp) printf("\tREnm%s len=%d, n=%u, m=%u\n", 1920 (program[pc] == REnmq) ? "q".ptr : "".ptr, len, n, m); 1921 pop = pc + 1 + uint.sizeof * 3; 1922 for (count = 0; count < n; count++) 1923 { 1924 if (!trymatch(pop, pop + len)) 1925 goto Lnomatch; 1926 } 1927 if (!psave && count < m) 1928 { 1929 //version (Win32) 1930 psave = cast(regmatch_t *)alloca((re_nsub + 1) * regmatch_t.sizeof); 1931 //else 1932 //psave = new regmatch_t[re_nsub + 1]; 1933 } 1934 if (program[pc] == REnmq) // if minimal munch 1935 { 1936 for (; count < m; count++) 1937 { 1938 memcpy(psave, pmatch.ptr, (re_nsub + 1) * regmatch_t.sizeof); 1939 auto s1 = src; 1940 1941 if (trymatch(pop + len, program.length)) 1942 { 1943 src = s1; 1944 memcpy(pmatch.ptr, psave, (re_nsub + 1) * regmatch_t.sizeof); 1945 break; 1946 } 1947 1948 if (!trymatch(pop, pop + len)) 1949 { debug(regexp) printf("\tdoesn't match subexpression\n"); 1950 break; 1951 } 1952 1953 // If source is not consumed, don't 1954 // infinite loop on the match 1955 if (s1 == src) 1956 { debug(regexp) printf("\tsource is not consumed\n"); 1957 break; 1958 } 1959 } 1960 } 1961 else // maximal munch 1962 { 1963 for (; count < m; count++) 1964 { 1965 memcpy(psave, pmatch.ptr, (re_nsub + 1) * regmatch_t.sizeof); 1966 auto s1 = src; 1967 if (!trymatch(pop, pop + len)) 1968 { debug(regexp) printf("\tdoesn't match subexpression\n"); 1969 break; 1970 } 1971 auto s2 = src; 1972 1973 // If source is not consumed, don't 1974 // infinite loop on the match 1975 if (s1 == s2) 1976 { debug(regexp) printf("\tsource is not consumed\n"); 1977 break; 1978 } 1979 1980 // If no match after consumption, but it 1981 // did match before, then no match 1982 if (!trymatch(pop + len, program.length)) 1983 { 1984 src = s1; 1985 if (trymatch(pop + len, program.length)) 1986 { 1987 src = s1; // no match 1988 memcpy(pmatch.ptr, psave, (re_nsub + 1) * regmatch_t.sizeof); 1989 break; 1990 } 1991 } 1992 src = s2; 1993 } 1994 } 1995 debug(regexp) printf("\tREnm len=%d, n=%u, m=%u, DONE count=%d\n", len, n, m, count); 1996 pc = pop + len; 1997 break; 1998 1999 case REparen: 2000 // len, () 2001 debug(regexp) printf("\tREparen\n"); 2002 puint = cast(uint *)&program[pc + 1]; 2003 len = puint[0]; 2004 n = puint[1]; 2005 pop = pc + 1 + uint.sizeof * 2; 2006 ss = src; 2007 if (!trymatch(pop, pop + len)) 2008 goto Lnomatch; 2009 pmatch[n + 1].rm_so = ss; 2010 pmatch[n + 1].rm_eo = src; 2011 pc = pop + len; 2012 break; 2013 2014 case REend: 2015 debug(regexp) printf("\tREend\n"); 2016 return 1; // successful match 2017 2018 case REwordboundary: 2019 debug(regexp) printf("\tREwordboundary\n"); 2020 if (src > 0 && src < input.length) 2021 { 2022 c1 = input[src - 1]; 2023 c2 = input[src]; 2024 if (!( 2025 (isword(cast(rchar)c1) && !isword(cast(rchar)c2)) || 2026 (!isword(cast(rchar)c1) && isword(cast(rchar)c2)) 2027 ) 2028 ) 2029 goto Lnomatch; 2030 } 2031 pc++; 2032 break; 2033 2034 case REnotwordboundary: 2035 debug(regexp) printf("\tREnotwordboundary\n"); 2036 if (src == 0 || src == input.length) 2037 goto Lnomatch; 2038 c1 = input[src - 1]; 2039 c2 = input[src]; 2040 if ( 2041 (isword(cast(rchar)c1) && !isword(cast(rchar)c2)) || 2042 (!isword(cast(rchar)c1) && isword(cast(rchar)c2)) 2043 ) 2044 goto Lnomatch; 2045 pc++; 2046 break; 2047 2048 case REdigit: 2049 debug(regexp) printf("\tREdigit\n"); 2050 if (src == input.length) 2051 goto Lnomatch; 2052 if (!isDigit(input[src])) 2053 goto Lnomatch; 2054 src++; 2055 pc++; 2056 break; 2057 2058 case REnotdigit: 2059 debug(regexp) printf("\tREnotdigit\n"); 2060 if (src == input.length) 2061 goto Lnomatch; 2062 if (isDigit(input[src])) 2063 goto Lnomatch; 2064 src++; 2065 pc++; 2066 break; 2067 2068 case REspace: 2069 debug(regexp) printf("\tREspace\n"); 2070 if (src == input.length) 2071 goto Lnomatch; 2072 if (!isWhite(input[src])) 2073 goto Lnomatch; 2074 src++; 2075 pc++; 2076 break; 2077 2078 case REnotspace: 2079 debug(regexp) printf("\tREnotspace\n"); 2080 if (src == input.length) 2081 goto Lnomatch; 2082 if (isWhite(input[src])) 2083 goto Lnomatch; 2084 src++; 2085 pc++; 2086 break; 2087 2088 case REword: 2089 debug(regexp) printf("\tREword\n"); 2090 if (src == input.length) 2091 goto Lnomatch; 2092 if (!isword(input[src])) 2093 goto Lnomatch; 2094 src++; 2095 pc++; 2096 break; 2097 2098 case REnotword: 2099 debug(regexp) printf("\tREnotword\n"); 2100 if (src == input.length) 2101 goto Lnomatch; 2102 if (isword(input[src])) 2103 goto Lnomatch; 2104 src++; 2105 pc++; 2106 break; 2107 2108 case REbackref: 2109 { 2110 n = program[pc + 1]; 2111 debug(regexp) printf("\tREbackref %d\n", n); 2112 2113 auto so = pmatch[n + 1].rm_so; 2114 auto eo = pmatch[n + 1].rm_eo; 2115 len = eo - so; 2116 if (src + len > input.length) 2117 goto Lnomatch; 2118 else if (attributes & REA.ignoreCase) 2119 { 2120 if (icmp(input[src .. src + len], input[so .. eo])) 2121 goto Lnomatch; 2122 } 2123 else if (memcmp(&input[src], &input[so], len * rchar.sizeof)) 2124 goto Lnomatch; 2125 src += len; 2126 pc += 2; 2127 break; 2128 } 2129 2130 default: 2131 assert(0); 2132 } 2133 } 2134 2135 Lnomatch: 2136 debug(regexp) printf("\tnomatch pc=%d\n", pc); 2137 src = srcsave; 2138 return 0; 2139 } 2140 2141 /* =================== Compiler ================== */ 2142 2143 int parseRegexp() 2144 { 2145 size_t gotooffset; 2146 uint len1; 2147 uint len2; 2148 2149 debug(regexp) 2150 { 2151 auto sss = pattern[p .. pattern.length]; 2152 printf("parseRegexp() '%.*s'\n", sss.length, sss.ptr); 2153 } 2154 auto offset = buf.offset; 2155 for (;;) 2156 { 2157 assert(p <= pattern.length); 2158 if (p == pattern.length) 2159 { buf.write(REend); 2160 return 1; 2161 } 2162 switch (pattern[p]) 2163 { 2164 case ')': 2165 return 1; 2166 2167 case '|': 2168 p++; 2169 gotooffset = buf.offset; 2170 buf.write(REgoto); 2171 buf.write(cast(uint)0); 2172 len1 = cast(uint)(buf.offset - offset); 2173 buf.spread(offset, 1 + uint.sizeof); 2174 gotooffset += 1 + uint.sizeof; 2175 parseRegexp(); 2176 len2 = cast(uint)(buf.offset - (gotooffset + 1 + uint.sizeof)); 2177 buf.data[offset] = REor; 2178 (cast(uint *)&buf.data[offset + 1])[0] = len1; 2179 (cast(uint *)&buf.data[gotooffset + 1])[0] = len2; 2180 break; 2181 2182 default: 2183 parsePiece(); 2184 break; 2185 } 2186 } 2187 } 2188 2189 int parsePiece() 2190 { 2191 uint len; 2192 uint n; 2193 uint m; 2194 ubyte op; 2195 auto plength = pattern.length; 2196 2197 debug(regexp) 2198 { 2199 auto sss = pattern[p .. pattern.length]; 2200 printf("parsePiece() '%.*s'\n", sss.length, sss.ptr); 2201 } 2202 auto offset = buf.offset; 2203 parseAtom(); 2204 if (p == plength) 2205 return 1; 2206 switch (pattern[p]) 2207 { 2208 case '*': 2209 // Special optimization: replace .* with REanystar 2210 if (buf.offset - offset == 1 && 2211 buf.data[offset] == REanychar && 2212 p + 1 < plength && 2213 pattern[p + 1] != '?') 2214 { 2215 buf.data[offset] = REanystar; 2216 p++; 2217 break; 2218 } 2219 2220 n = 0; 2221 m = inf; 2222 goto Lnm; 2223 2224 case '+': 2225 n = 1; 2226 m = inf; 2227 goto Lnm; 2228 2229 case '?': 2230 n = 0; 2231 m = 1; 2232 goto Lnm; 2233 2234 case '{': // {n} {n,} {n,m} 2235 p++; 2236 if (p == plength || !isDigit(pattern[p])) 2237 goto Lerr; 2238 n = 0; 2239 do 2240 { 2241 // BUG: handle overflow 2242 n = n * 10 + pattern[p] - '0'; 2243 p++; 2244 if (p == plength) 2245 goto Lerr; 2246 } while (isDigit(pattern[p])); 2247 if (pattern[p] == '}') // {n} 2248 { m = n; 2249 goto Lnm; 2250 } 2251 if (pattern[p] != ',') 2252 goto Lerr; 2253 p++; 2254 if (p == plength) 2255 goto Lerr; 2256 if (pattern[p] == /*{*/ '}') // {n,} 2257 { m = inf; 2258 goto Lnm; 2259 } 2260 if (!isDigit(pattern[p])) 2261 goto Lerr; 2262 m = 0; // {n,m} 2263 do 2264 { 2265 // BUG: handle overflow 2266 m = m * 10 + pattern[p] - '0'; 2267 p++; 2268 if (p == plength) 2269 goto Lerr; 2270 } while (isDigit(pattern[p])); 2271 if (pattern[p] != /*{*/ '}') 2272 goto Lerr; 2273 goto Lnm; 2274 2275 Lnm: 2276 p++; 2277 op = REnm; 2278 if (p < plength && pattern[p] == '?') 2279 { op = REnmq; // minimal munch version 2280 p++; 2281 } 2282 len = cast(uint)(buf.offset - offset); 2283 buf.spread(offset, 1 + uint.sizeof * 3); 2284 buf.data[offset] = op; 2285 uint* puint = cast(uint *)&buf.data[offset + 1]; 2286 puint[0] = len; 2287 puint[1] = n; 2288 puint[2] = m; 2289 break; 2290 2291 default: 2292 break; 2293 } 2294 return 1; 2295 2296 Lerr: 2297 error("badly formed {n,m}"); 2298 assert(0); 2299 } 2300 2301 int parseAtom() 2302 { ubyte op; 2303 size_t offset; 2304 rchar c; 2305 2306 debug(regexp) 2307 { 2308 auto sss = pattern[p .. pattern.length]; 2309 printf("parseAtom() '%.*s'\n", sss.length, sss.ptr); 2310 } 2311 if (p < pattern.length) 2312 { 2313 c = pattern[p]; 2314 switch (c) 2315 { 2316 case '*': 2317 case '+': 2318 case '?': 2319 error("*+? not allowed in atom"); 2320 p++; 2321 return 0; 2322 2323 case '(': 2324 p++; 2325 buf.write(REparen); 2326 offset = buf.offset; 2327 buf.write(cast(uint)0); // reserve space for length 2328 buf.write(re_nsub); 2329 re_nsub++; 2330 parseRegexp(); 2331 *cast(uint *)&buf.data[offset] = 2332 cast(uint)(buf.offset - (offset + uint.sizeof * 2)); 2333 if (p == pattern.length || pattern[p] != ')') 2334 { 2335 error("')' expected"); 2336 return 0; 2337 } 2338 p++; 2339 break; 2340 2341 case '[': 2342 if (!parseRange()) 2343 return 0; 2344 break; 2345 2346 case '.': 2347 p++; 2348 buf.write(REanychar); 2349 break; 2350 2351 case '^': 2352 p++; 2353 buf.write(REbol); 2354 break; 2355 2356 case '$': 2357 p++; 2358 buf.write(REeol); 2359 break; 2360 2361 case '\\': 2362 p++; 2363 if (p == pattern.length) 2364 { error("no character past '\\'"); 2365 return 0; 2366 } 2367 c = pattern[p]; 2368 switch (c) 2369 { 2370 case 'b': op = REwordboundary; goto Lop; 2371 case 'B': op = REnotwordboundary; goto Lop; 2372 case 'd': op = REdigit; goto Lop; 2373 case 'D': op = REnotdigit; goto Lop; 2374 case 's': op = REspace; goto Lop; 2375 case 'S': op = REnotspace; goto Lop; 2376 case 'w': op = REword; goto Lop; 2377 case 'W': op = REnotword; goto Lop; 2378 2379 Lop: 2380 buf.write(op); 2381 p++; 2382 break; 2383 2384 case 'f': 2385 case 'n': 2386 case 'r': 2387 case 't': 2388 case 'v': 2389 case 'c': 2390 case 'x': 2391 case 'u': 2392 case '0': 2393 c = cast(char)escape(); 2394 goto Lbyte; 2395 2396 case '1': case '2': case '3': 2397 case '4': case '5': case '6': 2398 case '7': case '8': case '9': 2399 c -= '1'; 2400 if (c < re_nsub) 2401 { buf.write(REbackref); 2402 buf.write(cast(ubyte)c); 2403 } 2404 else 2405 { error("no matching back reference"); 2406 return 0; 2407 } 2408 p++; 2409 break; 2410 2411 default: 2412 p++; 2413 goto Lbyte; 2414 } 2415 break; 2416 2417 default: 2418 p++; 2419 Lbyte: 2420 op = REchar; 2421 if (attributes & REA.ignoreCase) 2422 { 2423 if (isAlpha(c)) 2424 { 2425 op = REichar; 2426 c = cast(char)std.ascii.toUpper(c); 2427 } 2428 } 2429 if (op == REchar && c <= 0xFF) 2430 { 2431 // Look ahead and see if we can make this into 2432 // an REstring 2433 auto q = p; 2434 for (; q < pattern.length; ++q) 2435 { rchar qc = pattern[q]; 2436 2437 switch (qc) 2438 { 2439 case '{': 2440 case '*': 2441 case '+': 2442 case '?': 2443 if (q == p) 2444 goto Lchar; 2445 q--; 2446 break; 2447 2448 case '(': case ')': 2449 case '|': 2450 case '[': case ']': 2451 case '.': case '^': 2452 case '$': case '\\': 2453 case '}': 2454 break; 2455 2456 default: 2457 continue; 2458 } 2459 break; 2460 } 2461 auto len = q - p; 2462 if (len > 0) 2463 { 2464 debug(regexp) printf("writing string len %d, c = '%c', pattern[p] = '%c'\n", len+1, c, pattern[p]); 2465 buf.reserve(5 + (1 + len) * rchar.sizeof); 2466 buf.write((attributes & REA.ignoreCase) ? REistring : REstring); 2467 buf.write(len + 1); 2468 buf.write(c); 2469 buf.write(pattern[p .. p + len]); 2470 p = q; 2471 break; 2472 } 2473 } 2474 if (c >= 0x80) 2475 { 2476 // Convert to dchar opcode 2477 op = (op == REchar) ? REdchar : REidchar; 2478 buf.write(op); 2479 buf.write(c); 2480 } 2481 else 2482 { 2483 Lchar: 2484 debug(regexp) printf("It's an REchar '%c'\n", c); 2485 buf.write(op); 2486 buf.write(cast(char)c); 2487 } 2488 break; 2489 } 2490 } 2491 return 1; 2492 } 2493 2494 private: 2495 class Range 2496 { 2497 size_t maxc; 2498 size_t maxb; 2499 OutBuffer buf; 2500 ubyte* base; 2501 BitArray bits; 2502 2503 this(OutBuffer buf) 2504 { 2505 this.buf = buf; 2506 if (buf.data.length) 2507 this.base = &buf.data[buf.offset]; 2508 } 2509 2510 void setbitmax(size_t u) 2511 { 2512 //printf("setbitmax(x%x), maxc = x%x\n", u, maxc); 2513 if (u > maxc) 2514 { 2515 maxc = u; 2516 auto b = u / 8; 2517 if (b >= maxb) 2518 { 2519 auto u2 = base ? base - &buf.data[0] : 0; 2520 buf.fill0(b - maxb + 1); 2521 base = &buf.data[u2]; 2522 maxb = b + 1; 2523 //bits = (cast(bit*)this.base)[0 .. maxc + 1]; 2524 bits = BitArray(maxc + 1, cast(size_t*)this.base); 2525 } 2526 bits.length = maxc + 1; 2527 } 2528 } 2529 2530 void setbit2(size_t u) 2531 { 2532 setbitmax(u + 1); 2533 //printf("setbit2 [x%02x] |= x%02x\n", u >> 3, 1 << (u & 7)); 2534 bits[u] = 1; 2535 } 2536 2537 }; 2538 2539 int parseRange() 2540 { 2541 int c; 2542 int c2; 2543 uint i; 2544 uint cmax; 2545 2546 cmax = 0x7F; 2547 p++; 2548 ubyte op = REbit; 2549 if (p == pattern.length) 2550 { 2551 error("invalid range"); 2552 return 0; 2553 } 2554 if (pattern[p] == '^') 2555 { p++; 2556 op = REnotbit; 2557 if (p == pattern.length) 2558 { 2559 error("invalid range"); 2560 return 0; 2561 } 2562 } 2563 buf.write(op); 2564 auto offset = buf.offset; 2565 buf.write(cast(uint)0); // reserve space for length 2566 buf.reserve(128 / 8); 2567 auto r = new Range(buf); 2568 if (op == REnotbit) 2569 r.setbit2(0); 2570 switch (pattern[p]) 2571 { 2572 case ']': 2573 case '-': 2574 c = pattern[p]; 2575 p++; 2576 r.setbit2(c); 2577 break; 2578 2579 default: 2580 break; 2581 } 2582 2583 enum RS { start, rliteral, dash } 2584 RS rs; 2585 2586 rs = RS.start; 2587 for (;;) 2588 { 2589 if (p == pattern.length) 2590 goto Lerr; 2591 switch (pattern[p]) 2592 { 2593 case ']': 2594 switch (rs) 2595 { case RS.dash: 2596 r.setbit2('-'); 2597 goto case; 2598 case RS.rliteral: 2599 r.setbit2(c); 2600 break; 2601 case RS.start: 2602 break; 2603 default: 2604 assert(0); 2605 } 2606 p++; 2607 break; 2608 2609 case '\\': 2610 p++; 2611 r.setbitmax(cmax); 2612 if (p == pattern.length) 2613 goto Lerr; 2614 switch (pattern[p]) 2615 { 2616 case 'd': 2617 for (i = '0'; i <= '9'; i++) 2618 r.bits[i] = 1; 2619 goto Lrs; 2620 2621 case 'D': 2622 for (i = 1; i < '0'; i++) 2623 r.bits[i] = 1; 2624 for (i = '9' + 1; i <= cmax; i++) 2625 r.bits[i] = 1; 2626 goto Lrs; 2627 2628 case 's': 2629 for (i = 0; i <= cmax; i++) 2630 if (isWhite(i)) 2631 r.bits[i] = 1; 2632 goto Lrs; 2633 2634 case 'S': 2635 for (i = 1; i <= cmax; i++) 2636 if (!isWhite(i)) 2637 r.bits[i] = 1; 2638 goto Lrs; 2639 2640 case 'w': 2641 for (i = 0; i <= cmax; i++) 2642 if (isword(cast(rchar)i)) 2643 r.bits[i] = 1; 2644 goto Lrs; 2645 2646 case 'W': 2647 for (i = 1; i <= cmax; i++) 2648 if (!isword(cast(rchar)i)) 2649 r.bits[i] = 1; 2650 goto Lrs; 2651 2652 Lrs: 2653 switch (rs) 2654 { case RS.dash: 2655 r.setbit2('-'); 2656 goto case; 2657 case RS.rliteral: 2658 r.setbit2(c); 2659 break; 2660 default: 2661 break; 2662 } 2663 rs = RS.start; 2664 continue; 2665 2666 default: 2667 break; 2668 } 2669 c2 = escape(); 2670 goto Lrange; 2671 2672 case '-': 2673 p++; 2674 if (rs == RS.start) 2675 goto Lrange; 2676 else if (rs == RS.rliteral) 2677 rs = RS.dash; 2678 else if (rs == RS.dash) 2679 { 2680 r.setbit2(c); 2681 r.setbit2('-'); 2682 rs = RS.start; 2683 } 2684 continue; 2685 2686 default: 2687 c2 = pattern[p]; 2688 p++; 2689 Lrange: 2690 switch (rs) 2691 { case RS.rliteral: 2692 r.setbit2(c); 2693 goto case; 2694 case RS.start: 2695 c = c2; 2696 rs = RS.rliteral; 2697 break; 2698 2699 case RS.dash: 2700 if (c > c2) 2701 { error("inverted range in character class"); 2702 return 0; 2703 } 2704 r.setbitmax(c2); 2705 //printf("c = %x, c2 = %x\n",c,c2); 2706 for (; c <= c2; c++) 2707 r.bits[c] = 1; 2708 rs = RS.start; 2709 break; 2710 2711 default: 2712 assert(0); 2713 } 2714 continue; 2715 } 2716 break; 2717 } 2718 if (attributes & REA.ignoreCase) 2719 { 2720 // BUG: what about dchar? 2721 r.setbitmax(0x7F); 2722 for (c = 'a'; c <= 'z'; c++) 2723 { 2724 if (r.bits[c]) 2725 r.bits[c + 'A' - 'a'] = 1; 2726 else if (r.bits[c + 'A' - 'a']) 2727 r.bits[c] = 1; 2728 } 2729 } 2730 //printf("maxc = %d, maxb = %d\n",r.maxc,r.maxb); 2731 (cast(ushort *)&buf.data[offset])[0] = cast(ushort)r.maxc; 2732 (cast(ushort *)&buf.data[offset])[1] = cast(ushort)r.maxb; 2733 return 1; 2734 2735 Lerr: 2736 error("invalid range"); 2737 return 0; 2738 } 2739 2740 void error(string msg) 2741 { 2742 errors++; 2743 debug(regexp) printf("error: %.*s\n", msg.length, msg.ptr); 2744 //assert(0); 2745 //*(char*)0=0; 2746 throw new RegExpException(msg); 2747 } 2748 2749 // p is following the \ char 2750 int escape() 2751 in 2752 { 2753 assert(p < pattern.length); 2754 } 2755 do 2756 { int c; 2757 int i; 2758 rchar tc; 2759 2760 c = pattern[p]; // none of the cases are multibyte 2761 switch (c) 2762 { 2763 case 'b': c = '\b'; break; 2764 case 'f': c = '\f'; break; 2765 case 'n': c = '\n'; break; 2766 case 'r': c = '\r'; break; 2767 case 't': c = '\t'; break; 2768 case 'v': c = '\v'; break; 2769 2770 // BUG: Perl does \a and \e too, should we? 2771 2772 case 'c': 2773 ++p; 2774 if (p == pattern.length) 2775 goto Lretc; 2776 c = pattern[p]; 2777 // Note: we are deliberately not allowing dchar letters 2778 if (!(('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))) 2779 { 2780 Lcerr: 2781 error("letter expected following \\c"); 2782 return 0; 2783 } 2784 c &= 0x1F; 2785 break; 2786 2787 case '0': 2788 case '1': 2789 case '2': 2790 case '3': 2791 case '4': 2792 case '5': 2793 case '6': 2794 case '7': 2795 c -= '0'; 2796 for (i = 0; i < 2; i++) 2797 { 2798 p++; 2799 if (p == pattern.length) 2800 goto Lretc; 2801 tc = pattern[p]; 2802 if ('0' <= tc && tc <= '7') 2803 { c = c * 8 + (tc - '0'); 2804 // Treat overflow as if last 2805 // digit was not an octal digit 2806 if (c >= 0xFF) 2807 { c >>= 3; 2808 return c; 2809 } 2810 } 2811 else 2812 return c; 2813 } 2814 break; 2815 2816 case 'x': 2817 c = 0; 2818 for (i = 0; i < 2; i++) 2819 { 2820 p++; 2821 if (p == pattern.length) 2822 goto Lretc; 2823 tc = pattern[p]; 2824 if ('0' <= tc && tc <= '9') 2825 c = c * 16 + (tc - '0'); 2826 else if ('a' <= tc && tc <= 'f') 2827 c = c * 16 + (tc - 'a' + 10); 2828 else if ('A' <= tc && tc <= 'F') 2829 c = c * 16 + (tc - 'A' + 10); 2830 else if (i == 0) // if no hex digits after \x 2831 { 2832 // Not a valid \xXX sequence 2833 return 'x'; 2834 } 2835 else 2836 return c; 2837 } 2838 break; 2839 2840 case 'u': 2841 c = 0; 2842 for (i = 0; i < 4; i++) 2843 { 2844 p++; 2845 if (p == pattern.length) 2846 goto Lretc; 2847 tc = pattern[p]; 2848 if ('0' <= tc && tc <= '9') 2849 c = c * 16 + (tc - '0'); 2850 else if ('a' <= tc && tc <= 'f') 2851 c = c * 16 + (tc - 'a' + 10); 2852 else if ('A' <= tc && tc <= 'F') 2853 c = c * 16 + (tc - 'A' + 10); 2854 else 2855 { 2856 // Not a valid \uXXXX sequence 2857 p -= i; 2858 return 'u'; 2859 } 2860 } 2861 break; 2862 2863 default: 2864 break; 2865 } 2866 p++; 2867 Lretc: 2868 return c; 2869 } 2870 2871 /* ==================== optimizer ======================= */ 2872 2873 void optimize() 2874 { ubyte[] prog; 2875 2876 debug(regexp) printf("RegExp.optimize()\n"); 2877 prog = buf.toBytes(); 2878 for (size_t i = 0; 1;) 2879 { 2880 //printf("\tprog[%d] = %d, %d\n", i, prog[i], REstring); 2881 switch (prog[i]) 2882 { 2883 case REend: 2884 case REanychar: 2885 case REanystar: 2886 case REbackref: 2887 case REeol: 2888 case REchar: 2889 case REichar: 2890 case REdchar: 2891 case REidchar: 2892 case REstring: 2893 case REistring: 2894 case REtestbit: 2895 case REbit: 2896 case REnotbit: 2897 case RErange: 2898 case REnotrange: 2899 case REwordboundary: 2900 case REnotwordboundary: 2901 case REdigit: 2902 case REnotdigit: 2903 case REspace: 2904 case REnotspace: 2905 case REword: 2906 case REnotword: 2907 return; 2908 2909 case REbol: 2910 i++; 2911 continue; 2912 2913 case REor: 2914 case REnm: 2915 case REnmq: 2916 case REparen: 2917 case REgoto: 2918 { 2919 auto bitbuf = new OutBuffer; 2920 auto r = new Range(bitbuf); 2921 auto offset = i; 2922 if (starrchars(r, prog[i .. prog.length])) 2923 { 2924 debug(regexp) printf("\tfilter built\n"); 2925 buf.spread(offset, 1 + 4 + r.maxb); 2926 buf.data[offset] = REtestbit; 2927 (cast(ushort *)&buf.data[offset + 1])[0] = cast(ushort)r.maxc; 2928 (cast(ushort *)&buf.data[offset + 1])[1] = cast(ushort)r.maxb; 2929 i = offset + 1 + 4; 2930 buf.data[i .. i + r.maxb] = r.base[0 .. r.maxb]; 2931 } 2932 return; 2933 } 2934 default: 2935 assert(0); 2936 } 2937 } 2938 } 2939 2940 ///////////////////////////////////////// 2941 // OR the leading character bits into r. 2942 // Limit the character range from 0..7F, 2943 // trymatch() will allow through anything over maxc. 2944 // Return 1 if success, 0 if we can't build a filter or 2945 // if there is no point to one. 2946 2947 int starrchars(Range r, const(ubyte)[] prog) 2948 { rchar c; 2949 uint maxc; 2950 size_t maxb; 2951 size_t len; 2952 uint b; 2953 uint n; 2954 uint m; 2955 const(ubyte)* pop; 2956 2957 //printf("RegExp.starrchars(prog = %p, progend = %p)\n", prog, progend); 2958 for (size_t i = 0; i < prog.length;) 2959 { 2960 switch (prog[i]) 2961 { 2962 case REchar: 2963 c = prog[i + 1]; 2964 if (c <= 0x7F) 2965 r.setbit2(c); 2966 return 1; 2967 2968 case REichar: 2969 c = prog[i + 1]; 2970 if (c <= 0x7F) 2971 { r.setbit2(c); 2972 r.setbit2(std.ascii.toLower(cast(rchar)c)); 2973 } 2974 return 1; 2975 2976 case REdchar: 2977 case REidchar: 2978 return 1; 2979 2980 case REanychar: 2981 return 0; // no point 2982 2983 case REstring: 2984 len = *cast(size_t *)&prog[i + 1]; 2985 assert(len); 2986 c = *cast(rchar *)&prog[i + 1 + size_t.sizeof]; 2987 debug(regexp) printf("\tREstring %d, '%c'\n", len, c); 2988 if (c <= 0x7F) 2989 r.setbit2(c); 2990 return 1; 2991 2992 case REistring: 2993 len = *cast(size_t *)&prog[i + 1]; 2994 assert(len); 2995 c = *cast(rchar *)&prog[i + 1 + size_t.sizeof]; 2996 debug(regexp) printf("\tREistring %d, '%c'\n", len, c); 2997 if (c <= 0x7F) 2998 { r.setbit2(std.ascii.toUpper(cast(rchar)c)); 2999 r.setbit2(std.ascii.toLower(cast(rchar)c)); 3000 } 3001 return 1; 3002 3003 case REtestbit: 3004 case REbit: 3005 maxc = (cast(ushort *)&prog[i + 1])[0]; 3006 maxb = (cast(ushort *)&prog[i + 1])[1]; 3007 if (maxc <= 0x7F) 3008 r.setbitmax(maxc); 3009 else 3010 maxb = r.maxb; 3011 for (b = 0; b < maxb; b++) 3012 r.base[b] |= prog[i + 1 + 4 + b]; 3013 return 1; 3014 3015 case REnotbit: 3016 maxc = (cast(ushort *)&prog[i + 1])[0]; 3017 maxb = (cast(ushort *)&prog[i + 1])[1]; 3018 if (maxc <= 0x7F) 3019 r.setbitmax(maxc); 3020 else 3021 maxb = r.maxb; 3022 for (b = 0; b < maxb; b++) 3023 r.base[b] |= ~cast(int)prog[i + 1 + 4 + b]; 3024 return 1; 3025 3026 case REbol: 3027 case REeol: 3028 return 0; 3029 3030 case REor: 3031 len = (cast(uint *)&prog[i + 1])[0]; 3032 return starrchars(r, prog[i + 1 + uint.sizeof .. prog.length]) && 3033 starrchars(r, prog[i + 1 + uint.sizeof + len .. prog.length]); 3034 3035 case REgoto: 3036 len = (cast(uint *)&prog[i + 1])[0]; 3037 i += 1 + uint.sizeof + len; 3038 break; 3039 3040 case REanystar: 3041 return 0; 3042 3043 case REnm: 3044 case REnmq: 3045 // len, n, m, () 3046 len = (cast(uint *)&prog[i + 1])[0]; 3047 n = (cast(uint *)&prog[i + 1])[1]; 3048 m = (cast(uint *)&prog[i + 1])[2]; 3049 pop = &prog[i + 1 + uint.sizeof * 3]; 3050 if (!starrchars(r, pop[0 .. len])) 3051 return 0; 3052 if (n) 3053 return 1; 3054 i += 1 + uint.sizeof * 3 + len; 3055 break; 3056 3057 case REparen: 3058 // len, () 3059 len = (cast(uint *)&prog[i + 1])[0]; 3060 n = (cast(uint *)&prog[i + 1])[1]; 3061 pop = &prog[0] + i + 1 + uint.sizeof * 2; 3062 return starrchars(r, pop[0 .. len]); 3063 3064 case REend: 3065 return 0; 3066 3067 case REwordboundary: 3068 case REnotwordboundary: 3069 return 0; 3070 3071 case REdigit: 3072 r.setbitmax('9'); 3073 for (c = '0'; c <= '9'; c++) 3074 r.bits[c] = 1; 3075 return 1; 3076 3077 case REnotdigit: 3078 r.setbitmax(0x7F); 3079 for (c = 0; c <= '0'; c++) 3080 r.bits[c] = 1; 3081 for (c = '9' + 1; c <= r.maxc; c++) 3082 r.bits[c] = 1; 3083 return 1; 3084 3085 case REspace: 3086 r.setbitmax(0x7F); 3087 for (c = 0; c <= r.maxc; c++) 3088 if (isWhite(c)) 3089 r.bits[c] = 1; 3090 return 1; 3091 3092 case REnotspace: 3093 r.setbitmax(0x7F); 3094 for (c = 0; c <= r.maxc; c++) 3095 if (!isWhite(c)) 3096 r.bits[c] = 1; 3097 return 1; 3098 3099 case REword: 3100 r.setbitmax(0x7F); 3101 for (c = 0; c <= r.maxc; c++) 3102 if (isword(cast(rchar)c)) 3103 r.bits[c] = 1; 3104 return 1; 3105 3106 case REnotword: 3107 r.setbitmax(0x7F); 3108 for (c = 0; c <= r.maxc; c++) 3109 if (!isword(cast(rchar)c)) 3110 r.bits[c] = 1; 3111 return 1; 3112 3113 case REbackref: 3114 return 0; 3115 3116 default: 3117 assert(0); 3118 } 3119 } 3120 return 1; 3121 } 3122 3123 /* ==================== replace ======================= */ 3124 3125 /*********************** 3126 * After a match is found with test(), this function 3127 * will take the match results and, using the format 3128 * string, generate and return a new string. 3129 */ 3130 3131 public string replace(string format) 3132 { 3133 return replace3(format, input, pmatch[0 .. re_nsub + 1]); 3134 } 3135 3136 // Static version that doesn't require a RegExp object to be created 3137 3138 public static string replace3(string format, string input, regmatch_t[] pmatch) 3139 { 3140 string result; 3141 size_t c2; 3142 sizediff_t rm_so, rm_eo, i; 3143 3144 // printf("replace3(format = '%.*s', input = '%.*s')\n", format.length, format.ptr, input.length, input.ptr); 3145 result.length = format.length; 3146 result.length = 0; 3147 for (size_t f = 0; f < format.length; f++) 3148 { 3149 char c = format[f]; 3150 L1: 3151 if (c != '$') 3152 { 3153 result ~= c; 3154 continue; 3155 } 3156 ++f; 3157 if (f == format.length) 3158 { 3159 result ~= '$'; 3160 break; 3161 } 3162 c = format[f]; 3163 switch (c) 3164 { 3165 case '&': 3166 rm_so = pmatch[0].rm_so; 3167 rm_eo = pmatch[0].rm_eo; 3168 goto Lstring; 3169 3170 case '`': 3171 rm_so = 0; 3172 rm_eo = pmatch[0].rm_so; 3173 goto Lstring; 3174 3175 case '\'': 3176 rm_so = pmatch[0].rm_eo; 3177 rm_eo = input.length; 3178 goto Lstring; 3179 3180 case '0': case '1': case '2': case '3': case '4': 3181 case '5': case '6': case '7': case '8': case '9': 3182 i = c - '0'; 3183 if (f + 1 == format.length) 3184 { 3185 if (i == 0) 3186 { 3187 result ~= '$'; 3188 result ~= c; 3189 continue; 3190 } 3191 } 3192 else 3193 { 3194 c2 = format[f + 1]; 3195 if (c2 >= '0' && c2 <= '9') 3196 { 3197 i = (c - '0') * 10 + (c2 - '0'); 3198 f++; 3199 } 3200 if (i == 0) 3201 { 3202 result ~= '$'; 3203 result ~= c; 3204 c = cast(char)c2; 3205 goto L1; 3206 } 3207 } 3208 3209 if (i < pmatch.length) 3210 { rm_so = pmatch[i].rm_so; 3211 rm_eo = pmatch[i].rm_eo; 3212 goto Lstring; 3213 } 3214 break; 3215 3216 Lstring: 3217 if (rm_so != rm_eo) 3218 result ~= input[rm_so .. rm_eo]; 3219 break; 3220 3221 default: 3222 result ~= '$'; 3223 result ~= c; 3224 break; 3225 } 3226 } 3227 return result; 3228 } 3229 3230 /************************************ 3231 * Like replace(char[] format), but uses old style formatting: 3232 <table border=1 cellspacing=0 cellpadding=5> 3233 <th>Format 3234 <th>Description 3235 <tr> 3236 <td><b>&</b> 3237 <td>replace with the match 3238 </tr> 3239 <tr> 3240 <td><b>\</b><i>n</i> 3241 <td>replace with the <i>n</i>th parenthesized match, <i>n</i> is 1..9 3242 </tr> 3243 <tr> 3244 <td><b>\</b><i>c</i> 3245 <td>replace with char <i>c</i>. 3246 </tr> 3247 </table> 3248 */ 3249 3250 public string replaceOld(string format) 3251 { 3252 string result; 3253 3254 //printf("replace: this = %p so = %d, eo = %d\n", this, pmatch[0].rm_so, pmatch[0].rm_eo); 3255 //printf("3input = '%.*s'\n", input.length, input.ptr); 3256 result.length = format.length; 3257 result.length = 0; 3258 for (size_t i; i < format.length; i++) 3259 { 3260 char c = format[i]; 3261 switch (c) 3262 { 3263 case '&': 3264 { 3265 auto sss = input[pmatch[0].rm_so .. pmatch[0].rm_eo]; 3266 //printf("match = '%.*s'\n", sss.length, sss.ptr); 3267 result ~= sss; 3268 } 3269 break; 3270 3271 case '\\': 3272 if (i + 1 < format.length) 3273 { 3274 c = format[++i]; 3275 if (c >= '1' && c <= '9') 3276 { uint j; 3277 3278 j = c - '0'; 3279 if (j <= re_nsub && pmatch[j].rm_so != pmatch[j].rm_eo) 3280 result ~= input[pmatch[j].rm_so .. pmatch[j].rm_eo]; 3281 break; 3282 } 3283 } 3284 result ~= c; 3285 break; 3286 3287 default: 3288 result ~= c; 3289 break; 3290 } 3291 } 3292 return result; 3293 } 3294 3295 } 3296 3297 unittest 3298 { // Created and placed in public domain by Don Clugston 3299 3300 auto m = search("aBC r s", `bc\x20r[\40]s`, "i"); 3301 assert(m.pre=="a"); 3302 assert(m[0]=="BC r s"); 3303 auto m2 = search("7xxyxxx", `^\d([a-z]{2})\D\1`); 3304 assert(m2[0]=="7xxyxx"); 3305 // Just check the parsing. 3306 auto m3 = search("dcbxx", `ca|b[\d\]\D\s\S\w-\W]`); 3307 auto m4 = search("xy", `[^\ca-\xFa\r\n\b\f\t\v\0123]{2,485}$`); 3308 auto m5 = search("xxx", `^^\r\n\b{13,}\f{4}\t\v\u02aF3a\w\W`); 3309 auto m6 = search("xxy", `.*y`); 3310 assert(m6[0]=="xxy"); 3311 auto m7 = search("QWDEfGH", "(ca|b|defg)+", "i"); 3312 assert(m7[0]=="DEfG"); 3313 auto m8 = search("dcbxx", `a?\B\s\S`); 3314 auto m9 = search("dcbxx", `[-w]`); 3315 auto m10 = search("dcbsfd", `aB[c-fW]dB|\d|\D|\u012356|\w|\W|\s|\S`, "i"); 3316 auto m11 = search("dcbsfd", `[]a-]`); 3317 m.replaceOld(`a&b\1c`); 3318 m.replace(`a$&b$'$1c`); 3319 } 3320 3321 // Andrei 3322 //------------------------------------------------------------------------------ 3323 3324 struct Pattern(Char) 3325 { 3326 immutable(Char)[] pattern; 3327 3328 this(immutable(Char)[] pattern) 3329 { 3330 this.pattern = pattern; 3331 } 3332 } 3333 3334 Pattern!(Char) pattern(Char)(immutable(Char)[] pat) 3335 { 3336 return typeof(return)(pat); 3337 } 3338 3339 struct Splitter(Range) 3340 { 3341 Range _input; 3342 size_t _chunkLength; 3343 RegExp _rx; 3344 3345 private Range search() 3346 { 3347 //rx = undead.regexp.search(_input, "(" ~ _separator.pattern ~ ")"); 3348 auto i = undead.regexp.find(cast(string) _input, _rx); 3349 return _input[i >= 0 ? i : _input.length .. _input.length]; 3350 } 3351 3352 private void advance() 3353 { 3354 //writeln("(" ~ _separator.pattern ~ ")"); 3355 //writeln(_input); 3356 //assert(_rx[0].length > 0); 3357 _chunkLength += _rx[0].length; 3358 } 3359 3360 this(Range input, Pattern!(char) separator) 3361 { 3362 _input = input; 3363 _rx = RegExp(separator.pattern); 3364 _chunkLength = _input.length - search().length; 3365 } 3366 3367 ref auto opSlice() 3368 { 3369 return this; 3370 } 3371 3372 @property Range front() 3373 { 3374 return _input[0 .. _chunkLength]; 3375 } 3376 3377 @property bool empty() 3378 { 3379 return _input.empty; 3380 } 3381 3382 void popFront() 3383 { 3384 if (_chunkLength == _input.length) 3385 { 3386 _input = _input[_chunkLength .. _input.length]; 3387 return; 3388 } 3389 advance(); 3390 _input = _input[_chunkLength .. _input.length]; 3391 _chunkLength = _input.length - search().length; 3392 } 3393 } 3394 3395 Splitter!(Range) splitter(Range)(Range r, Pattern!(char) pat) 3396 { 3397 static assert(is(Unqual!(typeof(Range.init[0])) == char), 3398 Unqual!(typeof(Range.init[0])).stringof); 3399 return typeof(return)(cast(string) r, pat); 3400 } 3401 3402 unittest 3403 { 3404 auto s1 = ", abc, de, fg, hi, "; 3405 auto sp2 = splitter(s1, pattern(", *")); 3406 //foreach (e; sp2) writeln("[", e, "]"); 3407 assert(equal(sp2, ["", "abc", "de", "fg", "hi"][])); 3408 } 3409 3410 unittest 3411 { 3412 auto str= "foo"; 3413 string[] re_strs= [ 3414 r"^(h|a|)fo[oas]$", 3415 r"^(a|b|)fo[oas]$", 3416 r"^(a|)foo$", 3417 r"(a|)foo", 3418 r"^(h|)foo$", 3419 r"(h|)foo", 3420 r"(h|a|)fo[oas]", 3421 r"^(a|b|)fo[o]$", 3422 r"[abf][ops](o|oo|)(h|a|)", 3423 r"(h|)[abf][ops](o|oo|)", 3424 r"(c|)[abf][ops](o|oo|)" 3425 ]; 3426 3427 foreach (re_str; re_strs) { 3428 auto re= new RegExp(re_str); 3429 auto matches= cast(bool)re.test(str); 3430 assert(matches); 3431 //writefln("'%s' matches '%s' ? %s", str, re_str, matches); 3432 } 3433 3434 for (char c='a'; c<='z'; ++c) { 3435 auto re_str= "("~c~"|)foo"; 3436 auto re= new RegExp(re_str); 3437 auto matches= cast(bool)re.test(str); 3438 assert(matches); 3439 //writefln("'%s' matches '%s' ? %s", str, re_str, matches); 3440 } 3441 }