[Gd-chatter] r11510 - in trunk/libraries/regular-expressions: . tests

cgay at gwydiondylan.org cgay at gwydiondylan.org
Sun Dec 2 15:16:49 CET 2007


Author: cgay
Date: Sun Dec  2 15:16:47 2007
New Revision: 11510

Added:
   trunk/libraries/regular-expressions/tests/pcre.dylan   (contents, props changed)
Removed:
   trunk/libraries/regular-expressions/tests/new-api-test-suite.dylan
   trunk/libraries/regular-expressions/tests/old-api-test-suite.dylan
Modified:
   trunk/libraries/regular-expressions/interface.dylan
   trunk/libraries/regular-expressions/match.dylan
   trunk/libraries/regular-expressions/od-library.dylan
   trunk/libraries/regular-expressions/parse.dylan
   trunk/libraries/regular-expressions/regex.dylan
   trunk/libraries/regular-expressions/tests/library.dylan
   trunk/libraries/regular-expressions/tests/regular-expressions-test-suite.dylan
   trunk/libraries/regular-expressions/tests/regular-expressions-test-suite.lid
Log:
Bug: 7357
Got rid of the whole new/old API mess, and some API tweaks.

Modified: trunk/libraries/regular-expressions/interface.dylan
==============================================================================
--- trunk/libraries/regular-expressions/interface.dylan	(original)
+++ trunk/libraries/regular-expressions/interface.dylan	Sun Dec  2 15:16:47 2007
@@ -1,4 +1,4 @@
-module:   regular-expressions-impl
+module:   regular-expressions
 author:   Nick Kramer (nkramer at cs.cmu.edu)
 synopsis: This provides a useable interface for users. Functions 
 	  defined outside this file are really too strange and quirky 
@@ -32,25 +32,22 @@
 //
 //======================================================================
 
-// Functions that aren't exported are marked as such.  Everything else
-// is exported.
-//
 // There are quite a few make-fooer functions hanging around.  Now
-// that regexp-position does caching, these are basically useless, but
+// that regex-position does caching, these are basically useless, but
 // we've kept them around for backwards compatibility.  Unfortunately,
 // internally most of the functions are implemented in terms of
-// make-regexp-positioner.  To minimize the amount of rewriting, I've
+// make-regex-positioner.  To minimize the amount of rewriting, I've
 // liberally applied seals and inline declarations so that
-// make-regexp-positioner won't clobber all type information.  The
+// make-regex-positioner won't clobber all type information.  The
 // downside, of course, is that everything's sealed, but hey, no one
-// ever subclassed regexp-position anyway.
+// ever subclassed regex-position anyway.
 
 
 // Caching
 //
-// Parsing a regexp is not cheap, so we cache the parsed regexps and
+// Parsing a regex is not cheap, so we cache the parsed regexs and
 // only parse a string if we haven't seen it before.  Because in
-// practice almost all regexp strings are string literals, we're free
+// practice almost all regex strings are string literals, we're free
 // to choose == or = depending on whatever's fastest.  However,
 // because a string is parsed differently depending on whether the
 // search is case sensitive or not, we also have to keep track of that
@@ -63,116 +60,116 @@
 // attributes such as case-sensitivity mid-parse, the way (I believe) perl
 // does?  --cgay
 
-// ### Currently, only regexp-position uses this cache, because the
-// other functions are still using make-regexp-positioner.  With
-// caching, that make-regexp-whatever stuff should probably go.
+// ### Currently, only regex-position uses this cache, because the
+// other functions are still using make-regex-positioner.  With
+// caching, that make-regex-whatever stuff should probably go.
 
 // <cache-key> -- internal
 //
-// What we use for keys in the *regexp-cache*.
+// What we use for keys in the *regex-cache*.
 //
 define class <cache-key> (<object>)
-  constant slot regexp-string :: <string>, 
-    required-init-keyword: #"regexp-string";
+  constant slot regex-string :: <string>, 
+    required-init-keyword: #"regex-string";
   constant slot character-set-type :: <class>, 
     required-init-keyword: #"character-set-type";
 end class <cache-key>;
 
 // <cache-element> -- internal
 //
-// What we use for elements in a *regexp-cache*
+// What we use for elements in a *regex-cache*
 //
 define class <cache-element> (<object>)
-  constant slot parse-tree :: <parsed-regexp>,
+  constant slot parse-tree :: <parsed-regex>,
     required-init-keyword: #"parse-tree";
   constant slot last-group :: <integer>,
     required-init-keyword: #"last-group";
 end class <cache-element>;
 
-// <regexp-cache> -- internal
+// <regex-cache> -- internal
 //
 // Maps <cache-key> to <cache-element>.  ### Ideally, we'd be using
 // weak pointers to these strings.  In practice, however, most of the
-// regexp strings are literals, so this isn't usually a drawback.
+// regex strings are literals, so this isn't usually a drawback.
 //
 // This used to compare strings with == rather than =, but this leaks
 // lots of memory
 // 
-define class <regexp-cache> (<table>) end;
+define class <regex-cache> (<table>) end;
 
-// table-protocol{<regexp-cache>} -- method on imported G.F.
+// table-protocol{<regex-cache>} -- method on imported G.F.
 //
-define method table-protocol (table :: <regexp-cache>) 
+define method table-protocol (table :: <regex-cache>) 
  => (equal? :: <function>, hash :: <function>);
   values(method (key1 :: <cache-key>, key2 :: <cache-key>) // equal?
 	  => res :: <boolean>;
-           key1.regexp-string = key2.regexp-string
+           key1.regex-string = key2.regex-string
              & key1.character-set-type == key2.character-set-type;
 	 end method,
 	 method (key :: <cache-key>, initial-state) => (id :: <integer>, state); // hash()
-	   let (string-id, string-state) = object-hash(key.regexp-string, initial-state);
+	   let (string-id, string-state) = object-hash(key.regex-string, initial-state);
 	   let (set-type-id, set-type-state) 
 	     = object-hash(key.character-set-type, string-state);
 	   values(merge-hash-ids(string-id, set-type-id, ordered: #t), set-type-state);
 	 end method);
 end method table-protocol;
 
-// *regexp-cache* -- internal
+// *regex-cache* -- internal
 //
-// The only instance of <regexp-cache>.  ### Not threadsafe.
+// The only instance of <regex-cache>.  ### Not threadsafe.
 // 
 // Technically not thread safe, but does it matter?  Worst case seems to
-// be a duplicated regexp parse.  --cgay
+// be a duplicated regex parse.  --cgay
 //
-define constant *regexp-cache* = make(<regexp-cache>);
+define constant *regex-cache* = make(<regex-cache>);
 
 // parse-or-use-cached -- internal
 //
-// Tries to use the cached version of the regexp, and if not possible,
+// Tries to use the cached version of the regex, and if not possible,
 // parses it and adds it to the cache.
 //
 define inline function parse-or-use-cached 
-    (regexp :: <string>, parse-info :: <parse-info>)
- => (parsed-regexp :: <parsed-regexp>, last-group :: <integer>);
-  let key = make(<cache-key>, regexp-string: regexp, 
+    (regex :: <string>, parse-info :: <parse-info>)
+ => (parsed-regex :: <parsed-regex>, last-group :: <integer>);
+  let key = make(<cache-key>, regex-string: regex, 
 		 character-set-type: parse-info.set-type); 
-  let cached-value = element(*regexp-cache*, key, default: #f);
+  let cached-value = element(*regex-cache*, key, default: #f);
   if (cached-value)
     values(cached-value.parse-tree, cached-value.last-group);
   else
-    let (parsed-regexp, last-group) = parse(regexp, parse-info);
-    *regexp-cache*[key] := make(<cache-element>,
-                                parse-tree: parsed-regexp,
+    let (parsed-regex, last-group) = parse(regex, parse-info);
+    *regex-cache*[key] := make(<cache-element>,
+                                parse-tree: parsed-regex,
 				last-group: last-group);
-    values(parsed-regexp, last-group);
+    values(parsed-regex, last-group);
   end if;
 end function parse-or-use-cached;
 
 
-// Regexp positioner stuff
+// regex positioner stuff
 
 // Find the position of a regular expression inside a string.  If the
-// regexp is not found, return #f, otherwise return a variable number
+// regex is not found, return #f, otherwise return a variable number
 // of marks.
 //
-define function regexp-position
-    (big :: <string>, regexp :: <string>, #key start: big-start = 0,
+define function regex-position
+    (big :: <string>, regex :: <string>, #key start: big-start = 0,
      end: big-end = #f, case-sensitive = #f)
- => (regexp-start :: false-or(<integer>), #rest marks :: false-or(<integer>));
+ => (regex-start :: false-or(<integer>), #rest marks :: false-or(<integer>));
   let substring = make(<substring>, string: big, start: big-start,
 		       end: big-end | big.size);
-  let (parsed-regexp, last-group) 
-    = parse-or-use-cached(regexp, make-parse-info(case-sensitive: case-sensitive));
+  let (parsed-regex, last-group) 
+    = parse-or-use-cached(regex, make-parse-info(case-sensitive: case-sensitive));
 
   let (matched, marks)
-    = if (parsed-regexp.is-anchored?)
-	anchored-match-root?(parsed-regexp, substring, case-sensitive,
+    = if (parsed-regex.is-anchored?)
+	anchored-match-root?(parsed-regex, substring, case-sensitive,
 			     last-group + 1, #f);
       else
-	let initial = parsed-regexp.initial-substring;
+	let initial = parsed-regex.initial-substring;
 	let searcher = ~initial.empty?
 	  & make-substring-positioner(initial, case-sensitive: case-sensitive);
-	match-root?(parsed-regexp, substring, case-sensitive, last-group + 1,
+	match-root?(parsed-regex, substring, case-sensitive, last-group + 1,
 		    searcher);
       end if;
   if (matched)  
@@ -180,31 +177,32 @@
   else
     #f  
   end if;
-end function regexp-position;
+end function regex-position;
 
 // Once upon a time, this was how you interfaced to the NFA stuff
 // (maximum-compile: #t).  That's gone.  Now it's just here for
 // backwards compatibility.  All keywords except case-sensitive are
 // now ignored.
 //
-define inline function make-regexp-positioner
-    (regexp :: <string>, 
+define inline function make-regex-positioner
+    (regex :: <string>, 
      #key byte-characters-only = #f, need-marks = #t, maximum-compile = #f,
      case-sensitive = #f)
- => regexp-positioner :: <function>;
+ => regex-positioner :: <function>;
   method (big :: <string>, #key start: big-start = 0,
 	  end: big-end = #f)
-   => (regexp-start :: false-or(<integer>), 
+   => (regex-start :: false-or(<integer>), 
        #rest marks :: false-or(<integer>));
-    regexp-position(big, regexp, case-sensitive: case-sensitive, 
+    regex-position(big, regex, case-sensitive: case-sensitive, 
 		    start: big-start, end: big-end);
   end method;
-end function make-regexp-positioner;
+end function make-regex-positioner;
 
 // returns #f if no match, the matching string on match, and another string or #f
-// for each group in the regexp.
-define method regexp-match(big :: <string>, regex :: <string>) => (#rest results);
-  let (#rest marks) = regexp-position(big, regex);
+// for each group in the regex.
+define method regex-match
+    (big :: <string>, regex :: <string>) => (#rest results);
+  let (#rest marks) = regex-position(big, regex);
   let result = make(<stretchy-vector>);
 
   if(marks[0])
@@ -223,26 +221,26 @@
 
 // #if (have-free-time)
 /*
-// regexp-matches -- exported
+// regex-matches -- exported
 //
-// A more convenient form of regexp-position.  Usually you want
+// A more convenient form of regex-position.  Usually you want
 // substrings that were matched by a group rather than the marks for
 // the group.  How you use this is you give the group numbers you
 // want, and it'll give you the strings.  (#f if that group wasn't
 // matched)
 //
-define function regexp-matches
-    (big :: <string>, regexp :: <string>,
+define function regex-matches
+    (big :: <string>, regex :: <string>,
      #key start: start-index :: <integer> = 0,
           end: end-index :: false-or(<integer>),
           case-sensitive :: <boolean> = #f,
           groups :: false-or(<sequence>))
  => (#rest group-strings :: false-or(<string>));
   if (~groups)
-    error("Mandatory keyword groups: not used in call to regexp-matches");
+    error("Mandatory keyword groups: not used in call to regex-matches");
   end if;
   let (#rest marks)
-    = regexp-position(big, regexp, start: start-index, end: end-index, 
+    = regex-position(big, regex, start: start-index, end: end-index, 
 		      case-sensitive: case-sensitive);
   let return-val = make(<vector>, size: groups.size, fill: #f);
   for (index from 0 below return-val.size)
@@ -278,23 +276,23 @@
 */
 
 
-// Functions based on regexp-position
+// Functions based on regex-position
 
-define function regexp-replace
-    (input :: <string>, regexp :: <string>, new-substring :: <string>,
+define function regex-replace
+    (input :: <string>, regex :: <string>, new-substring :: <string>,
      #key count = #f, case-sensitive = #f, start = 0, end: input-end = #f)
  => changed-string :: <string>;
   let positioner
-    = make-regexp-positioner(regexp, case-sensitive: case-sensitive);
+    = make-regex-positioner(regex, case-sensitive: case-sensitive);
   do-replacement(positioner, new-substring, input, start, 
 		 input-end, count, #t);
-end function regexp-replace;
+end function regex-replace;
 
-define inline function make-regexp-replacer 
-    (regexp :: <string>, #key replace-with, case-sensitive = #f)
+define inline function make-regex-replacer 
+    (regex :: <string>, #key replace-with, case-sensitive = #f)
  => replacer :: <function>;
   let positioner
-    = make-regexp-positioner(regexp, case-sensitive: case-sensitive);
+    = make-regex-positioner(regex, case-sensitive: case-sensitive);
   if (replace-with)
     method (input :: <string>, #key count: count, 
 	    start = 0, end: input-end = #f)
@@ -310,141 +308,7 @@
 		     start, input-end, count, #t);
     end method;
   end if;
-end function make-regexp-replacer;
-
-// equivalent of Perl's tr.  Does a character by character translation.
-//
-define open generic translate
-    (input :: <string>, from-set :: <string>, to-set :: <string>, 
-     #key delete, start, end: the-end)
- => output :: <string>;
-
-//The existing methods only work on byte-strings.
-//
-define method translate
-    (input :: <byte-string>, from-set :: <byte-string>,
-     to-set :: <byte-string>,
-     #key delete: delete = #f, start = 0, end: input-end = #f)
- => output :: <byte-string>;
-  let table = make-translation-table(from-set, to-set, delete: delete);
-  run-translator(input, table, start, input-end | size(input));
-end method translate;
-
-define open generic make-translator
-    (from-set :: <string>, to-set :: <string>, #key delete)
- => translator :: <function>;
-
-// Again, only byte-strings handled here
-//
-define method make-translator
-    (from-set :: <byte-string>, to-set :: <byte-string>,
-     #key delete: delete = #f)
- => translator :: <function>;
-  let table = make-translation-table(from-set, to-set, delete: delete);
-  method (input :: <byte-string>, #key start = 0, end: input-end = #f)
-   => output :: <byte-string>;
-    run-translator(input, table, start, input-end | size(input));
-  end method;
-end method make-translator;
-
-// Used by translate.  Not exported.
-//
-define function make-translation-table
-    (from-set :: <byte-string>, to-set :: <byte-string>,
-     #key delete: delete = #f)
- => table :: <byte-character-table>;
-  let from-index = 0;
-  let to-index = 0;
-  let previous-from = #f;
-  let previous-to = #f;
-
-     // These local methods are identical except for the 
-     // choice of variable names and next-from-character signals end of
-     // string rather than repeating the last character indefinitely like
-     // next-to-character does.
-  local method next-from-character ()
-	  if (from-index >= size(from-set))
-	    #f;
-	  elseif (from-set[from-index] = '\\')
-	    from-index := from-index + 2;
-	    previous-from := from-set[from-index - 1];
-	  elseif (from-set[from-index] = '-')
-	    if (previous-from = from-set[from-index + 1])
-	      from-index := from-index + 1;
-	      from-set[from-index];
-	    else
-	      previous-from := successor(previous-from); 
-	              // and return that value
-	    end if;
-	  else
-	    from-index := from-index + 1;
-	    previous-from := from-set[from-index - 1];
-	  end if;
-	end method next-from-character;
-
-  local method next-to-character ()
-	  if (to-index >= size(to-set))
-	    if (delete)  #f  else  last(to-set)  end;
-	  elseif (to-set[to-index] = '\\')
-	    to-index := to-index + 2;
-	    previous-to := to-set[to-index - 1];
-	  elseif (to-set[to-index] = '-')
-	    if (previous-to = to-set[to-index + 1])
-	      to-index := to-index + 1;
-	      to-set[to-index];
-	    else
-	      previous-to := successor(previous-to); 
-	              // and return that value
-	    end if;
-	  else
-	    to-index := to-index + 1;
-	    previous-to := to-set[to-index - 1];
-	  end if;
-	end method next-to-character;
-
-  let table = make(<byte-character-table>);
-  // Wish I had keyed-by
-  let (state, limit, next, done?, cur-key, cur-elem)
-    = forward-iteration-protocol(table);
-  for (st = state then next(table, st), until: done?(table, st, limit))
-    let c = cur-key(table, st);
-    table[c] := c;
-  end for;
-
-  for (from-char = next-from-character() then next-from-character(),
-       to-char = next-to-character() then next-to-character(),
-       until: from-char = #f)
-    table[from-char] := to-char;
-  end for;
-
-  table;
-end function make-translation-table;
-
-// Used by translate.  Not exported.
-//
-define function run-translator
-    (source :: <byte-string>, table :: <byte-character-table>, 
-     start-index :: <integer>, end-index :: <integer>)
- => output :: <byte-string>;
-  let dest-string = copy-sequence(source);
-  let dest-index = start-index;
-  for (source-index from start-index below end-index)
-    let char = source[source-index];
-    if (table[char] ~== #f)
-      dest-string[dest-index] := table[char];
-      dest-index := dest-index + 1;
-    end if;
-  end for;
-
-      // Now resize dest-string, because deleting characters in the
-      // translation would make dest-string shorter than we've
-      // allocated.
-  if (dest-index = end-index)
-    dest-string;
-  else
-    replace-subsequence!(dest-string, "", start: dest-index, end: end-index);
-  end if;
-end function run-translator;
+end function make-regex-replacer;
 
 // Like Perl's split function
 //
@@ -452,14 +316,14 @@
     (pattern :: <string>, input :: <string>, 
      #key count = #f, remove-empty-items = #t, start = 0, end: input-end = #f)
  => (#rest whole-bunch-of-strings :: <string>);
-  let positioner = make-regexp-positioner(pattern);
+  let positioner = make-regex-positioner(pattern);
   split-string(positioner, input, start, input-end | size(input),
 	       count, remove-empty-items);
 end function split;
 
 define inline function make-splitter
     (pattern :: <string>) => splitter :: <function>;
-  let positioner = make-regexp-positioner(pattern);
+  let positioner = make-regex-positioner(pattern);
   method (string :: <string>, #key count = #f,
 	  remove-empty-items = #t, start = 0, end: input-end = #f)
    => (#rest whole-bunch-of-strings :: <string>);
@@ -542,3 +406,5 @@
   end if;
   big-string;
 end function join;
+
+

Modified: trunk/libraries/regular-expressions/match.dylan
==============================================================================
--- trunk/libraries/regular-expressions/match.dylan	(original)
+++ trunk/libraries/regular-expressions/match.dylan	Sun Dec  2 15:16:47 2007
@@ -1,4 +1,4 @@
-module:   regular-expressions-impl
+module:   regular-expressions
 author:   Nick Kramer (nkramer at cs.cmu.edu)
 synopsis: This takes a parsed regular expression and tries to find a match
           for it.
@@ -36,7 +36,7 @@
 // Details of match:
 
 // This whole thing is rather hairy.  Basically, it creates a "path"
-// through the regexp parse tree that corresponds to a match of the
+// through the regex parse tree that corresponds to a match of the
 // string.  A path is a round trip through a parse tree that starts
 // and ends at the root. The part of the path already travelled is the
 // call stack, and hints about the untravelled part of the path are
@@ -58,10 +58,10 @@
 // its own non-local exit so that it can try to match its part
 // differently.
 
-// As an example, a <union> is "regexp #1 or regexp #2".  When
+// As an example, a <union> is "regex #1 or regex #2".  When
 // descend-re(<union>...) is called, it'll set up a non-local exit and
-// then descend-re on regexp #1.  If someone backtracks out of regexp
-// #1, descend-re(<union>) will try regexp #2.  If someone backtracks
+// then descend-re on regex #1.  If someone backtracks out of regex
+// #1, descend-re(<union>) will try regex #2.  If someone backtracks
 // out of that, descend-re(<union>) will give up and backtrack itself.
 
 // When this chain of functions completes a match, it'll stumble upon
@@ -80,7 +80,7 @@
 // Match-root?: Set things up and call descend-re.
 //
 define method match-root?
-    (re :: <parsed-regexp>, target :: <substring>,
+    (re :: <parsed-regex>, target :: <substring>,
      case-sensitive? :: <boolean>, num-groups :: <integer>,
      searcher :: false-or(<function>))
  => (answer :: <boolean>, marks :: <sequence>);
@@ -107,7 +107,7 @@
 	    block (fail)
 	      descend-re(re, target, case-sensitive?, index,
 			 marks, fail, list(up-proc));
-	      error("A regexp should either match or not match. Why did it "
+	      error("A regex should either match or not match. Why did it "
                     "reach this piece of code?");
 	    end block;
 	  end for;
@@ -117,7 +117,7 @@
 	    block (fail)
 	      descend-re(re, target, case-sensitive?, index,
 			 marks, fail, list(up-proc));
-	      error("A regexp should either match or not match. Why did "
+	      error("A regex should either match or not match. Why did "
 		      "it reach this piece of code?");
 	    end block;
 	  end for;
@@ -131,7 +131,7 @@
 // starts with "^".
 //
 define method anchored-match-root?
-    (re :: <parsed-regexp>, target :: <substring>,
+    (re :: <parsed-regex>, target :: <substring>,
      case-sensitive? :: <boolean>, num-groups :: <integer>,
      searcher :: false-or(<function>))
  => (answer :: <boolean>, marks :: <sequence>);
@@ -147,7 +147,7 @@
 	block (fail)
 	  descend-re(re, target, case-sensitive?, target.start-index,
 		     marks, fail, list(up-proc));
-	  error("A regexp should either match or not match. Why did it "
+	  error("A regex should either match or not match. Why did it "
 		  "reach this piece of code?");
 	end block;
 	values(#f);      // Failure
@@ -156,7 +156,7 @@
 end method anchored-match-root?;
 
 define generic descend-re
-    (re :: false-or(<parsed-regexp>), target :: <substring>,
+    (re :: false-or(<parsed-regex>), target :: <substring>,
      case-sensitive? :: <boolean>, start-index :: <integer>,
      marks :: <mutable-sequence>, backtrack-past-me :: <non-local-exit>,
      up-list :: <list> /* of <non-local-exit> */) => ();
@@ -269,7 +269,7 @@
      marks :: <mutable-sequence>, backtrack-past-me :: <non-local-exit>,
      up-list :: <list>) => ();
   local method descend-and-quantify (min :: <integer>, max, 
-				     re :: <parsed-regexp>, index :: <integer>,
+				     re :: <parsed-regex>, index :: <integer>,
 				     backtrack-past-me :: <non-local-exit>,
 				     up-list)
 

Modified: trunk/libraries/regular-expressions/od-library.dylan
==============================================================================
--- trunk/libraries/regular-expressions/od-library.dylan	(original)
+++ trunk/libraries/regular-expressions/od-library.dylan	Sun Dec  2 15:16:47 2007
@@ -31,52 +31,19 @@
 //
 //======================================================================
 
-
-// Added regex module with new API.  --cgay, June 2007
+// Revamped.  --cgay Dec 2007
 
 define library regular-expressions
   use common-dylan;
   use string-extensions;
   export
-    regexp,                                            // new API
-    regular-expressions;                               // old API
-end library regular-expressions;
-
-define module regexp                  // new API module
-  create
-    compile-regexp,
-    regexp-search,
-    <regexp>,
-    <regexp-error>,
-    <invalid-regexp>,
-      regexp-pattern,
-    <regexp-match>,                   // results of a successful search
-      regexp-match-group,
-      regexp-match-groups,
-      group-start,
-      group-end,
-      group-text,
-      <invalid-match-group>;
-end module regexp;
-
-define module regular-expressions    // old API module
-  create
-    regexp-position, make-regexp-positioner,
-    regexp-match,
-    regexp-replace, make-regexp-replacer,
-    regexp-group-count,
-    translate, make-translator,
-    split, make-splitter,
-    join,
-    <illegal-regexp>;
-  create
-    split-string;
-end module regular-expressions;
+    regular-expressions;
+end;
 
-define module regular-expressions-impl
+define module regular-expressions
   use common-dylan,
     exclude: {
-      split
+      split    // todo -- just add a method to this one
     };
   use string-conversions;
   use character-type;
@@ -84,7 +51,32 @@
   use %do-replacement;
   use %parse-string;
   use substring-search;
-  use regular-expressions;                      // old API module
-  use regexp;                                   // new API module
-end module regular-expressions-impl;
+  export
+    compile-regex,
+    <regex>,
+      regex-search,
+      regex-group-count,
+      regex-position,
+      make-regex-positioner,
+      regex-match,  // todo -- rename to regex-search-strings?
+      regex-replace,
+      make-regex-replacer,
+    <regex-error>,
+    <invalid-regex>,
+      regex-pattern,
+    <regex-match>,                   // results of a successful search
+      <match-group>,
+      match-group,
+      match-groups,
+      group-start,
+      group-end,
+      group-text,
+      <invalid-match-group>,
+
+    split,
+    make-splitter,
+    join;
+  export
+    split-string;   // ???
+end module regular-expressions;
 

Modified: trunk/libraries/regular-expressions/parse.dylan
==============================================================================
--- trunk/libraries/regular-expressions/parse.dylan	(original)
+++ trunk/libraries/regular-expressions/parse.dylan	Sun Dec  2 15:16:47 2007
@@ -1,4 +1,4 @@
-module: regular-expressions-impl
+module: regular-expressions
 author: Nick Kramer (nkramer at cs.cmu.edu)
 copyright: see below
 
@@ -31,7 +31,7 @@
 
 // This is a program to parse regular expressions. The grammar I'm using is:
 //
-//      <regexp> ::= <alternative> | <alternative>|<regexp>
+//      <regex> ::= <alternative> | <alternative>|<regex>
 //
 //      <alternative> ::= <quantified-atom> | <quantified-atom><alternative>
 //
@@ -42,7 +42,7 @@
 //
 //      <atom> ::= <subpattern> | <extended-character>
 //
-//      <subpattern> ::= (<options> <regexp>)
+//      <subpattern> ::= (<options> <regex>)
 //
 //      <options> ::= ?: | ?P<name> | ?P=name | ?# | etc
 //
@@ -53,51 +53,51 @@
 // expression component.  Match.dylan could go into an infinite loop
 // if given this.
 
-define abstract class <parsed-regexp> (<object>)
-end class <parsed-regexp>;
+define abstract class <parsed-regex> (<object>)
+end class <parsed-regex>;
 
-define class <mark> (<parsed-regexp>)
-  slot child :: <parsed-regexp>,  required-init-keyword: #"child";
+define class <mark> (<parsed-regex>)
+  slot child :: <parsed-regex>,  required-init-keyword: #"child";
   constant slot group-number :: <integer>, required-init-keyword: #"group";
 end class <mark>;
 
-// The root of the parsed regexp, i.e., this is what's returned by the parser.
-define class <regexp> (<mark>)
+// The root of the parsed regex, i.e., this is what's returned by the parser.
+define class <regex> (<mark>)
   // exported
-  constant slot regexp-pattern :: <string>,
+  constant slot regex-pattern :: <string>,
     required-init-keyword: pattern:;
   // exported
-  constant slot regexp-group-count :: <integer>,
+  constant slot regex-group-count :: <integer>,
     required-init-keyword: group-count:;
-  // internal.  This is only needed when making a <regexp-match> after
+  // internal.  This is only needed when making a <regex-match> after
   // a successful search.
   constant slot group-number-to-name :: <table>,
     required-init-keyword: group-number-to-name:;
-end class <regexp>;
+end class <regex>;
 
-define class <union> (<parsed-regexp>)          //    |
-  slot left  :: <parsed-regexp>, required-init-keyword: #"left";
-  slot right :: <parsed-regexp>, required-init-keyword: #"right";
+define class <union> (<parsed-regex>)          //    |
+  slot left  :: <parsed-regex>, required-init-keyword: #"left";
+  slot right :: <parsed-regex>, required-init-keyword: #"right";
 end class <union>;
 
-define class <alternative> (<parsed-regexp>)    // concatenation
-  slot left :: <parsed-regexp>,  required-init-keyword: #"left";
-  slot right :: <parsed-regexp>, required-init-keyword: #"right";
+define class <alternative> (<parsed-regex>)    // concatenation
+  slot left :: <parsed-regex>,  required-init-keyword: #"left";
+  slot right :: <parsed-regex>, required-init-keyword: #"right";
 end class <alternative>;
 
-define class <parsed-assertion> (<parsed-regexp>)
+define class <parsed-assertion> (<parsed-regex>)
   constant slot asserts :: <symbol>, required-init-keyword: #"assertion";
 end class <parsed-assertion>;
 
-define class <quantified-atom> (<parsed-regexp>)
-  slot atom :: <parsed-regexp>, required-init-keyword: #"atom";
+define class <quantified-atom> (<parsed-regex>)
+  slot atom :: <parsed-regex>, required-init-keyword: #"atom";
   constant slot min-matches :: <integer>, init-value: 0,
     init-keyword: #"min";
   constant slot max-matches :: false-or(<integer>), init-value: #f, 
     init-keyword: #"max";
 end class <quantified-atom>;
 
-define abstract class <parsed-atom> (<parsed-regexp>)
+define abstract class <parsed-atom> (<parsed-regex>)
 end class <parsed-atom>;
 
 define class <parsed-character> (<parsed-atom>)
@@ -117,21 +117,21 @@
 end class <parsed-backreference>;
 
 // Note: I'm pretty sure <simple-error> won't work in GD.  --cgay
-define class <regexp-error> (<simple-error>)
-end class <regexp-error>;
+define class <regex-error> (<simple-error>)
+end class <regex-error>;
 
-define class <illegal-regexp> (<regexp-error>)
-  constant slot regexp-pattern :: <string>, 
+define class <illegal-regex> (<regex-error>)
+  constant slot regex-pattern :: <string>, 
     required-init-keyword: #"pattern";
-end class <illegal-regexp>;
+end class <illegal-regex>;
 
-define sealed domain make (singleton(<illegal-regexp>));
-define sealed domain initialize (<illegal-regexp>);
+define sealed domain make (singleton(<illegal-regex>));
+define sealed domain initialize (<illegal-regex>);
 
 define function parse-error
     (pattern :: <string>, format-string :: <string>, #rest format-args)
   let msg = apply(format-to-string, format-string, format-args);
-  signal(make(<illegal-regexp>,
+  signal(make(<illegal-regex>,
               format-string: "Invalid regular expression: %=.  %s",
               format-arguments: list(pattern, msg),
               pattern: pattern));
@@ -143,16 +143,16 @@
               else
                 thing
               end;
-  signal(make(<regexp-error>,
+  signal(make(<regex-error>,
               format-string: "The %s is not yet implemented.",
               format-arguments: list(thing)));
 end;
 
-// <parse-info> contains some information about the current regexp
+// <parse-info> contains some information about the current regex
 // being parsed.
 //
 define class <parse-info> (<object>)
-  // Whether or not the function includes \1, \2, etc in the regexp.
+  // Whether or not the function includes \1, \2, etc in the regex.
   // Name this has-backreferences, for consistency with the other slots.
   // Add ? to all the has-* slots.  --cgay
   // Also, not sure why anyone cares about these three things.
@@ -184,36 +184,29 @@
 
 define method has-named-group?
     (info :: <parse-info>, name :: <string>)
-  block (return)
-    for (group-name in info.group-number-to-name)
-      if (name = group-name)
-        return(#t)
-      end;
-    end;
-    #f
-  end;
+  member?(name, info.group-number-to-name, test: \=)
 end;
 
 define method parse
-    (regexp :: <string>, parse-info :: <parse-info>)
- => (parsed-regexp :: <parsed-regexp>,
+    (regex :: <string>, parse-info :: <parse-info>)
+ => (parsed-regex :: <parsed-regex>,
      last-group :: <integer>,
      backrefs? :: <boolean>,
      alternatives? :: <boolean>, 
      quantifiers? :: <boolean>)
-  let parse-string = make(<parse-string>, string: regexp);
-  let child = parse-regexp(parse-string, parse-info);
-  let parse-tree = make(<regexp>,
-                        pattern: regexp,
+  let parse-string = make(<parse-string>, string: regex);
+  let child = parse-regex(parse-string, parse-info);
+  let parse-tree = make(<regex>,
+                        pattern: regex,
                         group: 0,
                         group-count: parse-info.current-group-number + 1,
                         group-number-to-name: parse-info.group-number-to-name,
 			child: child);
-  let optimized-regexp = optimize(parse-tree);
-  if (optimized-regexp.pathological?)
-    parse-error(regexp, "A subpattern that matches the empty string was quantified.");
+  let optimized-regex = optimize(parse-tree);
+  if (optimized-regex.pathological?)
+    parse-error(regex, "A subpattern that matches the empty string was quantified.");
   else
-    values(optimized-regexp,
+    values(optimized-regex,
 	   parse-info.current-group-number,
 	   parse-info.backreference-used,
 	   parse-info.has-alternatives,
@@ -221,22 +214,22 @@
   end if;
 end method parse;
 
-define method parse-regexp (s :: <parse-string>, info :: <parse-info>)
- => parsed-regexp :: <parsed-regexp>;
+define method parse-regex (s :: <parse-string>, info :: <parse-info>)
+ => parsed-regex :: <parsed-regex>;
   let alternative = parse-alternative(s, info);
   if (~alternative)
     parse-error(s.parse-string, "");
   elseif (lookahead(s) = '|')
     info.has-alternatives := #t;
-    make(<union>, left: alternative, right: parse-regexp(consume(s), info));
+    make(<union>, left: alternative, right: parse-regex(consume(s), info));
   else
     alternative;
   end if;
-end method parse-regexp;
+end method parse-regex;
 
 define method parse-alternative
     (s :: <parse-string>, info :: <parse-info>)
- => (re :: false-or(<parsed-regexp>))
+ => (re :: false-or(<parsed-regex>))
   let term = parse-quantified-atom(s, info);
   if (member?(lookahead(s), #(#f, '|', ')')))
     term;
@@ -246,7 +239,7 @@
 end method parse-alternative;
 
 define method parse-quantified-atom (s :: <parse-string>, info :: <parse-info>)
- => (result :: false-or(<parsed-regexp>))
+ => (result :: false-or(<parsed-regex>))
   // I think this breaks when parse-atom returns #f and then we quantify that.
   // I added some regexes to regression-tests.txt starting with /a()b/ that I
   // hope will exercise that case.  --cgay
@@ -281,7 +274,7 @@
 // {m,n}, {m,}, {,n}, {m}, {}, and {,} are all valid.
 // m defaults to 0 and n defaults to #f (unlimited).
 define method parse-minmax-quantifier
-    (atom :: <parsed-regexp>, s :: <parse-string>) => (qatom :: <quantified-atom>)
+    (atom :: <parsed-regex>, s :: <parse-string>) => (qatom :: <quantified-atom>)
   local method parse-integer () => (int :: false-or(<integer>))
           let digits = make(<deque>);
           while (lookahead(s) & digit?(lookahead(s)))
@@ -308,7 +301,7 @@
 end method parse-minmax-quantifier;
 
 define method parse-atom (s :: <parse-string>, info :: <parse-info>)
- => (regexp :: false-or(<parsed-regexp>))
+ => (regex :: false-or(<parsed-regex>))
   let char = lookahead(s);
   select (char)
     '(' =>
@@ -359,7 +352,7 @@
 //
 define inline function parse-group
     (str :: <parse-string>, info :: <parse-info>)
- => (mark :: false-or(<parsed-regexp>))
+ => (mark :: false-or(<parsed-regex>))
   let char = lookahead(str);
   if (char == '?')
     consume(str);
@@ -373,7 +366,7 @@
 //
 define inline function parse-extended-group
     (str :: <parse-string>, info :: <parse-info>)
- => (mark :: false-or(<parsed-regexp>))
+ => (mark :: false-or(<parsed-regex>))
   let char = lookahead(str);
   consume(str);
   select (char)
@@ -398,18 +391,18 @@
         consume(str);
       end;
       if (~ lookahead(str))
-        parse-error(str.parse-string, "Unterminated (?# comment.");
+        parse-error(str.parse-string, "Unterminated subpattern commend (?#....");
       else
         #f
       end;
 
     otherwise =>
       // See the Python re docs for what all these do.
-      if (member?(char, "iLmsux#=!<("))
-        not-yet-implemented("'(?%c' subpattern construct", lookahead(str));
+      if (member?(char, "iLmsux=!<("))
+        not-yet-implemented("'(?%c' subpattern construct", char);
       else
-        parse-error(str.parse-string, "Invalid (? construct at index %s.",
-                    str.parse-index);
+        parse-error(str.parse-string, "Invalid subpattern construct (?%c...) at index %s.",
+                    char, str.parse-index);
       end;
   end select
 end function parse-extended-group;
@@ -440,18 +433,18 @@
      info :: <parse-info>,
      save-group? :: <boolean>,
      group-name :: false-or(<string>))
- => (mark :: false-or(<parsed-regexp>))
+ => (mark :: false-or(<parsed-regex>))
   if (save-group?)
     info.current-group-number := info.current-group-number + 1;
   end;
-  let regexp = parse-regexp(str, info);
+  let regex = parse-regex(str, info);
   if (lookahead(str) ~== ')')
-    parse-error(str.parse-string, "Unbalanced parens in regexp (index = %s).",
+    parse-error(str.parse-string, "Unbalanced parens in regex (index = %s).",
                 str.parse-index);
   else
     consume(str);
     if (~ save-group?)
-      regexp
+      regex
     else
       if (group-name)
         if (has-named-group?(info, group-name))
@@ -462,7 +455,7 @@
           info.group-number-to-name[info.current-group-number] := group-name;
         end;
       end;
-      make(<mark>, child: regexp, group: info.current-group-number)
+      make(<mark>, child: regex, group: info.current-group-number)
     end
   end
 end function parse-simple-group;
@@ -544,7 +537,7 @@
 //
 define method parse-escaped-character 
     (s :: <parse-string>, info :: <parse-info>)
- => parsed-regexp :: <parsed-regexp>;
+ => parsed-regex :: <parsed-regex>;
   let next-char = lookahead(s);
   if (~next-char)
     parse-error(s.parse-string,
@@ -583,36 +576,36 @@
   end select;
 end method parse-escaped-character;
 
-define method is-anchored? (regexp :: <parsed-regexp>)
+define method is-anchored? (regex :: <parsed-regex>)
  => (result :: <boolean>);
-  select (regexp by instance?)
-    <mark> => is-anchored?(regexp.child);
-    <alternative> => is-anchored?(regexp.left);
-    <parsed-assertion> => regexp.asserts == #"beginning-of-string";
+  select (regex by instance?)
+    <mark> => is-anchored?(regex.child);
+    <alternative> => is-anchored?(regex.left);
+    <parsed-assertion> => regex.asserts == #"beginning-of-string";
     otherwise => #f;
   end select;
 end method is-anchored?;
 
-define method initial-substring (regexp :: <parsed-regexp>)
+define method initial-substring (regex :: <parsed-regex>)
  => (result :: <string>);
   let result = make(<deque>);
-  local method init (regexp :: <parsed-regexp>, result :: <deque>)
-	  select (regexp by instance?)
+  local method init (regex :: <parsed-regex>, result :: <deque>)
+	  select (regex by instance?)
 	    <alternative> =>
-	      init(regexp.left, result) & init(regexp.right, result);
+	      init(regex.left, result) & init(regex.right, result);
 	    <parsed-character> =>
-	      push-last(result, regexp.character);
+	      push-last(result, regex.character);
 	    <parsed-string> =>
-	      for (ch in regexp.string) push-last(result, ch) end for;
+	      for (ch in regex.string) push-last(result, ch) end for;
 	    <mark> =>
-	      init(regexp.child, result);
+	      init(regex.child, result);
 	    <parsed-assertion> =>
 	      #t;
 	    otherwise =>
 	      #f;
 	  end select;
 	end method init;
-  init(regexp, result);
+  init(regex, result);
   as(<byte-string>, result);
 end method initial-substring;
 
@@ -620,17 +613,17 @@
 // Currently the only optimization is merging adjacent characters into
 // a string.
 //
-define method optimize (regexp :: <parsed-regexp>)
- => (regexp :: <parsed-regexp>);
-  select (regexp by instance?)
+define method optimize (regex :: <parsed-regex>)
+ => (regex :: <parsed-regex>);
+  select (regex by instance?)
     <mark> =>
-      regexp.child := optimize(regexp.child);
-      regexp;
+      regex.child := optimize(regex.child);
+      regex;
     <alternative> =>
-      if (instance?(regexp.left, <parsed-character>))
+      if (instance?(regex.left, <parsed-character>))
 	let result-str = make(<deque>);
-	push-last(result-str, regexp.left.character);
-	for (next = regexp.right then next.right,
+	push-last(result-str, regex.left.character);
+	for (next = regex.right then next.right,
 	     while: (instance?(next, <alternative>)
 		       & instance?(next.left, <parsed-character>)))
 	  push-last(result-str, next.left.character)
@@ -639,8 +632,8 @@
 	    push-last(result-str, next.character);
 	    make(<parsed-string>, string: as(<string>, result-str));
 	  elseif (result-str.size = 1)
-	    regexp.right := optimize(regexp.right);
-	    regexp;
+	    regex.right := optimize(regex.right);
+	    regex;
 	  else
 	    make(<alternative>,
 		 left: make(<parsed-string>, string: as(<string>, result-str)),
@@ -648,36 +641,36 @@
 	  end if;
 	end for;
       else
-	regexp.left := optimize(regexp.left);
-	regexp.right := optimize(regexp.right);
-	regexp;
+	regex.left := optimize(regex.left);
+	regex.right := optimize(regex.right);
+	regex;
       end if;
     <union> =>
-      regexp.left := optimize(regexp.left);
-      regexp.right := optimize(regexp.right);
-      regexp;
+      regex.left := optimize(regex.left);
+      regex.right := optimize(regex.right);
+      regex;
     <quantified-atom> =>
-      regexp.atom := optimize(regexp.atom);
-      regexp;
+      regex.atom := optimize(regex.atom);
+      regex;
     otherwise =>
-      regexp;
+      regex;
   end select;
 end method optimize;
 
 // We have to somehow deal with pathological regular expressions like
 // ".**".  Perl simply signals an error in this case.  We *could* in
-// fact match these pathological regexps using the formulation below,
+// fact match these pathological regexs using the formulation below,
 // but it doesn't seem worth the trouble.  Frankly, I doubt anyone has
-// ever tried to use such a pathological regexp and *not* have done it
+// ever tried to use such a pathological regex and *not* have done it
 // by mistake.  But in case I'm wrong, here's how to fix a
-// pathological regexp:
+// pathological regex:
 //
-// First, realize that pathological regexps stem from infinitely
+// First, realize that pathological regexs stem from infinitely
 // quantifying subpatterns that could match the empty string.  So what
 // we do is find this subpattern, and perform the following
 // transformation:
 //
-//  case (type of regexp)
+//  case (type of regex)
 //    r1r2 => r1'r2|r2'
 //    r1|r2 => r1'|r2'
 //    r1{0,n} => r1'{1,n}
@@ -685,90 +678,90 @@
 //    atom => atom
 //    assertion => can't be done
 //
-// This transformation turns a might-match-emptystring regexp into a
-// regexp that matches the same set of strings minus the empty string.
+// This transformation turns a might-match-emptystring regex into a
+// regex that matches the same set of strings minus the empty string.
 // If this transformation can't be done, remember that "$*" is
 // equivalent to "always true and consumes no input".
 
 
-define generic matches-empty-string? (regexp :: <parsed-regexp>)
+define generic matches-empty-string? (regex :: <parsed-regex>)
  => answer :: <boolean>;
 
-define method matches-empty-string? (regexp :: <parsed-atom>)
+define method matches-empty-string? (regex :: <parsed-atom>)
  => answer :: <boolean>;
   #f;
 end method matches-empty-string?;
 
-define method matches-empty-string? (regexp :: <parsed-assertion>)
+define method matches-empty-string? (regex :: <parsed-assertion>)
  => answer :: <boolean>;
   #t;
 end method matches-empty-string?;
 
-define method matches-empty-string? (regexp :: <mark>)
+define method matches-empty-string? (regex :: <mark>)
  => answer :: <boolean>;
-  regexp.child.matches-empty-string?;
+  regex.child.matches-empty-string?;
 end method matches-empty-string?;
 
-define method matches-empty-string? (regexp :: <union>)
+define method matches-empty-string? (regex :: <union>)
  => answer :: <boolean>;
-  regexp.left.matches-empty-string? | regexp.right.matches-empty-string?;
+  regex.left.matches-empty-string? | regex.right.matches-empty-string?;
 end method matches-empty-string?;
 
-define method matches-empty-string? (regexp :: <alternative>)
+define method matches-empty-string? (regex :: <alternative>)
  => answer :: <boolean>;
-  regexp.left.matches-empty-string? & regexp.right.matches-empty-string?;
+  regex.left.matches-empty-string? & regex.right.matches-empty-string?;
 end method matches-empty-string?;
 
-define method matches-empty-string? (regexp :: <quantified-atom>)
+define method matches-empty-string? (regex :: <quantified-atom>)
  => answer :: <boolean>;
-   regexp.min-matches == 0 | regexp.atom.matches-empty-string?;
+   regex.min-matches == 0 | regex.atom.matches-empty-string?;
 end method matches-empty-string?;
 
 
-define generic pathological? (regexp :: <parsed-regexp>)
+define generic pathological? (regex :: <parsed-regex>)
  => answer :: <boolean>;
 
-define method pathological? (regexp :: <parsed-atom>)
+define method pathological? (regex :: <parsed-atom>)
  => answer :: <boolean>;
   #f;
 end method pathological?;
 
-define method pathological? (regexp :: <parsed-assertion>)
+define method pathological? (regex :: <parsed-assertion>)
  => answer :: <boolean>;
   #f;
 end method pathological?;
 
-define method pathological? (regexp :: <mark>)
+define method pathological? (regex :: <mark>)
  => answer :: <boolean>;
-  regexp.child.pathological?;
+  regex.child.pathological?;
 end method pathological?;
 
-define method pathological? (regexp :: <union>)
+define method pathological? (regex :: <union>)
  => answer :: <boolean>;
-  regexp.left.pathological? | regexp.right.pathological?;
+  regex.left.pathological? | regex.right.pathological?;
 end method pathological?;
 
-define method pathological? (regexp :: <alternative>)
+define method pathological? (regex :: <alternative>)
  => answer :: <boolean>;
-  regexp.left.pathological? | regexp.right.pathological?;
+  regex.left.pathological? | regex.right.pathological?;
 end method pathological?;
 
-define method pathological? (regexp :: <quantified-atom>)
+define method pathological? (regex :: <quantified-atom>)
  => answer :: <boolean>;
-  regexp.max-matches == #f & regexp.atom.matches-empty-string?;
+  regex.max-matches == #f & regex.atom.matches-empty-string?;
 end method pathological?;
 
 // Seals for file parse.dylan
 
-// <mark> -- subclass of <parsed-regexp>
+// <mark> -- subclass of <parsed-regex>
 define sealed domain make(singleton(<mark>));
-// <union> -- subclass of <parsed-regexp>
+// <union> -- subclass of <parsed-regex>
 define sealed domain make(singleton(<union>));
-// <alternative> -- subclass of <parsed-regexp>
+// <alternative> -- subclass of <parsed-regex>
 define sealed domain make(singleton(<alternative>));
-// <parsed-assertion> -- subclass of <parsed-regexp>
+// <parsed-assertion> -- subclass of <parsed-regex>
 define sealed domain make(singleton(<parsed-assertion>));
-// <quantified-atom> -- subclass of <parsed-regexp>
+// <quantified-atom> -- subclass of <parsed-regex>
 define sealed domain make(singleton(<quantified-atom>));
 // <parsed-character> -- subclass of <parsed-atom>
 define sealed domain make(singleton(<parsed-character>));

Modified: trunk/libraries/regular-expressions/regex.dylan
==============================================================================
--- trunk/libraries/regular-expressions/regex.dylan	(original)
+++ trunk/libraries/regular-expressions/regex.dylan	Sun Dec  2 15:16:47 2007
@@ -1,4 +1,4 @@
-Module: regular-expressions-impl
+Module: regular-expressions
 Author: Carl Gay
 Synopsis: A new API for the regular-expressions library
 
@@ -7,7 +7,7 @@
 
 
 
-define constant <invalid-regexp> = <illegal-regexp>;
+define constant <invalid-regex> = <illegal-regex>;
 
 
 // Compile the given string into an optimized regular expression.
@@ -26,77 +26,77 @@
 // @param dot-matches-all -- Normally '.' matches any character except for
 //   newline.  If this parameter is true '.' matches newline as well.
 //
-// This function signals <invalid-regexp> if the regular expression is invalid.
+// This function signals <invalid-regex> if the regular expression is invalid.
 //
-define sealed generic compile-regexp
+define sealed generic compile-regex
     (string :: <string>,
      #key case-sensitive  :: <boolean> = #t,
           verbose         :: <boolean> = #f,
           multi-line      :: <boolean> = #f,
           dot-matches-all :: <boolean> = #f)
- => (regexp :: <regexp>);
+ => (regex :: <regex>);
 
-define method compile-regexp
+define method compile-regex
     (string :: <string>,
      #key case-sensitive  :: <boolean> = #t,
           verbose         :: <boolean> = #f,
           multi-line      :: <boolean> = #f,
           dot-matches-all :: <boolean> = #f)
- => (regexp :: <regexp>)
+ => (regex :: <regex>)
   parse(string,
         make-parse-info(case-sensitive: case-sensitive,
                         verbose: verbose,
                         multi-line: multi-line,
                         dot-matches-all: dot-matches-all))
-end method compile-regexp;
+end method compile-regex;
 
 
-// Returns a <regexp-match> containing info about a successful match, or #f if
+// Returns a <regex-match> containing info about a successful match, or #f if
 // no match was found.
 //
 // @param big -- The string in which to search.
-// @param pattern -- The pattern to search for.  If not a <regexp>, it will be
-//   compiled first with compile-regexp (implying that <invalid-regexp> may be
+// @param pattern -- The pattern to search for.  If not a <regex>, it will be
+//   compiled first with compile-regex (implying that <invalid-regex> may be
 //   signalled), using the defaults for the keyword arguments.  If you wish
-//   to override them, call compile-regexp directly.
+//   to override them, call compile-regex directly.
 // @param anchored -- Whether or not the search should be anchored at the start
 //   position.  This is useful because "^..." will only match at the beginning
-//   of a string, or after \n if the regexp was compiled with multi-line = #t.
+//   of a string, or after \n if the regex was compiled with multi-line = #t.
 // @param start -- Where to begin the search.
 // @param end -- Where to stop searching.
 //
 // todo -- Should $ anchor at the provided end position or at the end of the string?
 //
-define sealed generic regexp-search
+define sealed generic regex-search
     (big :: <string>, pattern :: <object>,
      #key anchored  :: <boolean> = #f,
           start     :: <integer> = 0,
           end: _end :: <integer> = big.size)
- => (match :: false-or(<regexp-match>));
+ => (match :: false-or(<regex-match>));
 
-define method regexp-search
+define method regex-search
     (big :: <string>, pattern :: <string>,
      #key anchored  :: <boolean> = #f,
           start     :: <integer> = 0,
           end: _end :: <integer> = big.size)
- => (match :: false-or(<regexp-match>))
-  regexp-search(big, compile-regexp(pattern),
+ => (match :: false-or(<regex-match>))
+  regex-search(big, compile-regex(pattern),
                 anchored: anchored, start: start, end: _end)
-end method regexp-search;
+end method regex-search;
 
-define method regexp-search
-    (big :: <string>, pattern :: <regexp>,
+define method regex-search
+    (big :: <string>, pattern :: <regex>,
      #key anchored :: <boolean> = #f,
           start    :: <integer> = 0,
           end: _end :: <integer> = big.size)
- => (match :: false-or(<regexp-match>))
-  // Copied from regexp-position with some mods to match our interface.
-  // Unlike regexp-position there is no caching.  If you don't want to
-  // recompile your regexp each time, compile it explicitly with compile-regexp
+ => (match :: false-or(<regex-match>))
+  // Copied from regex-position with some mods to match our interface.
+  // Unlike regex-position there is no caching.  If you don't want to
+  // recompile your regex each time, compile it explicitly with compile-regex
   // and save it.
   let substring = make(<substring>, string: big, start: start, end: _end);
   let case-sensitive? = #t;
-  let num-groups = pattern.regexp-group-count;
+  let num-groups = pattern.regex-group-count;
   let (matched?, marks)
     = if (pattern.is-anchored?)
         anchored-match-root?(pattern, substring, case-sensitive?, num-groups, #f);
@@ -107,7 +107,7 @@
         match-root?(pattern, substring, case-sensitive?, num-groups, searcher);
       end if;
   if (matched?)
-    let regexp-match = make(<regexp-match>, regular-expression: pattern);
+    let regex-match = make(<regex-match>, regular-expression: pattern);
     let group-number-to-name :: <table> = pattern.group-number-to-name;
     for (index from 0 below marks.size by 2)
       let group-number = floor/(index, 2);
@@ -120,35 +120,27 @@
         let text = copy-sequence(substring.entire-string,
                                  start: substring.start-index + bpos,
                                  end: substring.start-index + epos);
-        add-group(regexp-match,
+        add-group(regex-match,
                   make(<match-group>, text: text, start: bpos, end: epos),
                   group-name);
       else
         // This group wasn't matched.
-        add-group(regexp-match, #f, group-name);
+        add-group(regex-match, #f, group-name);
       end;
     end;
-    regexp-match
+    regex-match
   else
     #f
   end
-end method regexp-search;
-
-// This has methods for group :: <string> and group :: <integer>.
-// Group zero is always the entire match.
-define sealed generic regexp-match-group
-    (match :: <regexp-match>, group :: <object>)
- => (text :: false-or(<string>),
-     start-index :: false-or(<integer>),
-     end-index :: false-or(<integer>));
+end method regex-search;
 
 // Get the groups for the match.  There will always be at least one; the entire match.
 //
-define sealed generic regexp-match-groups
-    (match :: <regexp-match>) => (groups :: <sequence>);
+define sealed generic match-groups
+    (match :: <regex-match>) => (groups :: <sequence>);
 
-define method regexp-match-groups
-    (match :: <regexp-match>) => (groups :: <sequence>)
+define method match-groups
+    (match :: <regex-match>) => (groups :: <sequence>)
   map-as(<simple-object-vector>, identity, match.groups-by-position)
 end;
 
@@ -161,20 +153,20 @@
     required-init-keyword: end:;
 end class <match-group>;
 
-define sealed class <regexp-match> (<object>)
+define sealed class <regex-match> (<object>)
   // Groups by position.  Zero is the entire match.
   constant slot groups-by-position :: <stretchy-vector> = make(<stretchy-vector>);
   // Named groups, if any.  Initial size 0 on the assumption that most regular
   // expressions won't use named groups.
   constant slot groups-by-name :: <string-table> = make(<string-table>, size: 0);
-  constant slot regular-expression :: <regexp>, required-init-keyword: regular-expression:;
-end class <regexp-match>;
+  constant slot regular-expression :: <regex>, required-init-keyword: regular-expression:;
+end class <regex-match>;
 
 define method add-group
-    (match :: <regexp-match>,
+    (match :: <regex-match>,
      group :: false-or(<match-group>),
      name :: false-or(<string>))
- => (match :: <regexp-match>)
+ => (match :: <regex-match>)
   add!(match.groups-by-position, group);
   if (name)
     match.groups-by-name[name] := group;
@@ -182,11 +174,19 @@
   match
 end;
 
-define sealed class <invalid-match-group> (<regexp-error>)
+define sealed class <invalid-match-group> (<regex-error>)
 end class <invalid-match-group>;
 
-define method regexp-match-group
-    (match :: <regexp-match>, group-number :: <integer>)
+// This has methods for group :: <string> and group :: <integer>.
+// Group zero is always the entire match.
+define sealed generic match-group
+    (match :: <regex-match>, group :: <object>)
+ => (text :: false-or(<string>),
+     start-index :: false-or(<integer>),
+     end-index :: false-or(<integer>));
+
+define method match-group
+    (match :: <regex-match>, group-number :: <integer>)
  => (text :: false-or(<string>),
      start-index :: false-or(<integer>),
      end-index :: false-or(<integer>))
@@ -202,17 +202,17 @@
     signal(make(<invalid-match-group>,
                 format-string: "Group number %d is out of bounds for regex %s match.  %s",
                 format-arguments: list(group-number,
-                                       match.regular-expression.regexp-pattern,
+                                       match.regular-expression.regex-pattern,
                                        if (ng == 1)
                                          "There is only 1 group."
                                        else
                                          format-to-string("There are %d groups.", ng)
                                        end)));
   end;
-end method regexp-match-group;
+end method match-group;
 
-define method regexp-match-group
-    (match :: <regexp-match>, group :: <string>)
+define method match-group
+    (match :: <regex-match>, group :: <string>)
  => (text :: false-or(<string>),
      start-index :: false-or(<integer>),
      end-index :: false-or(<integer>))
@@ -224,4 +224,4 @@
                 format-string: "There is no group named %=.",
                 format-arguments: list(group)));
   end
-end method regexp-match-group;
+end method match-group;

Modified: trunk/libraries/regular-expressions/tests/library.dylan
==============================================================================
--- trunk/libraries/regular-expressions/tests/library.dylan	(original)
+++ trunk/libraries/regular-expressions/tests/library.dylan	Sun Dec  2 15:16:47 2007
@@ -6,11 +6,7 @@
     import: {
       streams
     };
-  use regular-expressions,
-    import: {
-      regular-expressions,
-      regexp
-    };
+  use regular-expressions;
   use system,
     import: {
       file-system,
@@ -23,24 +19,12 @@
     regular-expressions-test-suite;
 end library regular-expressions-test-suite;
 
-define module old-api-test-suite
-  use common-dylan,
-    exclude: { 
-      split
-    };
-  use regular-expressions;
-  use testworks;
-  export
-    //pcre-test-suite,
-    old-api-test-suite;
-end module old-api-test-suite;
-
-define module new-api-test-suite
+define module regular-expressions-test-suite
   use common-dylan,
     exclude: {
       split
     };
-  use regexp;
+  use regular-expressions;
   use file-system;
   use locators,
     import: {
@@ -58,14 +42,6 @@
     import: {
       trim
     };
-  export
-    new-api-test-suite;
-end module new-api-test-suite;
-
-define module regular-expressions-test-suite
-  use testworks;
-  use old-api-test-suite;
-  use new-api-test-suite;
   export regular-expressions-test-suite;
 end module regular-expressions-test-suite;
 

Added: trunk/libraries/regular-expressions/tests/pcre.dylan
==============================================================================
--- (empty file)
+++ trunk/libraries/regular-expressions/tests/pcre.dylan	Sun Dec  2 15:16:47 2007
@@ -0,0 +1,238 @@
+module: regular-expressions-test-suite
+
+//// Tests based on PCRE test output files
+
+// todo -- why does this fail to match anything if it has a leading space???
+define constant $group-regex = compile-regex("[0-9]: (.*)|No match");
+define constant $group-index-of-what-pcre-matched = 1;
+
+define function run-pcre-checks
+    (pathname :: <pathname>) => ()
+  with-open-file(stream = pathname, direction: #"input")
+    let line = #f;
+    let line-number = 0;
+    let lines = make(<stretchy-vector>);
+    while (line := read-line(stream, on-end-of-stream: #f))
+      line-number := line-number + 1;
+      line := trim(line, from: #"left");
+      if (empty?(line))
+        // A section break is always preceded by a group result or "No match".
+        // Some multi-line regular expression patterns have empty lines and we
+        // don't want to think that's the end of the section.
+        if (lines.size > 0
+              & regex-search(lines[lines.size - 1], $group-regex))
+          check-pcre-section(make(<section>,
+                                  lines: lines,
+                                  start-line-number: line-number - lines.size));
+          lines := make(<stretchy-vector>);
+        elseif (lines.size < 3)
+          // A section must have a regex, a test string and one group result to make
+          // any sense.  Note we don't add the line to the section here.
+          test-output("Line %d: empty line in first 3 lines of a section.\n",
+                      line-number);
+        else
+          add!(lines, line);
+        end;
+      else
+        add!(lines, line);
+      end;
+    end while;
+  end;
+end function run-pcre-checks;
+
+define class <section> (<object>)
+  constant slot section-lines :: <sequence>, required-init-keyword: #"lines";
+  constant slot start-line-number :: <integer>, required-init-keyword: #"start-line-number";
+  slot %index :: <integer> = 0;
+end class <section>;
+
+define method consume-line
+    (section :: <section>) => (line :: false-or(<string>))
+  let index = section.%index;
+  let lines = section.section-lines;
+  if (index < lines.size)
+    section.%index := index + 1;
+    lines[index]
+  end
+end method consume-line;
+
+/* Was going to be used for better check names.
+define method line-number
+    (section :: <section>) => (line-number :: <integer>)
+  section.start-line-number + section.%index
+end method line-number;
+*/
+
+define method peek-line
+    (section :: <section>) => (line :: false-or(<string>))
+  if (section.%index < section.section-lines.size)
+    section.section-lines[section.%index]
+  end
+end method peek-line;
+
+/* Example section:
+  /([\da-f:]+)$/i                               // pattern and flags
+      0abc                                      // test string
+   0: 0abc                                      // group 0
+   1: 0abc                                      // group 1
+      0zzz                                      // test string
+  No match                                      // no match
+*/
+define function check-pcre-section
+    (section :: <section>)
+  let regex = parse-pcre-regex(section);
+  // If the section has fewer than 3 lines (a regex, a test string and at least
+  // one group result) then all we do is try to compile it (above).
+  if (section.section-lines.size >= 3)
+    while (peek-line(section))
+      let test-string = consume-line(section);
+      //test-output("  test string: %s\n", test-string);
+      let group-strings = make(<stretchy-vector>);
+      block (done-with-this-test-string)
+        while (#t)
+          let line = peek-line(section);
+          let match = line & regex-search(line, $group-regex);
+          if (match)
+            consume-line(section);
+            let group-text = match-group(match, $group-index-of-what-pcre-matched);
+            //test-output("   pcre group: %s\n", group-text | "No match");
+            if (group-text)
+              add!(group-strings, group-text);
+            else
+              assert(match-group(match, 0) = "No match",
+                     "previous line was 'No match'");
+              done-with-this-test-string();
+            end;
+          else
+            done-with-this-test-string();
+          end;
+        end;
+      end;
+      if (regex)
+        check-no-errors(format-to-string("search for %s in %s",
+                                         test-string, regex.regex-pattern),
+                        regex-search(test-string, regex));
+        let match = block ()
+                      regex-search(test-string, regex)
+                    exception (ex :: <error>)
+                      #f
+                    end;
+        if (match)
+          compare-to-pcre-results(regex.regex-pattern, test-string, match, group-strings);
+        end;
+      end if;
+    end while;
+  end if;
+end function check-pcre-section;
+
+define function parse-pcre-regex
+    (section :: <section>)
+ => (regex :: false-or(<regex>))
+  local method find-last (string, char)
+          // position(string, char, from-end: #t)
+          block (break)
+            for (i from string.size - 1 to 0 by -1)
+              if (string[i] == char)
+                break(i);
+              end;
+            end;
+          end;
+        end method find-last;
+  // This is imprecise.  If the pattern spans multiple lines this will fail
+  // if any but the first and last lines contain the regex delimiter character.
+  // Might be nice to add a function to the regular-expressions module to read
+  // a perl regexp from a stream.
+  local method read-pattern-and-flags ()
+          let pnf = consume-line(section);
+          let delim = pnf[0];
+          if (find-last(pnf, delim) == 0)
+            // it's a multi-line regex
+            while(peek-line(section) & ~find-last(peek-line(section), delim))
+              pnf := concatenate(pnf, "\n", consume-line(section));
+            end;
+            if (peek-line(section))
+              pnf := concatenate(pnf, "\n", consume-line(section));
+            end;
+          end;
+          let end-delim = find-last(pnf, delim);
+          let flags = copy-sequence(pnf, start: end-delim + 1);
+          let pattern = copy-sequence(pnf, start: 1, end: end-delim);
+          values(pattern, flags)
+        end method read-pattern-and-flags;
+  let (pattern, flags) = read-pattern-and-flags();
+  //test-output("pattern: %s (flags = %s)\n", pattern, flags);
+  for (flag in flags)
+    check-true(format-to-string("For regex %s, flag %s is recognized",
+                                pattern, flag),
+               member?(flag, "ixms"));
+  end for;
+  block ()
+    compile-regex(pattern,
+                  case-sensitive: ~ member?('i', flags),
+                  verbose: member?('x', flags),
+                  multi-line: member?('m', flags),
+                  dot-matches-all: member?('s', flags))
+  // Unfortunately we can't catch <regex-error> here because the charset
+  // parser is in string-extensions and signals <invalid-character-set-description>
+  // which isn't related to <regex-error> (and isn't even exported).
+  exception (ex :: <error>)
+    check-true(format-to-string("can compile regex %s", pattern), #f);
+    //test-output("  ERROR: %s\n", ex);
+    #f
+  end block
+end function parse-pcre-regex;
+  
+/*
+ * pcre-groups is a sequence of strings where the nth element represents
+ * the nth group in the pcre regex match.  If pcre-groups is empty then
+ * there was no match.
+ */
+define function compare-to-pcre-results
+    (pattern :: <string>,
+     test-string :: <string>,
+     match :: false-or(<regex-match>),
+     pcre-groups :: <sequence>)
+ => ()
+  if (match)
+    check-equal(format-to-string("Match %s against %s -- same # of groups",
+                                 test-string, pattern),
+                size(match-groups(match)),
+                pcre-groups.size);
+    for (group-number from 0,
+         pcre-group in pcre-groups)
+      // Adding block/exception here causes an infinite loop.
+      // Could it be related to using the Visual Studio 8 linker?
+      // The if also causes an infinite loop.  Hmmm.
+      let our-group = /* if (group-number < size(match-groups(match))) */
+                        match-group(match, group-number)
+                      /* end */;
+      check-equal(format-to-string("Match %s against %s -- group %d is the same",
+                                   test-string, pattern, group-number),
+                  our-group,
+                  pcre-group);
+    end;
+  else
+    check-equal(format-to-string("Pattern %s doesn't match test string %s",
+                                 pattern, test-string),
+                0,
+                pcre-groups.size);
+  end if;
+end function compare-to-pcre-results;
+
+define function make-pcre-locator
+    (filename :: <string>) => (locator :: <file-locator>)
+  let source-directory = environment-variable("OPEN_DYLAN_USER_SOURCES");
+  if (source-directory)
+    let dir = subdirectory-locator(as(<directory-locator>, source-directory),
+                                   "libraries",
+                                   "regular-expressions",
+                                   "tests");
+    make(<file-locator>, directory: dir, name: filename)
+  else
+    signal(make(<simple-error>,
+                format-string: "pcre-test requires the OPEN_DYLAN_USER_SOURCES environment "
+                               "variable to be set to the root of your Dylan sources."));
+  end
+end function make-pcre-locator;
+
+

Modified: trunk/libraries/regular-expressions/tests/regular-expressions-test-suite.dylan
==============================================================================
--- trunk/libraries/regular-expressions/tests/regular-expressions-test-suite.dylan	(original)
+++ trunk/libraries/regular-expressions/tests/regular-expressions-test-suite.dylan	Sun Dec  2 15:16:47 2007
@@ -1,8 +1,84 @@
 Module: regular-expressions-test-suite
 Author: Carl Gay
 
+define function re/position (string, pattern, #rest args)
+  let (#rest marks) = apply(regex-position, string, pattern, args);
+  marks
+end function re/position;
+
+define test atom-test ()
+  // In current code the empty string is an illegal regex, but Python
+  // (and probably perl?) allow it, so I think we should consider that
+  // a bug.  --cgay
+  check-no-errors("atom-0", re/position("", ""));
+  check-equal("atom-1", re/position("a", "a"),      #[0, 1]);
+  check-equal("atom-2", re/position("a", "[a]"),    #[0, 1]);
+  check-equal("atom-3", re/position("ab", "(a)b"),  #[0, 2, 0, 1]);
+  check-equal("atom-4", re/position("a", "\\w"),    #[0, 1]);
+  check-equal("atom-5", re/position("a", "."),      #[0, 1]);
+  check-equal("atom-6", re/position("a", "a{0}"),   #[0, 0]);
+  check-equal("atom-7", re/position("aa", "a{2}"),  #[0, 2]);
+  check-equal("atom-8", re/position("aa", "a{1,}"), #[0, 2]);
+  check-equal("atom-9", re/position("aaa", "a{1,8}"), #[0, 3]);
+  check-equal("atom-A", re/position("", "a{,}"),   #[0, 0]);
+  check-equal("atom-A1", re/position("aaaaaa", "a{,}"),   #[0, 6]);
+  check-condition("atom-B", <invalid-regex>, re/position("", "a{m,n}"));
+  check-condition("atom-C", <invalid-regex>, re/position("", "a{m,}"));
+  check-condition("atom-D", <invalid-regex>, re/position("", "a{,n}"));
+  check-condition("atom-E", <invalid-regex>, re/position("", "a{m}"));
+  check-condition("atom-F", <invalid-regex>, re/position("", "a{,"));
+  check-condition("atom-G", <invalid-regex>, re/position("", "[a"));
+  check-condition("atom-H", <invalid-regex>, re/position("", "\\"));
+  //check-equal("atom-tan", "\<44>\<79>\<6c>\<61>\<6e>", "Dylan");
+end;
+
+// These should all compile.
+//
+define test good-regex-test ()
+  let patterns = #(
+    "",
+    "a()b",
+    "a(?#blah)b"
+    );
+  for (pattern in patterns)
+    check-no-errors(format-to-string("Regex '%s' compiles", pattern),
+                    compile-regex(pattern));
+  end;
+end test good-regex-test;
+
+// All these regexes should signal <invalid-regex> on compilation.
+//
+define test bad-regex-test ()
+  let patterns = #(
+    "(?P<name>x)(?P<name>y)",         // can't use same name twice
+    "(?@abc)"                         // invalid extended character '@'
+    );
+  for (pattern in patterns)
+    check-condition(format-to-string("Compiling '%s' gets an error", pattern),
+                    <invalid-regex>,
+                    compile-regex(pattern));
+  end;
+end test bad-regex-test;
+
+define test pcre-testoutput1 ()
+  run-pcre-checks(make-pcre-locator("pcre-testoutput1.txt"));
+end;
+
+define suite pcre-test-suite ()
+  test pcre-testoutput1;
+end;
+
+define test regressions-test ()
+  run-pcre-checks(make-pcre-locator("regression-tests.txt"));
+end;
 
 define suite regular-expressions-test-suite ()
-  suite old-api-test-suite;
-  suite new-api-test-suite;
+  test atom-test;
+  test good-regex-test;
+  test bad-regex-test;
+  test regressions-test;
+  // For now leaving out the PCRE tests because they're too verbose.
+  // Once more basic stuff is working will start including them again.
+  // suite pcre-test-suite;
 end;
+

Modified: trunk/libraries/regular-expressions/tests/regular-expressions-test-suite.lid
==============================================================================
--- trunk/libraries/regular-expressions/tests/regular-expressions-test-suite.lid	(original)
+++ trunk/libraries/regular-expressions/tests/regular-expressions-test-suite.lid	Sun Dec  2 15:16:47 2007
@@ -1,5 +1,4 @@
 library: regular-expressions-test-suite
 files: library
+       pcre
        regular-expressions-test-suite
-       old-api-test-suite
-       new-api-test-suite



More information about the chatter mailing list