[Gd-chatter] r11414 - in trunk/libraries/regular-expressions: . tests

cgay at gwydiondylan.org cgay at gwydiondylan.org
Mon Jun 18 03:27:22 CEST 2007


Author: cgay
Date: Mon Jun 18 03:27:19 2007
New Revision: 11414

Added:
   trunk/libraries/regular-expressions/regex.dylan   (contents, props changed)
Modified:
   trunk/libraries/regular-expressions/gd-library.dylan
   trunk/libraries/regular-expressions/gd-regular-expressions.lid
   trunk/libraries/regular-expressions/interface.dylan
   trunk/libraries/regular-expressions/match.dylan
   trunk/libraries/regular-expressions/od-library.dylan
   trunk/libraries/regular-expressions/od-regular-expressions.lid
   trunk/libraries/regular-expressions/parse.dylan
   trunk/libraries/regular-expressions/tests/regular-expressions-test-suite.dylan
Log:
job: 7357
* Added a new API module, "regex".
* Improve error messages a bit.
* Parsing "\\" (unterminated escape) and "[..." now signal <invalid-regex>
* Parsing "" now signals <invalid-regex>.  Python allows this, so I left
  a failing test for it in the test suite.
* Improved min/max quantifier parsing a bit.  {m,n}, {m,}, {,n}, {m}, {},
  and {,} are all valid now.

Modified: trunk/libraries/regular-expressions/gd-library.dylan
==============================================================================
--- trunk/libraries/regular-expressions/gd-library.dylan	(original)
+++ trunk/libraries/regular-expressions/gd-library.dylan	Mon Jun 18 03:27:19 2007
@@ -32,32 +32,57 @@
 //======================================================================
 
 
+// Added regex module with new API.  --cgay, June 2007
+
 define library regular-expressions
-  use dylan;
-  use collection-extensions;
+  use common-dylan;
   use string-extensions;
   use table-extensions;
   export
-    regular-expressions;
+    regex,                                             // new API
+    regular-expressions;                               // old API
 end library regular-expressions;
 
-define module regular-expressions
-  use dylan;
-  use extensions;
-  use string-conversions;
-  use character-type;
-  use string-hacking;
-  use subseq;
-  use %do-replacement;
-  use %parse-string;
-  use substring-search;
-  use table-extensions, import: { string-hash };
-  export
+define module regex                  // new API module
+  create
+    compile-regex,
+    regex-search,
+    <regex>,
+    <invalid-regex>,
+      invalid-regex-pattern,
+    <regex-match>,                   // results of a successful search
+      regex-match-group,
+      regex-match-group-count,
+      group-start,
+      group-end,
+      group-text,
+      <invalid-match-group>;
+end module regex;
+
+define module regular-expressions    // old API module
+  create
     regexp-position, make-regexp-positioner,
+    regexp-match,
     regexp-replace, make-regexp-replacer,
-    regexp-match, regexp-matches,
     translate, make-translator,
     split, make-splitter,
     join,
-    <illegal-regexp>;
+    <illegal-regexp>,
+      regexp-pattern;
+
+  create
+    split-string;
 end module regular-expressions;
+
+define module regular-expressions-impl
+  use common-dylan,
+    exclude: { split };
+  use string-conversions;
+  use character-type;
+  use string-hacking;
+  use %do-replacement;
+  use %parse-string;
+  use substring-search;
+  use regular-expressions;                             // API module
+  use regex;
+end module regular-expressions-impl;

Modified: trunk/libraries/regular-expressions/gd-regular-expressions.lid
==============================================================================
--- trunk/libraries/regular-expressions/gd-regular-expressions.lid	(original)
+++ trunk/libraries/regular-expressions/gd-regular-expressions.lid	Mon Jun 18 03:27:19 2007
@@ -4,3 +4,4 @@
        match
        parse
        interface
+       regex

Modified: trunk/libraries/regular-expressions/interface.dylan
==============================================================================
--- trunk/libraries/regular-expressions/interface.dylan	(original)
+++ trunk/libraries/regular-expressions/interface.dylan	Mon Jun 18 03:27:19 2007
@@ -1,4 +1,4 @@
-module:   regular-expressions
+module:   regular-expressions-impl
 author:   Nick Kramer (nkramer at cs.cmu.edu)
 synopsis: This provides a useable interface for users. Functions 
 	  defined outside this file are really too strange and quirky 
@@ -58,6 +58,11 @@
 // parse creating a <character-set>, which must be either case
 // sensitive or case insensitive)
 //
+
+// This caching scheme fails if we later introduce the ability to change
+// attributes such as case-sensitivity mid-parse, the way (I believe) perl
+// does?  --cgay
+
 // ### Currently, only regexp-position uses this cache, because the
 // other functions are still using make-regexp-positioner.  With
 // caching, that make-regexp-whatever stuff should probably go.
@@ -101,23 +106,23 @@
  => (equal? :: <function>, hash :: <function>);
   values(method (key1 :: <cache-key>, key2 :: <cache-key>) // equal?
 	  => res :: <boolean>;
-	   key1.regexp-string = key2.regexp-string
+	   key1.regexp-string == key2.regexp-string
 	     & key1.character-set-type == key2.character-set-type;
 	 end method,
-	 method (key :: <cache-key>, initial-state)
-	  => (id :: <integer>, state); // hash()
-	   let (string-id, string-state)
-	     = string-hash(key.regexp-string, initial-state);
+	 method (key :: <cache-key>, initial-state) => (id :: <integer>, state); // hash()
+	   let (string-id, string-state) = object-hash(key.regexp-string, initial-state);
 	   let (set-type-id, set-type-state) 
 	     = object-hash(key.character-set-type, string-state);
-	   let id = merge-hash-ids(string-id, set-type-id, ordered: #t);
-	   values(id, set-type-state);
+	   values(merge-hash-ids(string-id, set-type-id, ordered: #t), set-type-state);
 	 end method);
 end method table-protocol;
 
 // *regexp-cache* -- internal
 //
 // The only instance of <regexp-cache>.  ### Not threadsafe.
+// 
+// Technically not thread safe, but does it matter?  Worst case seems to
+// be a duplicated regexp parse.  --cgay
 //
 define constant *regexp-cache* = make(<regexp-cache>);
 
@@ -127,16 +132,17 @@
 // parses it and adds it to the cache.
 //
 define inline function parse-or-use-cached 
-    (regexp :: <string>, character-set-type :: <class>) 
+    (regexp :: <string>, parse-info :: <parse-info>)
  => (parsed-regexp :: <parsed-regexp>, last-group :: <integer>);
   let key = make(<cache-key>, regexp-string: regexp, 
-		 character-set-type: character-set-type); 
+		 character-set-type: parse-info.set-type); 
   let cached-value = element(*regexp-cache*, key, default: #f);
   if (cached-value)
     values(cached-value.parse-tree, cached-value.last-group);
   else
-    let (parsed-regexp, last-group) = parse(regexp, character-set-type);
-    *regexp-cache*[key] := make(<cache-element>, parse-tree: parsed-regexp,
+    let (parsed-regexp, last-group) = parse(regexp, parse-info);
+    *regexp-cache*[key] := make(<cache-element>,
+                                parse-tree: parsed-regexp,
 				last-group: last-group);
     values(parsed-regexp, last-group);
   end if;
@@ -155,13 +161,8 @@
  => (regexp-start :: false-or(<integer>), #rest marks :: false-or(<integer>));
   let substring = make(<substring>, string: big, start: big-start,
 		       end: big-end | big.size);
-  let char-set-class = if (case-sensitive) 
-			 <case-sensitive-character-set>;
-		       else
-			 <case-insensitive-character-set>;
-		       end if;
   let (parsed-regexp, last-group) 
-    = parse-or-use-cached(regexp, char-set-class);
+    = parse-or-use-cached(regexp, make-parse-info(case-sensitive: case-sensitive));
 
   let (matched, marks)
     = if (parsed-regexp.is-anchored?)
@@ -220,6 +221,8 @@
   apply(values, result)
 end;
 
+// #if (have-free-time)
+/*
 // regexp-matches -- exported
 //
 // A more convenient form of regexp-position.  Usually you want
@@ -232,28 +235,47 @@
     (big :: <string>, regexp :: <string>,
      #key start: start-index :: <integer> = 0,
           end: end-index :: false-or(<integer>),
-          case-sensitive :: <boolean> = #f)
-
-  let (regexp-start, lemon, #rest marks)
+          case-sensitive :: <boolean> = #f,
+          groups :: false-or(<sequence>))
+ => (#rest group-strings :: false-or(<string>));
+  if (~groups)
+    error("Mandatory keyword groups: not used in call to regexp-matches");
+  end if;
+  let (#rest marks)
     = regexp-position(big, regexp, start: start-index, end: end-index, 
 		      case-sensitive: case-sensitive);
-
-  let return-size = floor/(marks.size, 2);
-  let return = make(<vector>, size: return-size, fill: #f);
-  if (regexp-start)
-    // all groups associate by index in the result
-
-    for (index from 0 below return-size)
-      let pos = index * 2;
-      if (element(marks, pos, default: #f))
-        // "14 0", "4 5", "7 8", "9 10" for "this is a test"
-        //return[index] := concatenate(integer-to-string(marks[pos]), concatenate(" ", integer-to-string(marks[pos + 1])));
-        return[index] := copy-sequence(big, start: marks[pos], end: marks[pos + 1]);
-      end if;
+  let return-val = make(<vector>, size: groups.size, fill: #f);
+  for (index from 0 below return-val.size)
+    let group-start = groups[index] * 2;
+    let group-end = group-start + 1;
+    if (element(marks, group-start, default: #f))
+      return-val[index] := copy-sequence(big, start: 
+
+  let sz = floor/(marks.size, 2);
+  let return = make(<vector>, size: sz, fill: #f);
+  for (index from 0 below sz)
+    let pos = index * 2;
+    if (element(marks, pos, default: #f))
+      return[index] := copy-sequence(big, start: marks[pos],
+				     end: marks[pos + 1]);
+    end if;
+  end for;
+  if (matches)
+    let return = make(<vector>, size: matches.size * 2);
+    for (raw-pos in matches, index from 0)
+      let src-pos = raw-pos * 2;
+      let dest-pos = index * 2;
+      return[dest-pos] := element(marks, src-pos, default: #f);
+      return[dest-pos + 1] := element(marks, src-pos + 1, default: #f);
     end for;
+    apply(values, return);
+  else
+    
+    apply(values, marks);
   end if;
-  apply(values, return);
-end function;
+
+// #endif
+*/
 
 
 // Functions based on regexp-position
@@ -446,7 +468,7 @@
   end method;
 end function make-splitter;
 
-// Used by split.  Not exported.
+// Used by split.  Not exported.  (Yes it is.  --cgay)
 //
 define function split-string
     (positioner :: <function>, input :: <string>, start :: <integer>, 

Modified: trunk/libraries/regular-expressions/match.dylan
==============================================================================
--- trunk/libraries/regular-expressions/match.dylan	(original)
+++ trunk/libraries/regular-expressions/match.dylan	Mon Jun 18 03:27:19 2007
@@ -1,4 +1,4 @@
-module:   regular-expressions
+module:   regular-expressions-impl
 author:   Nick Kramer (nkramer at cs.cmu.edu)
 synopsis: This takes a parsed regular expression and tries to find a match
           for it.

Modified: trunk/libraries/regular-expressions/od-library.dylan
==============================================================================
--- trunk/libraries/regular-expressions/od-library.dylan	(original)
+++ trunk/libraries/regular-expressions/od-library.dylan	Mon Jun 18 03:27:19 2007
@@ -32,33 +32,56 @@
 //======================================================================
 
 
+// Added regex module with new API.  --cgay, June 2007
+
 define library regular-expressions
-  use functional-dylan;
+  use common-dylan;
   use string-extensions;
   export
-    regular-expressions;
+    regex,                                             // new API
+    regular-expressions;                               // old API
 end library regular-expressions;
 
-define module regular-expressions
-  use functional-dylan, exclude: { split };
-  use dylan-extensions, import: { string-hash };
-  // use extensions;
-  use string-conversions;
-  use character-type;
-  use string-hacking;
-  // use subseq;
-  use %do-replacement;
-  use %parse-string;
-  use substring-search;
-  export
+define module regex                  // new API module
+  create
+    compile-regex,
+    regex-search,
+    <regex>,
+    <invalid-regex>,
+      invalid-regex-pattern,
+    <regex-match>,                   // results of a successful search
+      regex-match-group,
+      regex-match-group-count,
+      group-start,
+      group-end,
+      group-text,
+      <invalid-match-group>;
+end module regex;
+
+define module regular-expressions    // old API module
+  create
     regexp-position, make-regexp-positioner,
+    regexp-match,
     regexp-replace, make-regexp-replacer,
-    regexp-match, regexp-matches,
     translate, make-translator,
     split, make-splitter,
     join,
-    <illegal-regexp>;
+    <illegal-regexp>,
+      regexp-pattern;
 
-  export
+  create
     split-string;
 end module regular-expressions;
+
+define module regular-expressions-impl
+  use common-dylan,
+    exclude: { split };
+  use string-conversions;
+  use character-type;
+  use string-hacking;
+  use %do-replacement;
+  use %parse-string;
+  use substring-search;
+  use regular-expressions;                             // API module
+  use regex;
+end module regular-expressions-impl;

Modified: trunk/libraries/regular-expressions/od-regular-expressions.lid
==============================================================================
--- trunk/libraries/regular-expressions/od-regular-expressions.lid	(original)
+++ trunk/libraries/regular-expressions/od-regular-expressions.lid	Mon Jun 18 03:27:19 2007
@@ -3,4 +3,4 @@
  match.dylan
  parse.dylan
  interface.dylan
-
+ regex.dylan

Modified: trunk/libraries/regular-expressions/parse.dylan
==============================================================================
--- trunk/libraries/regular-expressions/parse.dylan	(original)
+++ trunk/libraries/regular-expressions/parse.dylan	Mon Jun 18 03:27:19 2007
@@ -1,4 +1,4 @@
-module: regular-expressions
+module: regular-expressions-impl
 author: Nick Kramer (nkramer at cs.cmu.edu)
 copyright: see below
 
@@ -98,11 +98,34 @@
   constant slot group-number :: <integer>, required-init-keyword: #"group"; 
 end class <parsed-backreference>;
 
+// Note: I'm pretty sure <simple-error> won't work in GD.  --cgay
+define class <regex-error> (<simple-error>)
+end class <regex-error>;
+
+define class <illegal-regexp> (<regex-error>)
+  constant slot regexp-pattern :: <string>, 
+    required-init-keyword: #"pattern";
+end class <illegal-regexp>;
+
+define sealed domain make (singleton(<illegal-regexp>));
+define sealed domain initialize (<illegal-regexp>);
+
+define function parse-error
+    (pattern :: <string>, format-string :: <string>, #rest format-args)
+  let msg = apply(format-to-string, format-string, format-args);
+  signal(make(<illegal-regexp>,
+              format-string: "Invalid regular expression: %=.  %s",
+              format-arguments: list(pattern, msg),
+              pattern: pattern));
+end function parse-error;
+
 // <parse-info> contains some information about the current regexp
 // being parsed.  Using a structure is slightly nicer than having
 // global variables..
 //
 define class <parse-info> (<object>)
+  // Name this has-backreferences, for consistency with the other slots?
+  // Add ? to all the has-* slots.  --cgay
   slot backreference-used :: <boolean>, init-value: #f;
      // Whether or not the function includes \1, \2, etc in the regexp.
      // This is different from return-marks, which determines whether the
@@ -113,36 +136,42 @@
   constant slot set-type :: <class>, required-init-keyword: #"set-type";
 end class <parse-info>;
 
-define class <illegal-regexp> (<error>)
-  constant slot regular-expression :: <string>, 
-    required-init-keyword: #"regexp";
-end class <illegal-regexp>;
-
-define sealed domain make (singleton(<illegal-regexp>));
-define sealed domain initialize (<illegal-regexp>);
-
-// cgay todo
-/* KJP: Doesn't work this way in Functional Developer.
-define sealed method report-condition (cond :: <illegal-regexp>, stream) => ();
-  condition-format(stream, "Illegal regular expression: \n"
-		     "A sub-regexp that matches the empty string has"
-		     " been quantified in\n   %s",
-		   cond.regular-expression);
-end method report-condition;
-*/
-//ignorable(regular-expression);
-
-define method parse (regexp :: <string>, character-set-type :: <class>)
- => (parsed-regexp :: <parsed-regexp>, last-group :: <integer>,
-     backrefs? :: <boolean>, alternatives? :: <boolean>, 
-     quantifiers? :: <boolean>);
-  let parse-info = make(<parse-info>, set-type: character-set-type);
+define function make-parse-info
+    (#key case-sensitive  :: <boolean> = #t,
+          verbose         :: <boolean> = #f,
+          multi-line      :: <boolean> = #f,
+          dot-matches-all :: <boolean> = #f)
+ => (info :: <parse-info>)
+  local method nyi (option-name)
+          signal(make(<regex-error>,
+                      format-string: "The '%s' option is not yet implemented.",
+                      format-arguments: list(option-name)));
+        end;
+  verbose & nyi("verbose");
+  multi-line & nyi("multi-line");
+  dot-matches-all & nyi("dot-matches-all");
+  let char-set-type
+   = if (case-sensitive)
+       <case-sensitive-character-set>
+     else
+       <case-insensitive-character-set>
+     end;
+  make(<parse-info>, set-type: char-set-type)
+end function make-parse-info;
+
+define method parse
+    (regexp :: <string>, parse-info :: <parse-info>)
+ => (parsed-regexp :: <parsed-regexp>,
+     last-group :: <integer>,
+     backrefs? :: <boolean>,
+     alternatives? :: <boolean>, 
+     quantifiers? :: <boolean>)
   let parse-string = make(<parse-string>, string: regexp);
   let parse-tree = make(<mark>, group: 0, 
 			child: parse-regexp(parse-string, parse-info));
   let optimized-regexp = optimize(parse-tree);
   if (optimized-regexp.pathological?)
-    signal(make(<illegal-regexp>, regexp: regexp));
+    parse-error(regexp, "A sub-regexp that matches the empty string was quantified.");
   else
     values(optimized-regexp,
 	   parse-info.current-group-number,
@@ -155,7 +184,9 @@
 define method parse-regexp (s :: <parse-string>, info :: <parse-info>)
  => parsed-regexp :: <parsed-regexp>;
   let alternative = parse-alternative(s, info);
-  if (lookahead(s) = '|')
+  if (~alternative)
+    parse-error(s.parse-string, "");
+  elseif (lookahead(s) = '|')
     info.has-alternatives := #t;
     make(<union>, left: alternative, right: parse-regexp(consume(s), info));
   else
@@ -163,8 +194,9 @@
   end if;
 end method parse-regexp;
 
-define method parse-alternative (s :: <parse-string>, info :: <parse-info>)
- => parsed-regexp :: <parsed-regexp>;
+define method parse-alternative
+    (s :: <parse-string>, info :: <parse-info>)
+ => (re :: false-or(<parsed-regexp>))
   let term = parse-quantified-atom(s, info);
   if (member?(lookahead(s), #(#f, '|', ')')))
     term;
@@ -174,7 +206,7 @@
 end method parse-alternative;
 
 define method parse-quantified-atom (s :: <parse-string>, info :: <parse-info>)
- => parsed-regexp :: false-or(<parsed-regexp>);
+ => (result :: false-or(<parsed-regexp>))
   let atom = parse-atom(s, info);
   let char = lookahead(s);
   select (char by \=)
@@ -196,53 +228,52 @@
     '{' =>
       info.has-quantifiers := #t;
       consume(s);
-      let first-string = make(<deque>);
-      let second-string = make(<deque>);
-      let has-comma = #f;
-      for (c = lookahead(s) then lookahead(s), until: c = '}')
-	consume(s);
-	if (c = ',')  
-	  has-comma := #t;
-	elseif (has-comma)  
-	  push-last(second-string, c);
-	else 
-	  push-last(first-string, c);
-	end if;
-      end for;
-      consume(s);         // Eat closing brace
-      let first-num = string-to-integer(as(<byte-string>, first-string));
-      make(<quantified-atom>, atom: atom, 
-	   min: first-num,
-	   max:  if (~has-comma)    
-		   first-num
-		 elseif (empty?(second-string))   
-		   #f
-		 else
-		   string-to-integer(as(<byte-string>, second-string))
-		 end if);
+      parse-minmax-quantifier(atom, s);
 
     otherwise =>
       atom;
   end select;
 end method parse-quantified-atom;
 
+// {m,n}, {m,}, {,n}, {m}, {}, and {,} are all valid.
+// m defaults to 0 and n defaults to #f (unlimited).
+define method parse-minmax-quantifier
+    (atom :: <parsed-regexp>, s :: <parse-string>) => (qatom :: <quantified-atom>)
+  local method parse-integer () => (int :: false-or(<integer>))
+          let digits = make(<deque>);
+          while (lookahead(s) & digit?(lookahead(s)))
+            push-last(digits, lookahead(s));
+            consume(s);
+          end;
+          ~empty?(digits) & string-to-integer(as(<byte-string>, digits));
+        end method parse-integer;
+  let qmin = parse-integer() | 0;
+  let qmax = #f;
+  if (lookahead(s) = ',')
+    consume(s);
+    qmax := parse-integer();
+  else
+    qmax := qmin;
+  end;
+  if (lookahead(s) ~= '}')
+    parse-error(s.parse-string,
+                "Close brace expected in {m,n} quantifier (index = %s).",
+                s.parse-index);
+  end;
+  consume(s);
+  make(<quantified-atom>, atom: atom, min: qmin, max: qmax);
+end method parse-minmax-quantifier;
+
 define method parse-atom (s :: <parse-string>, info :: <parse-info>)
- => parsed-regexp :: false-or(<parsed-regexp>);
+ => (regexp :: false-or(<parsed-regexp>))
   let char = lookahead(s);
   select (char)
     '(' =>
       consume(s);   // Consume beginning paren
-      info.current-group-number := info.current-group-number + 1;
-      let this-group = info.current-group-number;
-      let regexp = parse-regexp(s, info);
-      if (lookahead(s) ~= ')')
-	error("Unbalanced parens in regexp");
-      end if;
-      consume(s);   // Consume end paren
-      make(<mark>, child: regexp, group: this-group);
+      parse-group(s, info);
 
     ')' =>
-      #f;             // Need something to terminate upon seeing a close paren
+      #f;              // Need something to terminate upon seeing a close paren
 
     #f  =>
       #f;   // Signal error?  (end of stream)
@@ -252,29 +283,13 @@
 
     '\\' =>
       consume(s);        // Consume the backslash
+      // Perhaps add support for a different escape character to aid readability.
+      // The escape character could be specified in the <parse-info>.  --cgay
       parse-escaped-character(s, info);
 
     '[' =>
       consume(s);        // Eat the opening brace
-      let set-string = make(<deque>);      // Need something that'll 
-                                           // preserve the right ordering
-      for (char = lookahead(s) then lookahead(s), until: char == ']')
-	consume(s);                    // eat char
-	if (char ~== '\\')
-	  push-last(set-string, char);
-	else
-	  let char2 = lookahead(s);
-	  consume(s);  // Eat escaped char
-	  if (char2 == ']')
-	    push-last(set-string, ']');
-	  else
-	    push-last(set-string, '\\');
-	    push-last(set-string, char2);
-	  end if;
-	end if;
-      end for;
-      consume(s);     // Eat ending brace
-      make(<parsed-set>, set: make(info.set-type, description: set-string));
+      parse-character-set(s, info);
 
     '.' =>
       consume(s);
@@ -297,10 +312,57 @@
   end select;
 end method parse-atom;
 
+define inline function parse-group
+    (s :: <parse-string>, info :: <parse-info>)
+ => (mark :: <mark>)
+  info.current-group-number := info.current-group-number + 1;
+  let this-group = info.current-group-number;
+  let regexp = parse-regexp(s, info);
+  if (lookahead(s) = ')')
+    consume(s);   // Consume ')'
+    make(<mark>, child: regexp, group: this-group)
+  else
+    parse-error(s.parse-string, "Unbalanced parens in regexp (index = %s).",
+                s.parse-index);
+  end;
+end function parse-group;
+
+// This just does a quick scan to find the closing ] and then lets
+// make(<character-set>) do the real parsing.
+//
+define inline function parse-character-set
+    (s :: <parse-string>, info :: <parse-info>)
+ => (set :: <parsed-set>)
+  let set-string = make(<deque>);      // Need something that'll 
+                                       // preserve the right ordering
+  let start-index = s.parse-index;
+  for (char = lookahead(s) then lookahead(s),
+       until: char == ']')
+    consume(s);                    // eat char
+    if (~char)
+      parse-error(s.parse-string,
+                  "Unterminated character set at index %d.",start-index);
+    elseif (char ~== '\\')
+      push-last(set-string, char);
+    else
+      let char2 = lookahead(s);
+      consume(s);  // Eat escaped char
+      if (char2 == ']')
+        push-last(set-string, ']');
+      else
+        push-last(set-string, '\\');
+        push-last(set-string, char2);
+      end if;
+    end if;
+  end for;
+  consume(s);     // Eat ending brace
+  make(<parsed-set>, set: make(info.set-type, description: set-string));
+end function parse-character-set;
+
 define constant any-char 
   = make(<case-sensitive-character-set>, description: "^\n");
 
-// The useful definitions of all these is in as(<character-set>)
+// The useful definitions of all these are in as(<character-set>).
 //
 define constant digit-chars
   = make(<case-sensitive-character-set>, description: "\\d");
@@ -328,12 +390,20 @@
     (s :: <parse-string>, info :: <parse-info>)
  => parsed-regexp :: <parsed-regexp>;
   let next-char = lookahead(s);
+  if (~next-char)
+    parse-error(s.parse-string,
+                "Unterminated escape sequence at index %d.",
+                s.parse-index - 1)
+  end;
   consume(s);
   select (next-char)
     '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' =>
       info.backreference-used := #t;
       make(<parsed-backreference>, group: digit-to-integer(next-char));
-
+      
+    // Hmm.  Why would you write \\n in your regex instead of \n?  It has the
+    // same effect.  Also, what about the rest of the Dylan character escapes?
+    // --cgay
     'n' =>   make(<parsed-character>, character: '\n');   // Newline
     't' =>   make(<parsed-character>, character: '\t');   // Tab
     'f' =>   make(<parsed-character>, character: '\f');   // Formfeed

Added: trunk/libraries/regular-expressions/regex.dylan
==============================================================================
--- (empty file)
+++ trunk/libraries/regular-expressions/regex.dylan	Mon Jun 18 03:27:19 2007
@@ -0,0 +1,199 @@
+Module: regular-expressions-impl
+Author: Carl Gay
+Synopsis: A new API for the regular-expressions library
+
+
+// Rename a few things...
+define constant <regex> = <parsed-regexp>;
+define constant <invalid-regex> = <illegal-regexp>;
+define constant invalid-regex-pattern = regexp-pattern;
+
+
+// Compile the given string into an optimized regular expression.
+//
+// @param case-sensitive -- Whether to be case sensivite.
+// @param verbose -- If true, allows you to write regular expressions that
+//   are easier to read by including whitespace and comments in them that
+//   will be ignored.
+//
+// @param multi-line -- If true, '^' matches at the beginning of the string and
+//   at the beginning of each line (immediately following each newline); and '$'
+//   matches at the end of the string and at the end of each line (immediately
+//   preceding each newline). By default, "^" matches only at the beginning of
+//   the string, and "$" only at the end of the string.
+//
+// @param dot-matches-newline -- Normally '.' matches any character except for
+//   newline.  If this parameter is true '.' matches newline as well.
+//
+// This function signals <invalid-regex> if the regular expression is invalid.
+//
+define sealed generic compile-regex
+    (string :: <string>,
+     #key case-sensitive  :: <boolean> = #t,
+          verbose         :: <boolean> = #f,
+          multi-line      :: <boolean> = #f,
+          dot-matches-all :: <boolean> = #f)
+ => (regex :: <regex>);
+
+define method compile-regex
+    (string :: <string>,
+     #key case-sensitive  :: <boolean> = #t,
+          verbose         :: <boolean> = #f,
+          multi-line      :: <boolean> = #f,
+          dot-matches-all :: <boolean> = #f)
+ => (regex :: <regex>)
+  parse(string,
+        make-parse-info(case-sensitive: case-sensitive,
+                        verbose: verbose,
+                        multi-line: multi-line,
+                        dot-matches-all: dot-matches-all))
+end;
+
+
+// Returns a <regex-match> containing info about a successful match, or #f if
+// no match was found.
+//
+// @param big -- The string in which to search.
+// @param pattern -- The pattern to search for.  If not a <regex>, it will be
+//   compiled first with compile-regex (implying that <invalid-regex> may be
+//   signalled), using the defaults for the keyword arguments.  If you wish
+//   to override them, call compile-regex directly.
+// @param anchored -- Whether or not the search should be anchored at the start
+//   position.  This is useful because "^..." will only match at the beginning
+//   of a string, or after \n if the regex was compiled with multi-line = #t.
+// @param start -- Where to begin the search.
+// @param end -- Where to stop searching.
+//
+// todo -- Should $ anchor at the provided end position or at the end of the string?
+//
+define sealed generic regex-search
+    (big :: <string>, pattern :: <object>,
+     #key anchored  :: <boolean> = #f,
+          start     :: <integer> = 0,
+          end: _end :: <integer> = big.size)
+ => (match :: false-or(<regex-match>));
+
+define method regex-search
+    (big :: <string>, pattern :: <string>,
+     #key anchored  :: <boolean> = #f,
+          start     :: <integer> = 0,
+          end: _end :: <integer> = big.size)
+ => (match :: false-or(<regex-match>))
+  regex-search(big, compile-regex(pattern),
+               anchored: anchored, start: start, end: _end)
+end method regex-search;
+
+define method regex-search
+    (big :: <string>, pattern :: <regex>,
+     #key anchored :: <boolean> = #f,
+          start    :: <integer> = 0,
+          end: _end :: <integer> = big.size)
+ => (match :: false-or(<regex-match>))
+  // Copied from regexp-position with some mods to match our interface.
+  // Unlike regexp-position there is no caching.  If you don't want to
+  // recompile your regex each time, compile it explicitly with compile-regex
+  // and save it.
+  let substring = make(<substring>, string: big, start: start, end: _end);
+  let case-sensitive? = #t;
+  let (matched?, marks)
+    = if (pattern.is-anchored?)
+        anchored-match-root?(pattern, substring, case-sensitive?, last-group + 1, #f);
+      else
+        let initial = pattern.initial-substring;
+        let searcher = ~initial.empty?
+          & make-substring-positioner(initial, case-sensitive: case-sensitive?);
+        match-root?(pattern, substring, case-sensitive?, last-group + 1, searcher);
+      end if;
+  if (matched?)
+    let regex-match = make(<regex-match>);
+    for (index from 0 by 2)
+      let bpos = marks[index];
+      let epos = marks[index + 1];
+      // It would be nice to make <substring> a real sequence, and possibly unify
+      // it with the substring implementation in Koala.
+      let text = copy-sequence(substring.entire-string,
+                               start: substring.start-index + bpos,
+                               end: substring.end-index + epos);
+      add-group(regex-match, make(<match-group>, text: text, start: bpos, end: epos));
+    end;
+    regex-match
+  else
+    #f
+  end
+end method regex-search;
+
+// This has methods for group :: <string> and group :: <integer>.
+// Group zero is always the entire match.
+define sealed generic regex-match-group
+    (match :: <regex-match>, group :: <object>)
+ => (text :: false-or(<string>),
+     start-index :: false-or(<integer>),
+     end-index :: false-or(<integer>));
+
+// How many groups matched?  There will always be at least one; the entire match.
+// (Maybe better to provide a way to iterate over the groups instead, but this
+// should be rarely used since you generally know what your groups are.)
+//
+define sealed generic regex-match-group-count
+    (match :: <regex-match>) => (count :: <integer>);
+
+
+define sealed class <match-group> (<object>)
+  constant slot group-text :: <string>,
+    required-init-keyword: text:;
+  constant slot group-start :: <integer>,
+    required-init-keyword: start:;
+  constant slot group-end :: <integer>,
+    required-init-keyword: end:;
+end class <match-group>;
+
+
+define sealed class <regex-match> (<object>)
+  // Groups by position.  Zero is the entire match.
+  constant slot group-vector :: <stretchy-vector> = make(<stretchy-vector>);
+  // Maps group names to positions.
+  constant slot group-table  :: <string-table> = make(<string-table>);
+end class <regex-match>;
+
+define method add-group
+    (match :: <regex-match>, group :: <match-group>,
+     #key name :: false-or(<string>))
+ => (match :: <regex-match>)
+  add!(match.group-vector, group);
+  if (name)
+    match.group-table[name] := group;
+  end;
+  match
+end;
+
+define sealed class <invalid-match-group> (<regex-error>)
+end class <invalid-match-group>;
+
+define method regex-match-group
+    (match :: <regex-match>, group :: <integer>)
+ => (text :: false-or(<string>),
+     start-index :: false-or(<integer>),
+     end-index :: false-or(<integer>))
+  if (0 <= group < match.group-vector.size)
+    match.group-vector[group]
+  else
+    signal(make(<invalid-match-group>,
+                format-string: "Match group index %d out of bounds.  Max group index is %d.",
+                format-arguments: list(group, match.group-vector.size - 1)));
+  end;
+end method regex-match-group;
+
+define method regex-match-group
+    (match :: <regex-match>, group :: <string>)
+ => (text :: false-or(<string>),
+     start-index :: false-or(<integer>),
+     end-index :: false-or(<integer>))
+  let index = element(match.group-table, group, default: #f);
+  if (index)
+    regex-match-group(match, index)
+  else
+    signal(make(<invalid-match-group>,
+                format-string: "There is no group named %=.",
+                format-arguments: list(group)));
+  end;
+end method regex-match-group;

Modified: trunk/libraries/regular-expressions/tests/regular-expressions-test-suite.dylan
==============================================================================
--- trunk/libraries/regular-expressions/tests/regular-expressions-test-suite.dylan	(original)
+++ trunk/libraries/regular-expressions/tests/regular-expressions-test-suite.dylan	Mon Jun 18 03:27:19 2007
@@ -8,6 +8,10 @@
 end function re/position;
 
 define test atom-test ()
+  // In current code the empty string is an illegal regex, but Python
+  // (and probably perl?) allow it, so I think we should consider that
+  // a bug.  --cgay
+  check-no-errors("atom-0", re/position("", ""));
   check-equal("atom-1", re/position("a", "a"),      #[0, 1]);
   check-equal("atom-2", re/position("a", "[a]"),    #[0, 1]);
   check-equal("atom-3", re/position("ab", "(a)b"),  #[0, 2, 0, 1]);
@@ -26,7 +30,7 @@
   check-condition("atom-F", <illegal-regexp>, re/position("", "a{,"));
   check-condition("atom-G", <illegal-regexp>, re/position("", "[a"));
   check-condition("atom-H", <illegal-regexp>, re/position("", "\\"));
-  check-equal("atom-tan", "\<65>", "A");
+  //check-equal("atom-tan", "\<44>\<79>\<6c>\<61>\<6e>", "Dylan");
 end;
 
 



More information about the chatter mailing list