[Gd-chatter] r11471 - in trunk/libraries/regular-expressions: . tests

cgay at gwydiondylan.org cgay at gwydiondylan.org
Sat Oct 27 22:33:02 CEST 2007


Author: cgay
Date: Sat Oct 27 22:33:00 2007
New Revision: 11471

Added:
   trunk/libraries/regular-expressions/tests/new-api-test-suite.dylan   (contents, props changed)
   trunk/libraries/regular-expressions/tests/old-api-test-suite.dylan   (contents, props changed)
Modified:
   trunk/libraries/regular-expressions/interface.dylan
   trunk/libraries/regular-expressions/match.dylan
   trunk/libraries/regular-expressions/od-library.dylan
   trunk/libraries/regular-expressions/parse.dylan
   trunk/libraries/regular-expressions/regex.dylan
   trunk/libraries/regular-expressions/tests/library.dylan
   trunk/libraries/regular-expressions/tests/regular-expressions-test-suite.dylan
   trunk/libraries/regular-expressions/tests/regular-expressions-test-suite.lid
Log:
job: 7357
A little more progress on regular expressions library...
* Switched from "regex" to "regexp" nomenclature.
* Added a <regexp> class which is now the type returned by "parse" and
  contains the original pattern plus the group count.  The matcher wants
  to use the latter.
* Fixed bugs in regexp-search.


Modified: trunk/libraries/regular-expressions/interface.dylan
==============================================================================
--- trunk/libraries/regular-expressions/interface.dylan	(original)
+++ trunk/libraries/regular-expressions/interface.dylan	Sat Oct 27 22:33:00 2007
@@ -106,8 +106,8 @@
  => (equal? :: <function>, hash :: <function>);
   values(method (key1 :: <cache-key>, key2 :: <cache-key>) // equal?
 	  => res :: <boolean>;
-	   key1.regexp-string == key2.regexp-string
-	     & key1.character-set-type == key2.character-set-type;
+           key1.regexp-string = key2.regexp-string
+             & key1.character-set-type == key2.character-set-type;
 	 end method,
 	 method (key :: <cache-key>, initial-state) => (id :: <integer>, state); // hash()
 	   let (string-id, string-state) = object-hash(key.regexp-string, initial-state);

Modified: trunk/libraries/regular-expressions/match.dylan
==============================================================================
--- trunk/libraries/regular-expressions/match.dylan	(original)
+++ trunk/libraries/regular-expressions/match.dylan	Sat Oct 27 22:33:00 2007
@@ -108,7 +108,7 @@
 	      descend-re(re, target, case-sensitive?, index,
 			 marks, fail, list(up-proc));
 	      error("A regexp should either match or not match. Why did it "
-		      "reach this piece of code?");
+                    "reach this piece of code?");
 	    end block;
 	  end for;
 	  values(#f);      // Failure

Modified: trunk/libraries/regular-expressions/od-library.dylan
==============================================================================
--- trunk/libraries/regular-expressions/od-library.dylan	(original)
+++ trunk/libraries/regular-expressions/od-library.dylan	Sat Oct 27 22:33:00 2007
@@ -38,37 +38,38 @@
   use common-dylan;
   use string-extensions;
   export
-    regex,                                             // new API
+    regexp,                                            // new API
     regular-expressions;                               // old API
 end library regular-expressions;
 
-define module regex                  // new API module
+define module regexp                  // new API module
   create
-    compile-regex,
-    regex-search,
-    <regex>,
-    <invalid-regex>,
-      invalid-regex-pattern,
-    <regex-match>,                   // results of a successful search
-      regex-match-group,
-      regex-match-group-count,
+    compile-regexp,
+    regexp-search,
+    <regexp>,
+    <invalid-regexp>,
+      invalid-regexp-pattern,
+    <regexp-match>,                   // results of a successful search
+      regexp-match-group,
+      regexp-match-groups,
       group-start,
       group-end,
       group-text,
       <invalid-match-group>;
-end module regex;
+end module regexp;
 
 define module regular-expressions    // old API module
   create
     regexp-position, make-regexp-positioner,
     regexp-match,
     regexp-replace, make-regexp-replacer,
+    regexp-group-count,
     translate, make-translator,
     split, make-splitter,
     join,
     <illegal-regexp>,
-      regexp-pattern;
-
+      regexp-pattern,
+    <regexp-error>;
   create
     split-string;
 end module regular-expressions;
@@ -82,6 +83,7 @@
   use %do-replacement;
   use %parse-string;
   use substring-search;
-  use regular-expressions;                             // API module
-  use regex;
+  use regular-expressions;                      // old API module
+  use regexp;                                   // new API module
 end module regular-expressions-impl;
+

Modified: trunk/libraries/regular-expressions/parse.dylan
==============================================================================
--- trunk/libraries/regular-expressions/parse.dylan	(original)
+++ trunk/libraries/regular-expressions/parse.dylan	Sat Oct 27 22:33:00 2007
@@ -57,6 +57,12 @@
   constant slot group-number :: <integer>, required-init-keyword: #"group";
 end class <mark>;
 
+// The root of the parsed regexp
+define class <regexp> (<mark>)
+  constant slot regexp-pattern :: <string>, required-init-keyword: #"pattern";
+  constant slot regexp-group-count :: <integer>, required-init-keyword: #"group-count";
+end class <regexp>;
+
 define class <union> (<parsed-regexp>)          //    |
   slot left  :: <parsed-regexp>, required-init-keyword: #"left";
   slot right :: <parsed-regexp>, required-init-keyword: #"right";
@@ -99,10 +105,10 @@
 end class <parsed-backreference>;
 
 // Note: I'm pretty sure <simple-error> won't work in GD.  --cgay
-define class <regex-error> (<simple-error>)
-end class <regex-error>;
+define class <regexp-error> (<simple-error>)
+end class <regexp-error>;
 
-define class <illegal-regexp> (<regex-error>)
+define class <illegal-regexp> (<regexp-error>)
   constant slot regexp-pattern :: <string>, 
     required-init-keyword: #"pattern";
 end class <illegal-regexp>;
@@ -143,7 +149,7 @@
           dot-matches-all :: <boolean> = #f)
  => (info :: <parse-info>)
   local method nyi (option-name)
-          signal(make(<regex-error>,
+          signal(make(<regexp-error>,
                       format-string: "The '%s' option is not yet implemented.",
                       format-arguments: list(option-name)));
         end;
@@ -167,8 +173,12 @@
      alternatives? :: <boolean>, 
      quantifiers? :: <boolean>)
   let parse-string = make(<parse-string>, string: regexp);
-  let parse-tree = make(<mark>, group: 0, 
-			child: parse-regexp(parse-string, parse-info));
+  let child = parse-regexp(parse-string, parse-info);
+  let parse-tree = make(<regexp>,
+                        pattern: regexp,
+                        group-count: parse-info.current-group-number + 1,
+                        group: 0,
+			child: child);
   let optimized-regexp = optimize(parse-tree);
   if (optimized-regexp.pathological?)
     parse-error(regexp, "A sub-regexp that matches the empty string was quantified.");

Modified: trunk/libraries/regular-expressions/regex.dylan
==============================================================================
--- trunk/libraries/regular-expressions/regex.dylan	(original)
+++ trunk/libraries/regular-expressions/regex.dylan	Sat Oct 27 22:33:00 2007
@@ -4,9 +4,8 @@
 
 
 // Rename a few things...
-define constant <regex> = <parsed-regexp>;
-define constant <invalid-regex> = <illegal-regexp>;
-define constant invalid-regex-pattern = regexp-pattern;
+define constant <invalid-regexp> = <illegal-regexp>;
+define constant invalid-regexp-pattern = regexp-pattern;
 
 
 // Compile the given string into an optimized regular expression.
@@ -25,118 +24,121 @@
 // @param dot-matches-newline -- Normally '.' matches any character except for
 //   newline.  If this parameter is true '.' matches newline as well.
 //
-// This function signals <invalid-regex> if the regular expression is invalid.
+// This function signals <invalid-regexp> if the regular expression is invalid.
 //
-define sealed generic compile-regex
+define sealed generic compile-regexp
     (string :: <string>,
      #key case-sensitive  :: <boolean> = #t,
           verbose         :: <boolean> = #f,
           multi-line      :: <boolean> = #f,
           dot-matches-all :: <boolean> = #f)
- => (regex :: <regex>);
+ => (regexp :: <regexp>);
 
-define method compile-regex
+define method compile-regexp
     (string :: <string>,
      #key case-sensitive  :: <boolean> = #t,
           verbose         :: <boolean> = #f,
           multi-line      :: <boolean> = #f,
           dot-matches-all :: <boolean> = #f)
- => (regex :: <regex>)
+ => (regexp :: <regexp>)
   parse(string,
         make-parse-info(case-sensitive: case-sensitive,
                         verbose: verbose,
                         multi-line: multi-line,
                         dot-matches-all: dot-matches-all))
-end;
+end method compile-regexp;
 
 
-// Returns a <regex-match> containing info about a successful match, or #f if
+// Returns a <regexp-match> containing info about a successful match, or #f if
 // no match was found.
 //
 // @param big -- The string in which to search.
-// @param pattern -- The pattern to search for.  If not a <regex>, it will be
-//   compiled first with compile-regex (implying that <invalid-regex> may be
+// @param pattern -- The pattern to search for.  If not a <regexp>, it will be
+//   compiled first with compile-regexp (implying that <invalid-regexp> may be
 //   signalled), using the defaults for the keyword arguments.  If you wish
-//   to override them, call compile-regex directly.
+//   to override them, call compile-regexp directly.
 // @param anchored -- Whether or not the search should be anchored at the start
 //   position.  This is useful because "^..." will only match at the beginning
-//   of a string, or after \n if the regex was compiled with multi-line = #t.
+//   of a string, or after \n if the regexp was compiled with multi-line = #t.
 // @param start -- Where to begin the search.
 // @param end -- Where to stop searching.
 //
 // todo -- Should $ anchor at the provided end position or at the end of the string?
 //
-define sealed generic regex-search
+define sealed generic regexp-search
     (big :: <string>, pattern :: <object>,
      #key anchored  :: <boolean> = #f,
           start     :: <integer> = 0,
           end: _end :: <integer> = big.size)
- => (match :: false-or(<regex-match>));
+ => (match :: false-or(<regexp-match>));
 
-define method regex-search
+define method regexp-search
     (big :: <string>, pattern :: <string>,
      #key anchored  :: <boolean> = #f,
           start     :: <integer> = 0,
           end: _end :: <integer> = big.size)
- => (match :: false-or(<regex-match>))
-  regex-search(big, compile-regex(pattern),
+ => (match :: false-or(<regexp-match>))
+  regexp-search(big, compile-regexp(pattern),
                anchored: anchored, start: start, end: _end)
-end method regex-search;
+end method regexp-search;
 
-define method regex-search
-    (big :: <string>, pattern :: <regex>,
+define method regexp-search
+    (big :: <string>, pattern :: <regexp>,
      #key anchored :: <boolean> = #f,
           start    :: <integer> = 0,
           end: _end :: <integer> = big.size)
- => (match :: false-or(<regex-match>))
+ => (match :: false-or(<regexp-match>))
   // Copied from regexp-position with some mods to match our interface.
   // Unlike regexp-position there is no caching.  If you don't want to
-  // recompile your regex each time, compile it explicitly with compile-regex
+  // recompile your regexp each time, compile it explicitly with compile-regexp
   // and save it.
   let substring = make(<substring>, string: big, start: start, end: _end);
   let case-sensitive? = #t;
+  let num-groups = pattern.regexp-group-count;
   let (matched?, marks)
     = if (pattern.is-anchored?)
-        anchored-match-root?(pattern, substring, case-sensitive?, last-group + 1, #f);
+        anchored-match-root?(pattern, substring, case-sensitive?, num-groups, #f);
       else
         let initial = pattern.initial-substring;
         let searcher = ~initial.empty?
           & make-substring-positioner(initial, case-sensitive: case-sensitive?);
-        match-root?(pattern, substring, case-sensitive?, last-group + 1, searcher);
+        match-root?(pattern, substring, case-sensitive?, num-groups, searcher);
       end if;
   if (matched?)
-    let regex-match = make(<regex-match>);
-    for (index from 0 by 2)
+    let regexp-match = make(<regexp-match>);
+    for (index from 0 below marks.size by 2)
       let bpos = marks[index];
       let epos = marks[index + 1];
       // It would be nice to make <substring> a real sequence, and possibly unify
       // it with the substring implementation in Koala.
       let text = copy-sequence(substring.entire-string,
                                start: substring.start-index + bpos,
-                               end: substring.end-index + epos);
-      add-group(regex-match, make(<match-group>, text: text, start: bpos, end: epos));
+                               end: substring.start-index + epos);
+      add-group(regexp-match, make(<match-group>, text: text, start: bpos, end: epos));
     end;
-    regex-match
+    regexp-match
   else
     #f
   end
-end method regex-search;
+end method regexp-search;
 
 // This has methods for group :: <string> and group :: <integer>.
 // Group zero is always the entire match.
-define sealed generic regex-match-group
-    (match :: <regex-match>, group :: <object>)
+define sealed generic regexp-match-group
+    (match :: <regexp-match>, group :: <object>)
  => (text :: false-or(<string>),
      start-index :: false-or(<integer>),
      end-index :: false-or(<integer>));
 
-// How many groups matched?  There will always be at least one; the entire match.
-// (Maybe better to provide a way to iterate over the groups instead, but this
-// should be rarely used since you generally know what your groups are.)
+// Get the groups for the match.  There will always be at least one; the entire match.
 //
-define sealed generic regex-match-group-count
-    (match :: <regex-match>) => (count :: <integer>);
+define sealed generic regexp-match-groups
+    (match :: <regexp-match>) => (groups :: <sequence>);
 
+define method regexp-match-groups
+    (match :: <regexp-match>) => (groups :: <sequence>)
+  map-as(<simple-object-vector>, identity, match.groups-by-position)
+end;
 
 define sealed class <match-group> (<object>)
   constant slot group-text :: <string>,
@@ -147,53 +149,52 @@
     required-init-keyword: end:;
 end class <match-group>;
 
-
-define sealed class <regex-match> (<object>)
+define sealed class <regexp-match> (<object>)
   // Groups by position.  Zero is the entire match.
-  constant slot group-vector :: <stretchy-vector> = make(<stretchy-vector>);
-  // Maps group names to positions.
-  constant slot group-table  :: <string-table> = make(<string-table>);
-end class <regex-match>;
+  constant slot groups-by-position :: <stretchy-vector> = make(<stretchy-vector>);
+  constant slot groups-by-name :: <string-table> = make(<string-table>);
+end class <regexp-match>;
 
 define method add-group
-    (match :: <regex-match>, group :: <match-group>,
+    (match :: <regexp-match>, group :: <match-group>,
      #key name :: false-or(<string>))
- => (match :: <regex-match>)
-  add!(match.group-vector, group);
+ => (match :: <regexp-match>)
+  add!(match.groups-by-position, group);
   if (name)
-    match.group-table[name] := group;
+    match.groups-by-name[name] := group;
   end;
   match
 end;
 
-define sealed class <invalid-match-group> (<regex-error>)
+define sealed class <invalid-match-group> (<regexp-error>)
 end class <invalid-match-group>;
 
-define method regex-match-group
-    (match :: <regex-match>, group :: <integer>)
+define method regexp-match-group
+    (match :: <regexp-match>, group-number :: <integer>)
  => (text :: false-or(<string>),
      start-index :: false-or(<integer>),
      end-index :: false-or(<integer>))
-  if (0 <= group < match.group-vector.size)
-    match.group-vector[group]
+  if (0 <= group-number & group-number < match.groups-by-position.size)
+    let group = match.groups-by-position[group-number];
+    values(group.group-text, group.group-start, group.group-end)
   else
     signal(make(<invalid-match-group>,
                 format-string: "Match group index %d out of bounds.  Max group index is %d.",
-                format-arguments: list(group, match.group-vector.size - 1)));
+                format-arguments: list(group-number, match.groups-by-position.size - 1)));
   end;
-end method regex-match-group;
+end method regexp-match-group;
 
-define method regex-match-group
-    (match :: <regex-match>, group :: <string>)
+define method regexp-match-group
+    (match :: <regexp-match>, group :: <string>)
  => (text :: false-or(<string>),
      start-index :: false-or(<integer>),
      end-index :: false-or(<integer>))
-  let index = element(match.group-table, group, default: #f);
+  let index :: <integer> = element(match.groups-by-name, group, default: #f);
   if (index)
-    regex-match-group(match, index)
+    regexp-match-group(match, index)
   else
     signal(make(<invalid-match-group>,
                 format-string: "There is no group named %=.",
                 format-arguments: list(group)));
   end;
-end method regex-match-group;
+end method regexp-match-group;

Modified: trunk/libraries/regular-expressions/tests/library.dylan
==============================================================================
--- trunk/libraries/regular-expressions/tests/library.dylan	(original)
+++ trunk/libraries/regular-expressions/tests/library.dylan	Sat Oct 27 22:33:00 2007
@@ -3,18 +3,32 @@
 define library regular-expressions-test-suite
   use common-dylan;
   use testworks;
-  use regular-expressions;
-
+  use regular-expressions,
+    import: { regular-expressions, regexp };
   export
     regular-expressions-test-suite;
 end;
 
-define module regular-expressions-test-suite
+define module old-api-test-suite
   use common-dylan, exclude: { split };
   use regular-expressions;
   use testworks;
-
   export
     //pcre-test-suite,
-    regular-expressions-test-suite;
+    old-api-test-suite;
+end;
+
+define module new-api-test-suite
+  use common-dylan, exclude: { split };
+  use regexp;
+  use testworks;
+  export
+    new-api-test-suite;
+end;
+
+define module regular-expressions-test-suite
+  use testworks;
+  use old-api-test-suite;
+  use new-api-test-suite;
+  export regular-expressions-test-suite;
 end;

Added: trunk/libraries/regular-expressions/tests/new-api-test-suite.dylan
==============================================================================
--- (empty file)
+++ trunk/libraries/regular-expressions/tests/new-api-test-suite.dylan	Sat Oct 27 22:33:00 2007
@@ -0,0 +1,24 @@
+Module: new-api-test-suite
+Author: Carl Gay
+
+
+define test groups-test ()
+  let text = "My dog has fleas.";
+  let match = regexp-search(text, "My (dog)");
+
+  // The entire match is a group as well, so 2 groups.
+  check-equal("number of groups.", 2, regexp-match-groups(match).size);
+
+  let (txt, bpos, epos) = regexp-match-group(match, 1);
+  check-equal("group text", "dog", txt);
+  check-equal("group start", 3, bpos);
+  check-equal("group end", 6, epos);
+end test groups-test;
+
+define suite new-api-test-suite ()
+  test groups-test;
+//  test and-test;
+//  test or-test;
+//  test anchoring-test;
+//  test quantifier-test;
+end suite new-api-test-suite;

Added: trunk/libraries/regular-expressions/tests/old-api-test-suite.dylan
==============================================================================
--- (empty file)
+++ trunk/libraries/regular-expressions/tests/old-api-test-suite.dylan	Sat Oct 27 22:33:00 2007
@@ -0,0 +1,43 @@
+Module: old-api-test-suite
+Author: Carl Gay
+
+
+define function re/position (string, pattern, #rest args)
+  let (#rest marks) = apply(regexp-position, string, pattern, args);
+  marks
+end function re/position;
+
+define test atom-test ()
+  // In current code the empty string is an illegal regex, but Python
+  // (and probably perl?) allow it, so I think we should consider that
+  // a bug.  --cgay
+  check-no-errors("atom-0", re/position("", ""));
+  check-equal("atom-1", re/position("a", "a"),      #[0, 1]);
+  check-equal("atom-2", re/position("a", "[a]"),    #[0, 1]);
+  check-equal("atom-3", re/position("ab", "(a)b"),  #[0, 2, 0, 1]);
+  check-equal("atom-4", re/position("a", "\\w"),    #[0, 1]);
+  check-equal("atom-5", re/position("a", "."),      #[0, 1]);
+  check-equal("atom-6", re/position("a", "a{0}"),   #[0, 0]);
+  check-equal("atom-7", re/position("aa", "a{2}"),  #[0, 2]);
+  check-equal("atom-8", re/position("aa", "a{1,}"), #[0, 2]);
+  check-equal("atom-9", re/position("aaa", "a{1,8}"), #[0, 3]);
+  check-equal("atom-A", re/position("", "a{,}"),   #[0, 0]);
+  check-equal("atom-A1", re/position("aaaaaa", "a{,}"),   #[0, 6]);
+  check-condition("atom-B", <illegal-regexp>, re/position("", "a{m,n}"));
+  check-condition("atom-C", <illegal-regexp>, re/position("", "a{m,}"));
+  check-condition("atom-D", <illegal-regexp>, re/position("", "a{,n}"));
+  check-condition("atom-E", <illegal-regexp>, re/position("", "a{m}"));
+  check-condition("atom-F", <illegal-regexp>, re/position("", "a{,"));
+  check-condition("atom-G", <illegal-regexp>, re/position("", "[a"));
+  check-condition("atom-H", <illegal-regexp>, re/position("", "\\"));
+  //check-equal("atom-tan", "\<44>\<79>\<6c>\<61>\<6e>", "Dylan");
+end;
+
+
+define suite old-api-test-suite ()
+  test atom-test;
+//  test and-test;
+//  test or-test;
+//  test anchoring-test;
+//  test quantifier-test;
+end;

Modified: trunk/libraries/regular-expressions/tests/regular-expressions-test-suite.dylan
==============================================================================
--- trunk/libraries/regular-expressions/tests/regular-expressions-test-suite.dylan	(original)
+++ trunk/libraries/regular-expressions/tests/regular-expressions-test-suite.dylan	Sat Oct 27 22:33:00 2007
@@ -2,42 +2,7 @@
 Author: Carl Gay
 
 
-define function re/position (string, pattern, #rest args)
-  let (#rest marks) = apply(regexp-position, string, pattern, args);
-  marks
-end function re/position;
-
-define test atom-test ()
-  // In current code the empty string is an illegal regex, but Python
-  // (and probably perl?) allow it, so I think we should consider that
-  // a bug.  --cgay
-  check-no-errors("atom-0", re/position("", ""));
-  check-equal("atom-1", re/position("a", "a"),      #[0, 1]);
-  check-equal("atom-2", re/position("a", "[a]"),    #[0, 1]);
-  check-equal("atom-3", re/position("ab", "(a)b"),  #[0, 2, 0, 1]);
-  check-equal("atom-4", re/position("a", "\\w"),    #[0, 1]);
-  check-equal("atom-5", re/position("a", "."),      #[0, 1]);
-  check-equal("atom-6", re/position("a", "a{0}"),   #[0, 0]);
-  check-equal("atom-7", re/position("aa", "a{2}"),  #[0, 2]);
-  check-equal("atom-8", re/position("aa", "a{1,}"), #[0, 2]);
-  check-equal("atom-9", re/position("aaa", "a{1,8}"), #[0, 3]);
-  check-equal("atom-A", re/position("", "a{,}"),   #[0, 0]);
-  check-equal("atom-A1", re/position("aaaaaa", "a{,}"),   #[0, 6]);
-  check-condition("atom-B", <illegal-regexp>, re/position("", "a{m,n}"));
-  check-condition("atom-C", <illegal-regexp>, re/position("", "a{m,}"));
-  check-condition("atom-D", <illegal-regexp>, re/position("", "a{,n}"));
-  check-condition("atom-E", <illegal-regexp>, re/position("", "a{m}"));
-  check-condition("atom-F", <illegal-regexp>, re/position("", "a{,"));
-  check-condition("atom-G", <illegal-regexp>, re/position("", "[a"));
-  check-condition("atom-H", <illegal-regexp>, re/position("", "\\"));
-  //check-equal("atom-tan", "\<44>\<79>\<6c>\<61>\<6e>", "Dylan");
-end;
-
-
 define suite regular-expressions-test-suite ()
-  test atom-test;
-//  test and-test;
-//  test or-test;
-//  test anchoring-test;
-//  test quantifier-test;
+  suite old-api-test-suite;
+  suite new-api-test-suite;
 end;

Modified: trunk/libraries/regular-expressions/tests/regular-expressions-test-suite.lid
==============================================================================
--- trunk/libraries/regular-expressions/tests/regular-expressions-test-suite.lid	(original)
+++ trunk/libraries/regular-expressions/tests/regular-expressions-test-suite.lid	Sat Oct 27 22:33:00 2007
@@ -1,3 +1,5 @@
 library: regular-expressions-test-suite
 files: library
        regular-expressions-test-suite
+       old-api-test-suite
+       new-api-test-suite



More information about the chatter mailing list