[Gd-chatter] r11507 - in trunk/libraries/regular-expressions: . tests

cgay at gwydiondylan.org cgay at gwydiondylan.org
Fri Nov 30 17:04:29 CET 2007


Author: cgay
Date: Fri Nov 30 17:04:28 2007
New Revision: 11507

Modified:
   trunk/libraries/regular-expressions/parse.dylan
   trunk/libraries/regular-expressions/regex.dylan
   trunk/libraries/regular-expressions/tests/regression-tests.txt
Log:
Bug: 7357
Implemented named groups (?P<name>...) and non-capturing groups (?:...).
Comments (?#...) are there but broken for the same reason () is broken.

Modified: trunk/libraries/regular-expressions/parse.dylan
==============================================================================
--- trunk/libraries/regular-expressions/parse.dylan	(original)
+++ trunk/libraries/regular-expressions/parse.dylan	Fri Nov 30 17:04:28 2007
@@ -40,7 +40,11 @@
 //      <quantifier> ::= * | + | ? | {n} | {n,} | {n, m}
 //            (where n and m are decimal integers)
 //
-//      <atom> ::= (<regexp>) | <extended-character>
+//      <atom> ::= <subpattern> | <extended-character>
+//
+//      <subpattern> ::= (<options> <regexp>)
+//
+//      <options> ::= ?: | ?P<name> | ?P=name | ?# | etc
 //
 // See "Programming perl", p. 103-104 for more details.
 //
@@ -59,8 +63,16 @@
 
 // The root of the parsed regexp, i.e., this is what's returned by the parser.
 define class <regexp> (<mark>)
-  constant slot regexp-pattern :: <string>, required-init-keyword: #"pattern";
-  constant slot regexp-group-count :: <integer>, required-init-keyword: #"group-count";
+  // exported
+  constant slot regexp-pattern :: <string>,
+    required-init-keyword: pattern:;
+  // exported
+  constant slot regexp-group-count :: <integer>,
+    required-init-keyword: group-count:;
+  // internal.  This is only needed when making a <regexp-match> after
+  // a successful search.
+  constant slot group-number-to-name :: <table>,
+    required-init-keyword: group-number-to-name:;
 end class <regexp>;
 
 define class <union> (<parsed-regexp>)          //    |
@@ -125,20 +137,30 @@
               pattern: pattern));
 end function parse-error;
 
+define function not-yet-implemented (thing, #rest format-args)
+  let thing = if (~empty?(format-args))
+                apply(format-to-string, thing, format-args)
+              else
+                thing
+              end;
+  signal(make(<regexp-error>,
+              format-string: "The %s is not yet implemented.",
+              format-arguments: list(thing)));
+end;
+
 // <parse-info> contains some information about the current regexp
-// being parsed.  Using a structure is slightly nicer than having
-// global variables..
+// being parsed.
 //
 define class <parse-info> (<object>)
-  // Name this has-backreferences, for consistency with the other slots?
+  // Whether or not the function includes \1, \2, etc in the regexp.
+  // Name this has-backreferences, for consistency with the other slots.
   // Add ? to all the has-* slots.  --cgay
+  // Also, not sure why anyone cares about these three things.
   slot backreference-used :: <boolean>, init-value: #f;
-     // Whether or not the function includes \1, \2, etc in the regexp.
-     // This is different from return-marks, which determines whether the
-     // user wants to know about the marks.
   slot has-alternatives :: <boolean>, init-value: #f;
   slot has-quantifiers :: <boolean>, init-value: #f;
   slot current-group-number :: <integer>, init-value: 0;
+  constant slot group-number-to-name :: <table>, init-value: make(<table>);
   constant slot set-type :: <class>, required-init-keyword: #"set-type";
 end class <parse-info>;
 
@@ -148,14 +170,9 @@
           multi-line      :: <boolean> = #f,
           dot-matches-all :: <boolean> = #f)
  => (info :: <parse-info>)
-  local method nyi (option-name)
-          signal(make(<regexp-error>,
-                      format-string: "The '%s' option is not yet implemented.",
-                      format-arguments: list(option-name)));
-        end;
-  verbose & nyi("verbose");
-  multi-line & nyi("multi-line");
-  dot-matches-all & nyi("dot-matches-all");
+  verbose & not-yet-implemented("'verbose' option");
+  multi-line & not-yet-implemented("'multi-line' option");
+  dot-matches-all & not-yet-implemented("'dot-matches-all' option");
   let char-set-type
    = if (case-sensitive)
        <case-sensitive-character-set>
@@ -165,6 +182,18 @@
   make(<parse-info>, set-type: char-set-type)
 end function make-parse-info;
 
+define method has-named-group?
+    (info :: <parse-info>, name :: <string>)
+  block (return)
+    for (group-name in info.group-number-to-name)
+      if (name = group-name)
+        return(#t)
+      end;
+    end;
+    #f
+  end;
+end;
+
 define method parse
     (regexp :: <string>, parse-info :: <parse-info>)
  => (parsed-regexp :: <parsed-regexp>,
@@ -176,12 +205,13 @@
   let child = parse-regexp(parse-string, parse-info);
   let parse-tree = make(<regexp>,
                         pattern: regexp,
-                        group-count: parse-info.current-group-number + 1,
                         group: 0,
+                        group-count: parse-info.current-group-number + 1,
+                        group-number-to-name: parse-info.group-number-to-name,
 			child: child);
   let optimized-regexp = optimize(parse-tree);
   if (optimized-regexp.pathological?)
-    parse-error(regexp, "A sub-regexp that matches the empty string was quantified.");
+    parse-error(regexp, "A subpattern that matches the empty string was quantified.");
   else
     values(optimized-regexp,
 	   parse-info.current-group-number,
@@ -217,6 +247,9 @@
 
 define method parse-quantified-atom (s :: <parse-string>, info :: <parse-info>)
  => (result :: false-or(<parsed-regexp>))
+  // I think this breaks when parse-atom returns #f and then we quantify that.
+  // I added some regexes to regression-tests.txt starting with /a()b/ that I
+  // hope will exercise that case.  --cgay
   let atom = parse-atom(s, info);
   let char = lookahead(s);
   select (char by \=)
@@ -321,21 +354,119 @@
   end select;
 end method parse-atom;
 
+// Parse a subpattern, a.k.a. "group".  i.e., something delimited by parens.
+// The parse string is pointing at the character after the '('.
+//
 define inline function parse-group
-    (s :: <parse-string>, info :: <parse-info>)
- => (mark :: <mark>)
-  info.current-group-number := info.current-group-number + 1;
-  let this-group = info.current-group-number;
-  let regexp = parse-regexp(s, info);
-  if (lookahead(s) = ')')
-    consume(s);   // Consume ')'
-    make(<mark>, child: regexp, group: this-group)
+    (str :: <parse-string>, info :: <parse-info>)
+ => (mark :: false-or(<parsed-regexp>))
+  let char = lookahead(str);
+  if (char == '?')
+    consume(str);
+    parse-extended-group(str, info)
   else
-    parse-error(s.parse-string, "Unbalanced parens in regexp (index = %s).",
-                s.parse-index);
-  end;
+    parse-simple-group(str, info, #t, #f)
+  end
 end function parse-group;
 
+// Just saw "(?" so we need to parse a group with extended options.
+//
+define inline function parse-extended-group
+    (str :: <parse-string>, info :: <parse-info>)
+ => (mark :: false-or(<parsed-regexp>))
+  let char = lookahead(str);
+  consume(str);
+  select (char)
+    'P' =>                    // (?P named group constructs
+      let char = lookahead(str);
+      consume(str);
+      if (char == '=')
+        not-yet-implemented("(?P=name) construct");
+      elseif (char = '<')
+        parse-simple-group(str, info, #t, parse-group-name(str, info))
+      else
+        parse-error(str.parse-string,
+                    "Invalid named group syntax (index = %s).",
+                    str.parse-index);
+      end;
+
+    ':' =>                    // (?: doesn't save the group
+      parse-simple-group(str, info, #f, #f);
+
+    '#' =>                    // (?# for comments
+      while (lookahead(str) & lookahead(str) ~== ')')
+        consume(str);
+      end;
+      if (~ lookahead(str))
+        parse-error(str.parse-string, "Unterminated (?# comment.");
+      else
+        #f
+      end;
+
+    otherwise =>
+      // See the Python re docs for what all these do.
+      if (member?(char, "iLmsux#=!<("))
+        not-yet-implemented("'(?%c' subpattern construct", lookahead(str));
+      else
+        parse-error(str.parse-string, "Invalid (? construct at index %s.",
+                    str.parse-index);
+      end;
+  end select
+end function parse-extended-group;
+
+// Just saw "(?P<", so parse the name of this named group.
+//
+define function parse-group-name
+    (str :: <parse-string>, info :: <parse-info>)
+ => (name :: <string>)
+  let start-index = str.parse-index;
+  while (lookahead(str) & lookahead(str) ~== '>')
+    consume(str);
+  end;
+  if (lookahead(str) == '>')
+    consume(str);
+    copy-sequence(str.parse-string, start: start-index, end: str.parse-index - 1)
+  else
+    parse-error(str.parse-string, "Unterminated named group name at index %s.",
+                str.parse-index);
+  end
+end function parse-group-name;
+
+// Parse a group/subpattern, possibly after having already parsed any
+// options given via "(?...".
+//
+define inline function parse-simple-group
+    (str :: <parse-string>,
+     info :: <parse-info>,
+     save-group? :: <boolean>,
+     group-name :: false-or(<string>))
+ => (mark :: false-or(<parsed-regexp>))
+  if (save-group?)
+    info.current-group-number := info.current-group-number + 1;
+  end;
+  let regexp = parse-regexp(str, info);
+  if (lookahead(str) ~== ')')
+    parse-error(str.parse-string, "Unbalanced parens in regexp (index = %s).",
+                str.parse-index);
+  else
+    consume(str);
+    if (~ save-group?)
+      regexp
+    else
+      if (group-name)
+        if (has-named-group?(info, group-name))
+          parse-error(str.parse-string,
+                      "Duplicate group name (%s) at index %s.",
+                      group-name, str.parse-index);
+        else
+          info.group-number-to-name[info.current-group-number] := group-name;
+        end;
+      end;
+      make(<mark>, child: regexp, group: info.current-group-number)
+    end
+  end
+end function parse-simple-group;
+
 // This just does a quick scan to find the closing ] and then lets
 // make(<character-set>) do the real parsing.
 //
@@ -542,8 +673,8 @@
 // pathological regexp:
 //
 // First, realize that pathological regexps stem from infinitely
-// quantifying subregexps that could match the empty string.  So what
-// we do is find this subregexps, and perform the following
+// quantifying subpatterns that could match the empty string.  So what
+// we do is find this subpattern, and perform the following
 // transformation:
 //
 //  case (type of regexp)

Modified: trunk/libraries/regular-expressions/regex.dylan
==============================================================================
--- trunk/libraries/regular-expressions/regex.dylan	(original)
+++ trunk/libraries/regular-expressions/regex.dylan	Fri Nov 30 17:04:28 2007
@@ -108,7 +108,10 @@
       end if;
   if (matched?)
     let regexp-match = make(<regexp-match>, regular-expression: pattern);
+    let group-number-to-name :: <table> = pattern.group-number-to-name;
     for (index from 0 below marks.size by 2)
+      let group-number = floor/(index, 2);
+      let group-name = element(group-number-to-name, group-number, default: #f);
       let bpos = marks[index];
       let epos = marks[index + 1];
       if (bpos & epos)
@@ -117,10 +120,12 @@
         let text = copy-sequence(substring.entire-string,
                                  start: substring.start-index + bpos,
                                  end: substring.start-index + epos);
-        add-group(regexp-match, make(<match-group>, text: text, start: bpos, end: epos));
+        add-group(regexp-match,
+                  make(<match-group>, text: text, start: bpos, end: epos),
+                  group-name);
       else
         // This group wasn't matched.
-        add-group(regexp-match, #f)
+        add-group(regexp-match, #f, group-name);
       end;
     end;
     regexp-match
@@ -166,8 +171,9 @@
 end class <regexp-match>;
 
 define method add-group
-    (match :: <regexp-match>, group :: false-or(<match-group>),
-     #key name :: false-or(<string>))
+    (match :: <regexp-match>,
+     group :: false-or(<match-group>),
+     name :: false-or(<string>))
  => (match :: <regexp-match>)
   add!(match.groups-by-position, group);
   if (name)
@@ -210,12 +216,12 @@
  => (text :: false-or(<string>),
      start-index :: false-or(<integer>),
      end-index :: false-or(<integer>))
-  let index :: <integer> = element(match.groups-by-name, group, default: #f);
-  if (index)
-    regexp-match-group(match, index)
+  let group = element(match.groups-by-name, group, default: #f);
+  if (group)
+    values(group.group-text, group.group-start, group.group-end)
   else
     signal(make(<invalid-match-group>,
                 format-string: "There is no group named %=.",
                 format-arguments: list(group)));
-  end;
+  end
 end method regexp-match-group;

Modified: trunk/libraries/regular-expressions/tests/regression-tests.txt
==============================================================================
--- trunk/libraries/regular-expressions/tests/regression-tests.txt	(original)
+++ trunk/libraries/regular-expressions/tests/regression-tests.txt	Fri Nov 30 17:04:28 2007
@@ -20,3 +20,9 @@
     b]b
 No match
 
+/a()b/
+
+/a()+b/
+
+/a(#blah)b/
+



More information about the chatter mailing list