[Gd-chatter] r11507 - in trunk/libraries/regular-expressions: . tests
cgay at gwydiondylan.org
cgay at gwydiondylan.org
Fri Nov 30 17:04:29 CET 2007
Author: cgay
Date: Fri Nov 30 17:04:28 2007
New Revision: 11507
Modified:
trunk/libraries/regular-expressions/parse.dylan
trunk/libraries/regular-expressions/regex.dylan
trunk/libraries/regular-expressions/tests/regression-tests.txt
Log:
Bug: 7357
Implemented named groups (?P<name>...) and non-capturing groups (?:...).
Comments (?#...) are there but broken for the same reason () is broken.
Modified: trunk/libraries/regular-expressions/parse.dylan
==============================================================================
--- trunk/libraries/regular-expressions/parse.dylan (original)
+++ trunk/libraries/regular-expressions/parse.dylan Fri Nov 30 17:04:28 2007
@@ -40,7 +40,11 @@
// <quantifier> ::= * | + | ? | {n} | {n,} | {n, m}
// (where n and m are decimal integers)
//
-// <atom> ::= (<regexp>) | <extended-character>
+// <atom> ::= <subpattern> | <extended-character>
+//
+// <subpattern> ::= (<options> <regexp>)
+//
+// <options> ::= ?: | ?P<name> | ?P=name | ?# | etc
//
// See "Programming perl", p. 103-104 for more details.
//
@@ -59,8 +63,16 @@
// The root of the parsed regexp, i.e., this is what's returned by the parser.
define class <regexp> (<mark>)
- constant slot regexp-pattern :: <string>, required-init-keyword: #"pattern";
- constant slot regexp-group-count :: <integer>, required-init-keyword: #"group-count";
+ // exported
+ constant slot regexp-pattern :: <string>,
+ required-init-keyword: pattern:;
+ // exported
+ constant slot regexp-group-count :: <integer>,
+ required-init-keyword: group-count:;
+ // internal. This is only needed when making a <regexp-match> after
+ // a successful search.
+ constant slot group-number-to-name :: <table>,
+ required-init-keyword: group-number-to-name:;
end class <regexp>;
define class <union> (<parsed-regexp>) // |
@@ -125,20 +137,30 @@
pattern: pattern));
end function parse-error;
+define function not-yet-implemented (thing, #rest format-args)
+ let thing = if (~empty?(format-args))
+ apply(format-to-string, thing, format-args)
+ else
+ thing
+ end;
+ signal(make(<regexp-error>,
+ format-string: "The %s is not yet implemented.",
+ format-arguments: list(thing)));
+end;
+
// <parse-info> contains some information about the current regexp
-// being parsed. Using a structure is slightly nicer than having
-// global variables..
+// being parsed.
//
define class <parse-info> (<object>)
- // Name this has-backreferences, for consistency with the other slots?
+ // Whether or not the function includes \1, \2, etc in the regexp.
+ // Name this has-backreferences, for consistency with the other slots.
// Add ? to all the has-* slots. --cgay
+ // Also, not sure why anyone cares about these three things.
slot backreference-used :: <boolean>, init-value: #f;
- // Whether or not the function includes \1, \2, etc in the regexp.
- // This is different from return-marks, which determines whether the
- // user wants to know about the marks.
slot has-alternatives :: <boolean>, init-value: #f;
slot has-quantifiers :: <boolean>, init-value: #f;
slot current-group-number :: <integer>, init-value: 0;
+ constant slot group-number-to-name :: <table>, init-value: make(<table>);
constant slot set-type :: <class>, required-init-keyword: #"set-type";
end class <parse-info>;
@@ -148,14 +170,9 @@
multi-line :: <boolean> = #f,
dot-matches-all :: <boolean> = #f)
=> (info :: <parse-info>)
- local method nyi (option-name)
- signal(make(<regexp-error>,
- format-string: "The '%s' option is not yet implemented.",
- format-arguments: list(option-name)));
- end;
- verbose & nyi("verbose");
- multi-line & nyi("multi-line");
- dot-matches-all & nyi("dot-matches-all");
+ verbose & not-yet-implemented("'verbose' option");
+ multi-line & not-yet-implemented("'multi-line' option");
+ dot-matches-all & not-yet-implemented("'dot-matches-all' option");
let char-set-type
= if (case-sensitive)
<case-sensitive-character-set>
@@ -165,6 +182,18 @@
make(<parse-info>, set-type: char-set-type)
end function make-parse-info;
+define method has-named-group?
+ (info :: <parse-info>, name :: <string>)
+ block (return)
+ for (group-name in info.group-number-to-name)
+ if (name = group-name)
+ return(#t)
+ end;
+ end;
+ #f
+ end;
+end;
+
define method parse
(regexp :: <string>, parse-info :: <parse-info>)
=> (parsed-regexp :: <parsed-regexp>,
@@ -176,12 +205,13 @@
let child = parse-regexp(parse-string, parse-info);
let parse-tree = make(<regexp>,
pattern: regexp,
- group-count: parse-info.current-group-number + 1,
group: 0,
+ group-count: parse-info.current-group-number + 1,
+ group-number-to-name: parse-info.group-number-to-name,
child: child);
let optimized-regexp = optimize(parse-tree);
if (optimized-regexp.pathological?)
- parse-error(regexp, "A sub-regexp that matches the empty string was quantified.");
+ parse-error(regexp, "A subpattern that matches the empty string was quantified.");
else
values(optimized-regexp,
parse-info.current-group-number,
@@ -217,6 +247,9 @@
define method parse-quantified-atom (s :: <parse-string>, info :: <parse-info>)
=> (result :: false-or(<parsed-regexp>))
+ // I think this breaks when parse-atom returns #f and then we quantify that.
+ // I added some regexes to regression-tests.txt starting with /a()b/ that I
+ // hope will exercise that case. --cgay
let atom = parse-atom(s, info);
let char = lookahead(s);
select (char by \=)
@@ -321,21 +354,119 @@
end select;
end method parse-atom;
+// Parse a subpattern, a.k.a. "group". i.e., something delimited by parens.
+// The parse string is pointing at the character after the '('.
+//
define inline function parse-group
- (s :: <parse-string>, info :: <parse-info>)
- => (mark :: <mark>)
- info.current-group-number := info.current-group-number + 1;
- let this-group = info.current-group-number;
- let regexp = parse-regexp(s, info);
- if (lookahead(s) = ')')
- consume(s); // Consume ')'
- make(<mark>, child: regexp, group: this-group)
+ (str :: <parse-string>, info :: <parse-info>)
+ => (mark :: false-or(<parsed-regexp>))
+ let char = lookahead(str);
+ if (char == '?')
+ consume(str);
+ parse-extended-group(str, info)
else
- parse-error(s.parse-string, "Unbalanced parens in regexp (index = %s).",
- s.parse-index);
- end;
+ parse-simple-group(str, info, #t, #f)
+ end
end function parse-group;
+// Just saw "(?" so we need to parse a group with extended options.
+//
+define inline function parse-extended-group
+ (str :: <parse-string>, info :: <parse-info>)
+ => (mark :: false-or(<parsed-regexp>))
+ let char = lookahead(str);
+ consume(str);
+ select (char)
+ 'P' => // (?P named group constructs
+ let char = lookahead(str);
+ consume(str);
+ if (char == '=')
+ not-yet-implemented("(?P=name) construct");
+ elseif (char = '<')
+ parse-simple-group(str, info, #t, parse-group-name(str, info))
+ else
+ parse-error(str.parse-string,
+ "Invalid named group syntax (index = %s).",
+ str.parse-index);
+ end;
+
+ ':' => // (?: doesn't save the group
+ parse-simple-group(str, info, #f, #f);
+
+ '#' => // (?# for comments
+ while (lookahead(str) & lookahead(str) ~== ')')
+ consume(str);
+ end;
+ if (~ lookahead(str))
+ parse-error(str.parse-string, "Unterminated (?# comment.");
+ else
+ #f
+ end;
+
+ otherwise =>
+ // See the Python re docs for what all these do.
+ if (member?(char, "iLmsux#=!<("))
+ not-yet-implemented("'(?%c' subpattern construct", lookahead(str));
+ else
+ parse-error(str.parse-string, "Invalid (? construct at index %s.",
+ str.parse-index);
+ end;
+ end select
+end function parse-extended-group;
+
+// Just saw "(?P<", so parse the name of this named group.
+//
+define function parse-group-name
+ (str :: <parse-string>, info :: <parse-info>)
+ => (name :: <string>)
+ let start-index = str.parse-index;
+ while (lookahead(str) & lookahead(str) ~== '>')
+ consume(str);
+ end;
+ if (lookahead(str) == '>')
+ consume(str);
+ copy-sequence(str.parse-string, start: start-index, end: str.parse-index - 1)
+ else
+ parse-error(str.parse-string, "Unterminated named group name at index %s.",
+ str.parse-index);
+ end
+end function parse-group-name;
+
+// Parse a group/subpattern, possibly after having already parsed any
+// options given via "(?...".
+//
+define inline function parse-simple-group
+ (str :: <parse-string>,
+ info :: <parse-info>,
+ save-group? :: <boolean>,
+ group-name :: false-or(<string>))
+ => (mark :: false-or(<parsed-regexp>))
+ if (save-group?)
+ info.current-group-number := info.current-group-number + 1;
+ end;
+ let regexp = parse-regexp(str, info);
+ if (lookahead(str) ~== ')')
+ parse-error(str.parse-string, "Unbalanced parens in regexp (index = %s).",
+ str.parse-index);
+ else
+ consume(str);
+ if (~ save-group?)
+ regexp
+ else
+ if (group-name)
+ if (has-named-group?(info, group-name))
+ parse-error(str.parse-string,
+ "Duplicate group name (%s) at index %s.",
+ group-name, str.parse-index);
+ else
+ info.group-number-to-name[info.current-group-number] := group-name;
+ end;
+ end;
+ make(<mark>, child: regexp, group: info.current-group-number)
+ end
+ end
+end function parse-simple-group;
+
// This just does a quick scan to find the closing ] and then lets
// make(<character-set>) do the real parsing.
//
@@ -542,8 +673,8 @@
// pathological regexp:
//
// First, realize that pathological regexps stem from infinitely
-// quantifying subregexps that could match the empty string. So what
-// we do is find this subregexps, and perform the following
+// quantifying subpatterns that could match the empty string. So what
+// we do is find this subpattern, and perform the following
// transformation:
//
// case (type of regexp)
Modified: trunk/libraries/regular-expressions/regex.dylan
==============================================================================
--- trunk/libraries/regular-expressions/regex.dylan (original)
+++ trunk/libraries/regular-expressions/regex.dylan Fri Nov 30 17:04:28 2007
@@ -108,7 +108,10 @@
end if;
if (matched?)
let regexp-match = make(<regexp-match>, regular-expression: pattern);
+ let group-number-to-name :: <table> = pattern.group-number-to-name;
for (index from 0 below marks.size by 2)
+ let group-number = floor/(index, 2);
+ let group-name = element(group-number-to-name, group-number, default: #f);
let bpos = marks[index];
let epos = marks[index + 1];
if (bpos & epos)
@@ -117,10 +120,12 @@
let text = copy-sequence(substring.entire-string,
start: substring.start-index + bpos,
end: substring.start-index + epos);
- add-group(regexp-match, make(<match-group>, text: text, start: bpos, end: epos));
+ add-group(regexp-match,
+ make(<match-group>, text: text, start: bpos, end: epos),
+ group-name);
else
// This group wasn't matched.
- add-group(regexp-match, #f)
+ add-group(regexp-match, #f, group-name);
end;
end;
regexp-match
@@ -166,8 +171,9 @@
end class <regexp-match>;
define method add-group
- (match :: <regexp-match>, group :: false-or(<match-group>),
- #key name :: false-or(<string>))
+ (match :: <regexp-match>,
+ group :: false-or(<match-group>),
+ name :: false-or(<string>))
=> (match :: <regexp-match>)
add!(match.groups-by-position, group);
if (name)
@@ -210,12 +216,12 @@
=> (text :: false-or(<string>),
start-index :: false-or(<integer>),
end-index :: false-or(<integer>))
- let index :: <integer> = element(match.groups-by-name, group, default: #f);
- if (index)
- regexp-match-group(match, index)
+ let group = element(match.groups-by-name, group, default: #f);
+ if (group)
+ values(group.group-text, group.group-start, group.group-end)
else
signal(make(<invalid-match-group>,
format-string: "There is no group named %=.",
format-arguments: list(group)));
- end;
+ end
end method regexp-match-group;
Modified: trunk/libraries/regular-expressions/tests/regression-tests.txt
==============================================================================
--- trunk/libraries/regular-expressions/tests/regression-tests.txt (original)
+++ trunk/libraries/regular-expressions/tests/regression-tests.txt Fri Nov 30 17:04:28 2007
@@ -20,3 +20,9 @@
b]b
No match
+/a()b/
+
+/a()+b/
+
+/a(#blah)b/
+
More information about the chatter
mailing list