[Gd-chatter] r11414 - in trunk/libraries/regular-expressions: . tests
cgay at gwydiondylan.org
cgay at gwydiondylan.org
Mon Jun 18 03:27:22 CEST 2007
Author: cgay
Date: Mon Jun 18 03:27:19 2007
New Revision: 11414
Added:
trunk/libraries/regular-expressions/regex.dylan (contents, props changed)
Modified:
trunk/libraries/regular-expressions/gd-library.dylan
trunk/libraries/regular-expressions/gd-regular-expressions.lid
trunk/libraries/regular-expressions/interface.dylan
trunk/libraries/regular-expressions/match.dylan
trunk/libraries/regular-expressions/od-library.dylan
trunk/libraries/regular-expressions/od-regular-expressions.lid
trunk/libraries/regular-expressions/parse.dylan
trunk/libraries/regular-expressions/tests/regular-expressions-test-suite.dylan
Log:
job: 7357
* Added a new API module, "regex".
* Improve error messages a bit.
* Parsing "\\" (unterminated escape) and "[..." now signal <invalid-regex>
* Parsing "" now signals <invalid-regex>. Python allows this, so I left
a failing test for it in the test suite.
* Improved min/max quantifier parsing a bit. {m,n}, {m,}, {,n}, {m}, {},
and {,} are all valid now.
Modified: trunk/libraries/regular-expressions/gd-library.dylan
==============================================================================
--- trunk/libraries/regular-expressions/gd-library.dylan (original)
+++ trunk/libraries/regular-expressions/gd-library.dylan Mon Jun 18 03:27:19 2007
@@ -32,32 +32,57 @@
//======================================================================
+// Added regex module with new API. --cgay, June 2007
+
define library regular-expressions
- use dylan;
- use collection-extensions;
+ use common-dylan;
use string-extensions;
use table-extensions;
export
- regular-expressions;
+ regex, // new API
+ regular-expressions; // old API
end library regular-expressions;
-define module regular-expressions
- use dylan;
- use extensions;
- use string-conversions;
- use character-type;
- use string-hacking;
- use subseq;
- use %do-replacement;
- use %parse-string;
- use substring-search;
- use table-extensions, import: { string-hash };
- export
+define module regex // new API module
+ create
+ compile-regex,
+ regex-search,
+ <regex>,
+ <invalid-regex>,
+ invalid-regex-pattern,
+ <regex-match>, // results of a successful search
+ regex-match-group,
+ regex-match-group-count,
+ group-start,
+ group-end,
+ group-text,
+ <invalid-match-group>;
+end module regex;
+
+define module regular-expressions // old API module
+ create
regexp-position, make-regexp-positioner,
+ regexp-match,
regexp-replace, make-regexp-replacer,
- regexp-match, regexp-matches,
translate, make-translator,
split, make-splitter,
join,
- <illegal-regexp>;
+ <illegal-regexp>,
+ regexp-pattern;
+
+ create
+ split-string;
end module regular-expressions;
+
+define module regular-expressions-impl
+ use common-dylan,
+ exclude: { split };
+ use string-conversions;
+ use character-type;
+ use string-hacking;
+ use %do-replacement;
+ use %parse-string;
+ use substring-search;
+ use regular-expressions; // API module
+ use regex;
+end module regular-expressions-impl;
Modified: trunk/libraries/regular-expressions/gd-regular-expressions.lid
==============================================================================
--- trunk/libraries/regular-expressions/gd-regular-expressions.lid (original)
+++ trunk/libraries/regular-expressions/gd-regular-expressions.lid Mon Jun 18 03:27:19 2007
@@ -4,3 +4,4 @@
match
parse
interface
+ regex
Modified: trunk/libraries/regular-expressions/interface.dylan
==============================================================================
--- trunk/libraries/regular-expressions/interface.dylan (original)
+++ trunk/libraries/regular-expressions/interface.dylan Mon Jun 18 03:27:19 2007
@@ -1,4 +1,4 @@
-module: regular-expressions
+module: regular-expressions-impl
author: Nick Kramer (nkramer at cs.cmu.edu)
synopsis: This provides a useable interface for users. Functions
defined outside this file are really too strange and quirky
@@ -58,6 +58,11 @@
// parse creating a <character-set>, which must be either case
// sensitive or case insensitive)
//
+
+// This caching scheme fails if we later introduce the ability to change
+// attributes such as case-sensitivity mid-parse, the way (I believe) perl
+// does? --cgay
+
// ### Currently, only regexp-position uses this cache, because the
// other functions are still using make-regexp-positioner. With
// caching, that make-regexp-whatever stuff should probably go.
@@ -101,23 +106,23 @@
=> (equal? :: <function>, hash :: <function>);
values(method (key1 :: <cache-key>, key2 :: <cache-key>) // equal?
=> res :: <boolean>;
- key1.regexp-string = key2.regexp-string
+ key1.regexp-string == key2.regexp-string
& key1.character-set-type == key2.character-set-type;
end method,
- method (key :: <cache-key>, initial-state)
- => (id :: <integer>, state); // hash()
- let (string-id, string-state)
- = string-hash(key.regexp-string, initial-state);
+ method (key :: <cache-key>, initial-state) => (id :: <integer>, state); // hash()
+ let (string-id, string-state) = object-hash(key.regexp-string, initial-state);
let (set-type-id, set-type-state)
= object-hash(key.character-set-type, string-state);
- let id = merge-hash-ids(string-id, set-type-id, ordered: #t);
- values(id, set-type-state);
+ values(merge-hash-ids(string-id, set-type-id, ordered: #t), set-type-state);
end method);
end method table-protocol;
// *regexp-cache* -- internal
//
// The only instance of <regexp-cache>. ### Not threadsafe.
+//
+// Technically not thread safe, but does it matter? Worst case seems to
+// be a duplicated regexp parse. --cgay
//
define constant *regexp-cache* = make(<regexp-cache>);
@@ -127,16 +132,17 @@
// parses it and adds it to the cache.
//
define inline function parse-or-use-cached
- (regexp :: <string>, character-set-type :: <class>)
+ (regexp :: <string>, parse-info :: <parse-info>)
=> (parsed-regexp :: <parsed-regexp>, last-group :: <integer>);
let key = make(<cache-key>, regexp-string: regexp,
- character-set-type: character-set-type);
+ character-set-type: parse-info.set-type);
let cached-value = element(*regexp-cache*, key, default: #f);
if (cached-value)
values(cached-value.parse-tree, cached-value.last-group);
else
- let (parsed-regexp, last-group) = parse(regexp, character-set-type);
- *regexp-cache*[key] := make(<cache-element>, parse-tree: parsed-regexp,
+ let (parsed-regexp, last-group) = parse(regexp, parse-info);
+ *regexp-cache*[key] := make(<cache-element>,
+ parse-tree: parsed-regexp,
last-group: last-group);
values(parsed-regexp, last-group);
end if;
@@ -155,13 +161,8 @@
=> (regexp-start :: false-or(<integer>), #rest marks :: false-or(<integer>));
let substring = make(<substring>, string: big, start: big-start,
end: big-end | big.size);
- let char-set-class = if (case-sensitive)
- <case-sensitive-character-set>;
- else
- <case-insensitive-character-set>;
- end if;
let (parsed-regexp, last-group)
- = parse-or-use-cached(regexp, char-set-class);
+ = parse-or-use-cached(regexp, make-parse-info(case-sensitive: case-sensitive));
let (matched, marks)
= if (parsed-regexp.is-anchored?)
@@ -220,6 +221,8 @@
apply(values, result)
end;
+// #if (have-free-time)
+/*
// regexp-matches -- exported
//
// A more convenient form of regexp-position. Usually you want
@@ -232,28 +235,47 @@
(big :: <string>, regexp :: <string>,
#key start: start-index :: <integer> = 0,
end: end-index :: false-or(<integer>),
- case-sensitive :: <boolean> = #f)
-
- let (regexp-start, lemon, #rest marks)
+ case-sensitive :: <boolean> = #f,
+ groups :: false-or(<sequence>))
+ => (#rest group-strings :: false-or(<string>));
+ if (~groups)
+ error("Mandatory keyword groups: not used in call to regexp-matches");
+ end if;
+ let (#rest marks)
= regexp-position(big, regexp, start: start-index, end: end-index,
case-sensitive: case-sensitive);
-
- let return-size = floor/(marks.size, 2);
- let return = make(<vector>, size: return-size, fill: #f);
- if (regexp-start)
- // all groups associate by index in the result
-
- for (index from 0 below return-size)
- let pos = index * 2;
- if (element(marks, pos, default: #f))
- // "14 0", "4 5", "7 8", "9 10" for "this is a test"
- //return[index] := concatenate(integer-to-string(marks[pos]), concatenate(" ", integer-to-string(marks[pos + 1])));
- return[index] := copy-sequence(big, start: marks[pos], end: marks[pos + 1]);
- end if;
+ let return-val = make(<vector>, size: groups.size, fill: #f);
+ for (index from 0 below return-val.size)
+ let group-start = groups[index] * 2;
+ let group-end = group-start + 1;
+ if (element(marks, group-start, default: #f))
+ return-val[index] := copy-sequence(big, start:
+
+ let sz = floor/(marks.size, 2);
+ let return = make(<vector>, size: sz, fill: #f);
+ for (index from 0 below sz)
+ let pos = index * 2;
+ if (element(marks, pos, default: #f))
+ return[index] := copy-sequence(big, start: marks[pos],
+ end: marks[pos + 1]);
+ end if;
+ end for;
+ if (matches)
+ let return = make(<vector>, size: matches.size * 2);
+ for (raw-pos in matches, index from 0)
+ let src-pos = raw-pos * 2;
+ let dest-pos = index * 2;
+ return[dest-pos] := element(marks, src-pos, default: #f);
+ return[dest-pos + 1] := element(marks, src-pos + 1, default: #f);
end for;
+ apply(values, return);
+ else
+
+ apply(values, marks);
end if;
- apply(values, return);
-end function;
+
+// #endif
+*/
// Functions based on regexp-position
@@ -446,7 +468,7 @@
end method;
end function make-splitter;
-// Used by split. Not exported.
+// Used by split. Not exported. (Yes it is. --cgay)
//
define function split-string
(positioner :: <function>, input :: <string>, start :: <integer>,
Modified: trunk/libraries/regular-expressions/match.dylan
==============================================================================
--- trunk/libraries/regular-expressions/match.dylan (original)
+++ trunk/libraries/regular-expressions/match.dylan Mon Jun 18 03:27:19 2007
@@ -1,4 +1,4 @@
-module: regular-expressions
+module: regular-expressions-impl
author: Nick Kramer (nkramer at cs.cmu.edu)
synopsis: This takes a parsed regular expression and tries to find a match
for it.
Modified: trunk/libraries/regular-expressions/od-library.dylan
==============================================================================
--- trunk/libraries/regular-expressions/od-library.dylan (original)
+++ trunk/libraries/regular-expressions/od-library.dylan Mon Jun 18 03:27:19 2007
@@ -32,33 +32,56 @@
//======================================================================
+// Added regex module with new API. --cgay, June 2007
+
define library regular-expressions
- use functional-dylan;
+ use common-dylan;
use string-extensions;
export
- regular-expressions;
+ regex, // new API
+ regular-expressions; // old API
end library regular-expressions;
-define module regular-expressions
- use functional-dylan, exclude: { split };
- use dylan-extensions, import: { string-hash };
- // use extensions;
- use string-conversions;
- use character-type;
- use string-hacking;
- // use subseq;
- use %do-replacement;
- use %parse-string;
- use substring-search;
- export
+define module regex // new API module
+ create
+ compile-regex,
+ regex-search,
+ <regex>,
+ <invalid-regex>,
+ invalid-regex-pattern,
+ <regex-match>, // results of a successful search
+ regex-match-group,
+ regex-match-group-count,
+ group-start,
+ group-end,
+ group-text,
+ <invalid-match-group>;
+end module regex;
+
+define module regular-expressions // old API module
+ create
regexp-position, make-regexp-positioner,
+ regexp-match,
regexp-replace, make-regexp-replacer,
- regexp-match, regexp-matches,
translate, make-translator,
split, make-splitter,
join,
- <illegal-regexp>;
+ <illegal-regexp>,
+ regexp-pattern;
- export
+ create
split-string;
end module regular-expressions;
+
+define module regular-expressions-impl
+ use common-dylan,
+ exclude: { split };
+ use string-conversions;
+ use character-type;
+ use string-hacking;
+ use %do-replacement;
+ use %parse-string;
+ use substring-search;
+ use regular-expressions; // API module
+ use regex;
+end module regular-expressions-impl;
Modified: trunk/libraries/regular-expressions/od-regular-expressions.lid
==============================================================================
--- trunk/libraries/regular-expressions/od-regular-expressions.lid (original)
+++ trunk/libraries/regular-expressions/od-regular-expressions.lid Mon Jun 18 03:27:19 2007
@@ -3,4 +3,4 @@
match.dylan
parse.dylan
interface.dylan
-
+ regex.dylan
Modified: trunk/libraries/regular-expressions/parse.dylan
==============================================================================
--- trunk/libraries/regular-expressions/parse.dylan (original)
+++ trunk/libraries/regular-expressions/parse.dylan Mon Jun 18 03:27:19 2007
@@ -1,4 +1,4 @@
-module: regular-expressions
+module: regular-expressions-impl
author: Nick Kramer (nkramer at cs.cmu.edu)
copyright: see below
@@ -98,11 +98,34 @@
constant slot group-number :: <integer>, required-init-keyword: #"group";
end class <parsed-backreference>;
+// Note: I'm pretty sure <simple-error> won't work in GD. --cgay
+define class <regex-error> (<simple-error>)
+end class <regex-error>;
+
+define class <illegal-regexp> (<regex-error>)
+ constant slot regexp-pattern :: <string>,
+ required-init-keyword: #"pattern";
+end class <illegal-regexp>;
+
+define sealed domain make (singleton(<illegal-regexp>));
+define sealed domain initialize (<illegal-regexp>);
+
+define function parse-error
+ (pattern :: <string>, format-string :: <string>, #rest format-args)
+ let msg = apply(format-to-string, format-string, format-args);
+ signal(make(<illegal-regexp>,
+ format-string: "Invalid regular expression: %=. %s",
+ format-arguments: list(pattern, msg),
+ pattern: pattern));
+end function parse-error;
+
// <parse-info> contains some information about the current regexp
// being parsed. Using a structure is slightly nicer than having
// global variables..
//
define class <parse-info> (<object>)
+ // Name this has-backreferences, for consistency with the other slots?
+ // Add ? to all the has-* slots. --cgay
slot backreference-used :: <boolean>, init-value: #f;
// Whether or not the function includes \1, \2, etc in the regexp.
// This is different from return-marks, which determines whether the
@@ -113,36 +136,42 @@
constant slot set-type :: <class>, required-init-keyword: #"set-type";
end class <parse-info>;
-define class <illegal-regexp> (<error>)
- constant slot regular-expression :: <string>,
- required-init-keyword: #"regexp";
-end class <illegal-regexp>;
-
-define sealed domain make (singleton(<illegal-regexp>));
-define sealed domain initialize (<illegal-regexp>);
-
-// cgay todo
-/* KJP: Doesn't work this way in Functional Developer.
-define sealed method report-condition (cond :: <illegal-regexp>, stream) => ();
- condition-format(stream, "Illegal regular expression: \n"
- "A sub-regexp that matches the empty string has"
- " been quantified in\n %s",
- cond.regular-expression);
-end method report-condition;
-*/
-//ignorable(regular-expression);
-
-define method parse (regexp :: <string>, character-set-type :: <class>)
- => (parsed-regexp :: <parsed-regexp>, last-group :: <integer>,
- backrefs? :: <boolean>, alternatives? :: <boolean>,
- quantifiers? :: <boolean>);
- let parse-info = make(<parse-info>, set-type: character-set-type);
+define function make-parse-info
+ (#key case-sensitive :: <boolean> = #t,
+ verbose :: <boolean> = #f,
+ multi-line :: <boolean> = #f,
+ dot-matches-all :: <boolean> = #f)
+ => (info :: <parse-info>)
+ local method nyi (option-name)
+ signal(make(<regex-error>,
+ format-string: "The '%s' option is not yet implemented.",
+ format-arguments: list(option-name)));
+ end;
+ verbose & nyi("verbose");
+ multi-line & nyi("multi-line");
+ dot-matches-all & nyi("dot-matches-all");
+ let char-set-type
+ = if (case-sensitive)
+ <case-sensitive-character-set>
+ else
+ <case-insensitive-character-set>
+ end;
+ make(<parse-info>, set-type: char-set-type)
+end function make-parse-info;
+
+define method parse
+ (regexp :: <string>, parse-info :: <parse-info>)
+ => (parsed-regexp :: <parsed-regexp>,
+ last-group :: <integer>,
+ backrefs? :: <boolean>,
+ alternatives? :: <boolean>,
+ quantifiers? :: <boolean>)
let parse-string = make(<parse-string>, string: regexp);
let parse-tree = make(<mark>, group: 0,
child: parse-regexp(parse-string, parse-info));
let optimized-regexp = optimize(parse-tree);
if (optimized-regexp.pathological?)
- signal(make(<illegal-regexp>, regexp: regexp));
+ parse-error(regexp, "A sub-regexp that matches the empty string was quantified.");
else
values(optimized-regexp,
parse-info.current-group-number,
@@ -155,7 +184,9 @@
define method parse-regexp (s :: <parse-string>, info :: <parse-info>)
=> parsed-regexp :: <parsed-regexp>;
let alternative = parse-alternative(s, info);
- if (lookahead(s) = '|')
+ if (~alternative)
+ parse-error(s.parse-string, "");
+ elseif (lookahead(s) = '|')
info.has-alternatives := #t;
make(<union>, left: alternative, right: parse-regexp(consume(s), info));
else
@@ -163,8 +194,9 @@
end if;
end method parse-regexp;
-define method parse-alternative (s :: <parse-string>, info :: <parse-info>)
- => parsed-regexp :: <parsed-regexp>;
+define method parse-alternative
+ (s :: <parse-string>, info :: <parse-info>)
+ => (re :: false-or(<parsed-regexp>))
let term = parse-quantified-atom(s, info);
if (member?(lookahead(s), #(#f, '|', ')')))
term;
@@ -174,7 +206,7 @@
end method parse-alternative;
define method parse-quantified-atom (s :: <parse-string>, info :: <parse-info>)
- => parsed-regexp :: false-or(<parsed-regexp>);
+ => (result :: false-or(<parsed-regexp>))
let atom = parse-atom(s, info);
let char = lookahead(s);
select (char by \=)
@@ -196,53 +228,52 @@
'{' =>
info.has-quantifiers := #t;
consume(s);
- let first-string = make(<deque>);
- let second-string = make(<deque>);
- let has-comma = #f;
- for (c = lookahead(s) then lookahead(s), until: c = '}')
- consume(s);
- if (c = ',')
- has-comma := #t;
- elseif (has-comma)
- push-last(second-string, c);
- else
- push-last(first-string, c);
- end if;
- end for;
- consume(s); // Eat closing brace
- let first-num = string-to-integer(as(<byte-string>, first-string));
- make(<quantified-atom>, atom: atom,
- min: first-num,
- max: if (~has-comma)
- first-num
- elseif (empty?(second-string))
- #f
- else
- string-to-integer(as(<byte-string>, second-string))
- end if);
+ parse-minmax-quantifier(atom, s);
otherwise =>
atom;
end select;
end method parse-quantified-atom;
+// {m,n}, {m,}, {,n}, {m}, {}, and {,} are all valid.
+// m defaults to 0 and n defaults to #f (unlimited).
+define method parse-minmax-quantifier
+ (atom :: <parsed-regexp>, s :: <parse-string>) => (qatom :: <quantified-atom>)
+ local method parse-integer () => (int :: false-or(<integer>))
+ let digits = make(<deque>);
+ while (lookahead(s) & digit?(lookahead(s)))
+ push-last(digits, lookahead(s));
+ consume(s);
+ end;
+ ~empty?(digits) & string-to-integer(as(<byte-string>, digits));
+ end method parse-integer;
+ let qmin = parse-integer() | 0;
+ let qmax = #f;
+ if (lookahead(s) = ',')
+ consume(s);
+ qmax := parse-integer();
+ else
+ qmax := qmin;
+ end;
+ if (lookahead(s) ~= '}')
+ parse-error(s.parse-string,
+ "Close brace expected in {m,n} quantifier (index = %s).",
+ s.parse-index);
+ end;
+ consume(s);
+ make(<quantified-atom>, atom: atom, min: qmin, max: qmax);
+end method parse-minmax-quantifier;
+
define method parse-atom (s :: <parse-string>, info :: <parse-info>)
- => parsed-regexp :: false-or(<parsed-regexp>);
+ => (regexp :: false-or(<parsed-regexp>))
let char = lookahead(s);
select (char)
'(' =>
consume(s); // Consume beginning paren
- info.current-group-number := info.current-group-number + 1;
- let this-group = info.current-group-number;
- let regexp = parse-regexp(s, info);
- if (lookahead(s) ~= ')')
- error("Unbalanced parens in regexp");
- end if;
- consume(s); // Consume end paren
- make(<mark>, child: regexp, group: this-group);
+ parse-group(s, info);
')' =>
- #f; // Need something to terminate upon seeing a close paren
+ #f; // Need something to terminate upon seeing a close paren
#f =>
#f; // Signal error? (end of stream)
@@ -252,29 +283,13 @@
'\\' =>
consume(s); // Consume the backslash
+ // Perhaps add support for a different escape character to aid readability.
+ // The escape character could be specified in the <parse-info>. --cgay
parse-escaped-character(s, info);
'[' =>
consume(s); // Eat the opening brace
- let set-string = make(<deque>); // Need something that'll
- // preserve the right ordering
- for (char = lookahead(s) then lookahead(s), until: char == ']')
- consume(s); // eat char
- if (char ~== '\\')
- push-last(set-string, char);
- else
- let char2 = lookahead(s);
- consume(s); // Eat escaped char
- if (char2 == ']')
- push-last(set-string, ']');
- else
- push-last(set-string, '\\');
- push-last(set-string, char2);
- end if;
- end if;
- end for;
- consume(s); // Eat ending brace
- make(<parsed-set>, set: make(info.set-type, description: set-string));
+ parse-character-set(s, info);
'.' =>
consume(s);
@@ -297,10 +312,57 @@
end select;
end method parse-atom;
+define inline function parse-group
+ (s :: <parse-string>, info :: <parse-info>)
+ => (mark :: <mark>)
+ info.current-group-number := info.current-group-number + 1;
+ let this-group = info.current-group-number;
+ let regexp = parse-regexp(s, info);
+ if (lookahead(s) = ')')
+ consume(s); // Consume ')'
+ make(<mark>, child: regexp, group: this-group)
+ else
+ parse-error(s.parse-string, "Unbalanced parens in regexp (index = %s).",
+ s.parse-index);
+ end;
+end function parse-group;
+
+// This just does a quick scan to find the closing ] and then lets
+// make(<character-set>) do the real parsing.
+//
+define inline function parse-character-set
+ (s :: <parse-string>, info :: <parse-info>)
+ => (set :: <parsed-set>)
+ let set-string = make(<deque>); // Need something that'll
+ // preserve the right ordering
+ let start-index = s.parse-index;
+ for (char = lookahead(s) then lookahead(s),
+ until: char == ']')
+ consume(s); // eat char
+ if (~char)
+ parse-error(s.parse-string,
+ "Unterminated character set at index %d.",start-index);
+ elseif (char ~== '\\')
+ push-last(set-string, char);
+ else
+ let char2 = lookahead(s);
+ consume(s); // Eat escaped char
+ if (char2 == ']')
+ push-last(set-string, ']');
+ else
+ push-last(set-string, '\\');
+ push-last(set-string, char2);
+ end if;
+ end if;
+ end for;
+ consume(s); // Eat ending brace
+ make(<parsed-set>, set: make(info.set-type, description: set-string));
+end function parse-character-set;
+
define constant any-char
= make(<case-sensitive-character-set>, description: "^\n");
-// The useful definitions of all these is in as(<character-set>)
+// The useful definitions of all these are in as(<character-set>).
//
define constant digit-chars
= make(<case-sensitive-character-set>, description: "\\d");
@@ -328,12 +390,20 @@
(s :: <parse-string>, info :: <parse-info>)
=> parsed-regexp :: <parsed-regexp>;
let next-char = lookahead(s);
+ if (~next-char)
+ parse-error(s.parse-string,
+ "Unterminated escape sequence at index %d.",
+ s.parse-index - 1)
+ end;
consume(s);
select (next-char)
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9' =>
info.backreference-used := #t;
make(<parsed-backreference>, group: digit-to-integer(next-char));
-
+
+ // Hmm. Why would you write \\n in your regex instead of \n? It has the
+ // same effect. Also, what about the rest of the Dylan character escapes?
+ // --cgay
'n' => make(<parsed-character>, character: '\n'); // Newline
't' => make(<parsed-character>, character: '\t'); // Tab
'f' => make(<parsed-character>, character: '\f'); // Formfeed
Added: trunk/libraries/regular-expressions/regex.dylan
==============================================================================
--- (empty file)
+++ trunk/libraries/regular-expressions/regex.dylan Mon Jun 18 03:27:19 2007
@@ -0,0 +1,199 @@
+Module: regular-expressions-impl
+Author: Carl Gay
+Synopsis: A new API for the regular-expressions library
+
+
+// Rename a few things...
+define constant <regex> = <parsed-regexp>;
+define constant <invalid-regex> = <illegal-regexp>;
+define constant invalid-regex-pattern = regexp-pattern;
+
+
+// Compile the given string into an optimized regular expression.
+//
+// @param case-sensitive -- Whether to be case sensivite.
+// @param verbose -- If true, allows you to write regular expressions that
+// are easier to read by including whitespace and comments in them that
+// will be ignored.
+//
+// @param multi-line -- If true, '^' matches at the beginning of the string and
+// at the beginning of each line (immediately following each newline); and '$'
+// matches at the end of the string and at the end of each line (immediately
+// preceding each newline). By default, "^" matches only at the beginning of
+// the string, and "$" only at the end of the string.
+//
+// @param dot-matches-newline -- Normally '.' matches any character except for
+// newline. If this parameter is true '.' matches newline as well.
+//
+// This function signals <invalid-regex> if the regular expression is invalid.
+//
+define sealed generic compile-regex
+ (string :: <string>,
+ #key case-sensitive :: <boolean> = #t,
+ verbose :: <boolean> = #f,
+ multi-line :: <boolean> = #f,
+ dot-matches-all :: <boolean> = #f)
+ => (regex :: <regex>);
+
+define method compile-regex
+ (string :: <string>,
+ #key case-sensitive :: <boolean> = #t,
+ verbose :: <boolean> = #f,
+ multi-line :: <boolean> = #f,
+ dot-matches-all :: <boolean> = #f)
+ => (regex :: <regex>)
+ parse(string,
+ make-parse-info(case-sensitive: case-sensitive,
+ verbose: verbose,
+ multi-line: multi-line,
+ dot-matches-all: dot-matches-all))
+end;
+
+
+// Returns a <regex-match> containing info about a successful match, or #f if
+// no match was found.
+//
+// @param big -- The string in which to search.
+// @param pattern -- The pattern to search for. If not a <regex>, it will be
+// compiled first with compile-regex (implying that <invalid-regex> may be
+// signalled), using the defaults for the keyword arguments. If you wish
+// to override them, call compile-regex directly.
+// @param anchored -- Whether or not the search should be anchored at the start
+// position. This is useful because "^..." will only match at the beginning
+// of a string, or after \n if the regex was compiled with multi-line = #t.
+// @param start -- Where to begin the search.
+// @param end -- Where to stop searching.
+//
+// todo -- Should $ anchor at the provided end position or at the end of the string?
+//
+define sealed generic regex-search
+ (big :: <string>, pattern :: <object>,
+ #key anchored :: <boolean> = #f,
+ start :: <integer> = 0,
+ end: _end :: <integer> = big.size)
+ => (match :: false-or(<regex-match>));
+
+define method regex-search
+ (big :: <string>, pattern :: <string>,
+ #key anchored :: <boolean> = #f,
+ start :: <integer> = 0,
+ end: _end :: <integer> = big.size)
+ => (match :: false-or(<regex-match>))
+ regex-search(big, compile-regex(pattern),
+ anchored: anchored, start: start, end: _end)
+end method regex-search;
+
+define method regex-search
+ (big :: <string>, pattern :: <regex>,
+ #key anchored :: <boolean> = #f,
+ start :: <integer> = 0,
+ end: _end :: <integer> = big.size)
+ => (match :: false-or(<regex-match>))
+ // Copied from regexp-position with some mods to match our interface.
+ // Unlike regexp-position there is no caching. If you don't want to
+ // recompile your regex each time, compile it explicitly with compile-regex
+ // and save it.
+ let substring = make(<substring>, string: big, start: start, end: _end);
+ let case-sensitive? = #t;
+ let (matched?, marks)
+ = if (pattern.is-anchored?)
+ anchored-match-root?(pattern, substring, case-sensitive?, last-group + 1, #f);
+ else
+ let initial = pattern.initial-substring;
+ let searcher = ~initial.empty?
+ & make-substring-positioner(initial, case-sensitive: case-sensitive?);
+ match-root?(pattern, substring, case-sensitive?, last-group + 1, searcher);
+ end if;
+ if (matched?)
+ let regex-match = make(<regex-match>);
+ for (index from 0 by 2)
+ let bpos = marks[index];
+ let epos = marks[index + 1];
+ // It would be nice to make <substring> a real sequence, and possibly unify
+ // it with the substring implementation in Koala.
+ let text = copy-sequence(substring.entire-string,
+ start: substring.start-index + bpos,
+ end: substring.end-index + epos);
+ add-group(regex-match, make(<match-group>, text: text, start: bpos, end: epos));
+ end;
+ regex-match
+ else
+ #f
+ end
+end method regex-search;
+
+// This has methods for group :: <string> and group :: <integer>.
+// Group zero is always the entire match.
+define sealed generic regex-match-group
+ (match :: <regex-match>, group :: <object>)
+ => (text :: false-or(<string>),
+ start-index :: false-or(<integer>),
+ end-index :: false-or(<integer>));
+
+// How many groups matched? There will always be at least one; the entire match.
+// (Maybe better to provide a way to iterate over the groups instead, but this
+// should be rarely used since you generally know what your groups are.)
+//
+define sealed generic regex-match-group-count
+ (match :: <regex-match>) => (count :: <integer>);
+
+
+define sealed class <match-group> (<object>)
+ constant slot group-text :: <string>,
+ required-init-keyword: text:;
+ constant slot group-start :: <integer>,
+ required-init-keyword: start:;
+ constant slot group-end :: <integer>,
+ required-init-keyword: end:;
+end class <match-group>;
+
+
+define sealed class <regex-match> (<object>)
+ // Groups by position. Zero is the entire match.
+ constant slot group-vector :: <stretchy-vector> = make(<stretchy-vector>);
+ // Maps group names to positions.
+ constant slot group-table :: <string-table> = make(<string-table>);
+end class <regex-match>;
+
+define method add-group
+ (match :: <regex-match>, group :: <match-group>,
+ #key name :: false-or(<string>))
+ => (match :: <regex-match>)
+ add!(match.group-vector, group);
+ if (name)
+ match.group-table[name] := group;
+ end;
+ match
+end;
+
+define sealed class <invalid-match-group> (<regex-error>)
+end class <invalid-match-group>;
+
+define method regex-match-group
+ (match :: <regex-match>, group :: <integer>)
+ => (text :: false-or(<string>),
+ start-index :: false-or(<integer>),
+ end-index :: false-or(<integer>))
+ if (0 <= group < match.group-vector.size)
+ match.group-vector[group]
+ else
+ signal(make(<invalid-match-group>,
+ format-string: "Match group index %d out of bounds. Max group index is %d.",
+ format-arguments: list(group, match.group-vector.size - 1)));
+ end;
+end method regex-match-group;
+
+define method regex-match-group
+ (match :: <regex-match>, group :: <string>)
+ => (text :: false-or(<string>),
+ start-index :: false-or(<integer>),
+ end-index :: false-or(<integer>))
+ let index = element(match.group-table, group, default: #f);
+ if (index)
+ regex-match-group(match, index)
+ else
+ signal(make(<invalid-match-group>,
+ format-string: "There is no group named %=.",
+ format-arguments: list(group)));
+ end;
+end method regex-match-group;
Modified: trunk/libraries/regular-expressions/tests/regular-expressions-test-suite.dylan
==============================================================================
--- trunk/libraries/regular-expressions/tests/regular-expressions-test-suite.dylan (original)
+++ trunk/libraries/regular-expressions/tests/regular-expressions-test-suite.dylan Mon Jun 18 03:27:19 2007
@@ -8,6 +8,10 @@
end function re/position;
define test atom-test ()
+ // In current code the empty string is an illegal regex, but Python
+ // (and probably perl?) allow it, so I think we should consider that
+ // a bug. --cgay
+ check-no-errors("atom-0", re/position("", ""));
check-equal("atom-1", re/position("a", "a"), #[0, 1]);
check-equal("atom-2", re/position("a", "[a]"), #[0, 1]);
check-equal("atom-3", re/position("ab", "(a)b"), #[0, 2, 0, 1]);
@@ -26,7 +30,7 @@
check-condition("atom-F", <illegal-regexp>, re/position("", "a{,"));
check-condition("atom-G", <illegal-regexp>, re/position("", "[a"));
check-condition("atom-H", <illegal-regexp>, re/position("", "\\"));
- check-equal("atom-tan", "\<65>", "A");
+ //check-equal("atom-tan", "\<44>\<79>\<6c>\<61>\<6e>", "Dylan");
end;
More information about the chatter
mailing list