[Gd-chatter] r11510 - in trunk/libraries/regular-expressions: . tests
cgay at gwydiondylan.org
cgay at gwydiondylan.org
Sun Dec 2 15:16:49 CET 2007
Author: cgay
Date: Sun Dec 2 15:16:47 2007
New Revision: 11510
Added:
trunk/libraries/regular-expressions/tests/pcre.dylan (contents, props changed)
Removed:
trunk/libraries/regular-expressions/tests/new-api-test-suite.dylan
trunk/libraries/regular-expressions/tests/old-api-test-suite.dylan
Modified:
trunk/libraries/regular-expressions/interface.dylan
trunk/libraries/regular-expressions/match.dylan
trunk/libraries/regular-expressions/od-library.dylan
trunk/libraries/regular-expressions/parse.dylan
trunk/libraries/regular-expressions/regex.dylan
trunk/libraries/regular-expressions/tests/library.dylan
trunk/libraries/regular-expressions/tests/regular-expressions-test-suite.dylan
trunk/libraries/regular-expressions/tests/regular-expressions-test-suite.lid
Log:
Bug: 7357
Got rid of the whole new/old API mess, and some API tweaks.
Modified: trunk/libraries/regular-expressions/interface.dylan
==============================================================================
--- trunk/libraries/regular-expressions/interface.dylan (original)
+++ trunk/libraries/regular-expressions/interface.dylan Sun Dec 2 15:16:47 2007
@@ -1,4 +1,4 @@
-module: regular-expressions-impl
+module: regular-expressions
author: Nick Kramer (nkramer at cs.cmu.edu)
synopsis: This provides a useable interface for users. Functions
defined outside this file are really too strange and quirky
@@ -32,25 +32,22 @@
//
//======================================================================
-// Functions that aren't exported are marked as such. Everything else
-// is exported.
-//
// There are quite a few make-fooer functions hanging around. Now
-// that regexp-position does caching, these are basically useless, but
+// that regex-position does caching, these are basically useless, but
// we've kept them around for backwards compatibility. Unfortunately,
// internally most of the functions are implemented in terms of
-// make-regexp-positioner. To minimize the amount of rewriting, I've
+// make-regex-positioner. To minimize the amount of rewriting, I've
// liberally applied seals and inline declarations so that
-// make-regexp-positioner won't clobber all type information. The
+// make-regex-positioner won't clobber all type information. The
// downside, of course, is that everything's sealed, but hey, no one
-// ever subclassed regexp-position anyway.
+// ever subclassed regex-position anyway.
// Caching
//
-// Parsing a regexp is not cheap, so we cache the parsed regexps and
+// Parsing a regex is not cheap, so we cache the parsed regexs and
// only parse a string if we haven't seen it before. Because in
-// practice almost all regexp strings are string literals, we're free
+// practice almost all regex strings are string literals, we're free
// to choose == or = depending on whatever's fastest. However,
// because a string is parsed differently depending on whether the
// search is case sensitive or not, we also have to keep track of that
@@ -63,116 +60,116 @@
// attributes such as case-sensitivity mid-parse, the way (I believe) perl
// does? --cgay
-// ### Currently, only regexp-position uses this cache, because the
-// other functions are still using make-regexp-positioner. With
-// caching, that make-regexp-whatever stuff should probably go.
+// ### Currently, only regex-position uses this cache, because the
+// other functions are still using make-regex-positioner. With
+// caching, that make-regex-whatever stuff should probably go.
// <cache-key> -- internal
//
-// What we use for keys in the *regexp-cache*.
+// What we use for keys in the *regex-cache*.
//
define class <cache-key> (<object>)
- constant slot regexp-string :: <string>,
- required-init-keyword: #"regexp-string";
+ constant slot regex-string :: <string>,
+ required-init-keyword: #"regex-string";
constant slot character-set-type :: <class>,
required-init-keyword: #"character-set-type";
end class <cache-key>;
// <cache-element> -- internal
//
-// What we use for elements in a *regexp-cache*
+// What we use for elements in a *regex-cache*
//
define class <cache-element> (<object>)
- constant slot parse-tree :: <parsed-regexp>,
+ constant slot parse-tree :: <parsed-regex>,
required-init-keyword: #"parse-tree";
constant slot last-group :: <integer>,
required-init-keyword: #"last-group";
end class <cache-element>;
-// <regexp-cache> -- internal
+// <regex-cache> -- internal
//
// Maps <cache-key> to <cache-element>. ### Ideally, we'd be using
// weak pointers to these strings. In practice, however, most of the
-// regexp strings are literals, so this isn't usually a drawback.
+// regex strings are literals, so this isn't usually a drawback.
//
// This used to compare strings with == rather than =, but this leaks
// lots of memory
//
-define class <regexp-cache> (<table>) end;
+define class <regex-cache> (<table>) end;
-// table-protocol{<regexp-cache>} -- method on imported G.F.
+// table-protocol{<regex-cache>} -- method on imported G.F.
//
-define method table-protocol (table :: <regexp-cache>)
+define method table-protocol (table :: <regex-cache>)
=> (equal? :: <function>, hash :: <function>);
values(method (key1 :: <cache-key>, key2 :: <cache-key>) // equal?
=> res :: <boolean>;
- key1.regexp-string = key2.regexp-string
+ key1.regex-string = key2.regex-string
& key1.character-set-type == key2.character-set-type;
end method,
method (key :: <cache-key>, initial-state) => (id :: <integer>, state); // hash()
- let (string-id, string-state) = object-hash(key.regexp-string, initial-state);
+ let (string-id, string-state) = object-hash(key.regex-string, initial-state);
let (set-type-id, set-type-state)
= object-hash(key.character-set-type, string-state);
values(merge-hash-ids(string-id, set-type-id, ordered: #t), set-type-state);
end method);
end method table-protocol;
-// *regexp-cache* -- internal
+// *regex-cache* -- internal
//
-// The only instance of <regexp-cache>. ### Not threadsafe.
+// The only instance of <regex-cache>. ### Not threadsafe.
//
// Technically not thread safe, but does it matter? Worst case seems to
-// be a duplicated regexp parse. --cgay
+// be a duplicated regex parse. --cgay
//
-define constant *regexp-cache* = make(<regexp-cache>);
+define constant *regex-cache* = make(<regex-cache>);
// parse-or-use-cached -- internal
//
-// Tries to use the cached version of the regexp, and if not possible,
+// Tries to use the cached version of the regex, and if not possible,
// parses it and adds it to the cache.
//
define inline function parse-or-use-cached
- (regexp :: <string>, parse-info :: <parse-info>)
- => (parsed-regexp :: <parsed-regexp>, last-group :: <integer>);
- let key = make(<cache-key>, regexp-string: regexp,
+ (regex :: <string>, parse-info :: <parse-info>)
+ => (parsed-regex :: <parsed-regex>, last-group :: <integer>);
+ let key = make(<cache-key>, regex-string: regex,
character-set-type: parse-info.set-type);
- let cached-value = element(*regexp-cache*, key, default: #f);
+ let cached-value = element(*regex-cache*, key, default: #f);
if (cached-value)
values(cached-value.parse-tree, cached-value.last-group);
else
- let (parsed-regexp, last-group) = parse(regexp, parse-info);
- *regexp-cache*[key] := make(<cache-element>,
- parse-tree: parsed-regexp,
+ let (parsed-regex, last-group) = parse(regex, parse-info);
+ *regex-cache*[key] := make(<cache-element>,
+ parse-tree: parsed-regex,
last-group: last-group);
- values(parsed-regexp, last-group);
+ values(parsed-regex, last-group);
end if;
end function parse-or-use-cached;
-// Regexp positioner stuff
+// regex positioner stuff
// Find the position of a regular expression inside a string. If the
-// regexp is not found, return #f, otherwise return a variable number
+// regex is not found, return #f, otherwise return a variable number
// of marks.
//
-define function regexp-position
- (big :: <string>, regexp :: <string>, #key start: big-start = 0,
+define function regex-position
+ (big :: <string>, regex :: <string>, #key start: big-start = 0,
end: big-end = #f, case-sensitive = #f)
- => (regexp-start :: false-or(<integer>), #rest marks :: false-or(<integer>));
+ => (regex-start :: false-or(<integer>), #rest marks :: false-or(<integer>));
let substring = make(<substring>, string: big, start: big-start,
end: big-end | big.size);
- let (parsed-regexp, last-group)
- = parse-or-use-cached(regexp, make-parse-info(case-sensitive: case-sensitive));
+ let (parsed-regex, last-group)
+ = parse-or-use-cached(regex, make-parse-info(case-sensitive: case-sensitive));
let (matched, marks)
- = if (parsed-regexp.is-anchored?)
- anchored-match-root?(parsed-regexp, substring, case-sensitive,
+ = if (parsed-regex.is-anchored?)
+ anchored-match-root?(parsed-regex, substring, case-sensitive,
last-group + 1, #f);
else
- let initial = parsed-regexp.initial-substring;
+ let initial = parsed-regex.initial-substring;
let searcher = ~initial.empty?
& make-substring-positioner(initial, case-sensitive: case-sensitive);
- match-root?(parsed-regexp, substring, case-sensitive, last-group + 1,
+ match-root?(parsed-regex, substring, case-sensitive, last-group + 1,
searcher);
end if;
if (matched)
@@ -180,31 +177,32 @@
else
#f
end if;
-end function regexp-position;
+end function regex-position;
// Once upon a time, this was how you interfaced to the NFA stuff
// (maximum-compile: #t). That's gone. Now it's just here for
// backwards compatibility. All keywords except case-sensitive are
// now ignored.
//
-define inline function make-regexp-positioner
- (regexp :: <string>,
+define inline function make-regex-positioner
+ (regex :: <string>,
#key byte-characters-only = #f, need-marks = #t, maximum-compile = #f,
case-sensitive = #f)
- => regexp-positioner :: <function>;
+ => regex-positioner :: <function>;
method (big :: <string>, #key start: big-start = 0,
end: big-end = #f)
- => (regexp-start :: false-or(<integer>),
+ => (regex-start :: false-or(<integer>),
#rest marks :: false-or(<integer>));
- regexp-position(big, regexp, case-sensitive: case-sensitive,
+ regex-position(big, regex, case-sensitive: case-sensitive,
start: big-start, end: big-end);
end method;
-end function make-regexp-positioner;
+end function make-regex-positioner;
// returns #f if no match, the matching string on match, and another string or #f
-// for each group in the regexp.
-define method regexp-match(big :: <string>, regex :: <string>) => (#rest results);
- let (#rest marks) = regexp-position(big, regex);
+// for each group in the regex.
+define method regex-match
+ (big :: <string>, regex :: <string>) => (#rest results);
+ let (#rest marks) = regex-position(big, regex);
let result = make(<stretchy-vector>);
if(marks[0])
@@ -223,26 +221,26 @@
// #if (have-free-time)
/*
-// regexp-matches -- exported
+// regex-matches -- exported
//
-// A more convenient form of regexp-position. Usually you want
+// A more convenient form of regex-position. Usually you want
// substrings that were matched by a group rather than the marks for
// the group. How you use this is you give the group numbers you
// want, and it'll give you the strings. (#f if that group wasn't
// matched)
//
-define function regexp-matches
- (big :: <string>, regexp :: <string>,
+define function regex-matches
+ (big :: <string>, regex :: <string>,
#key start: start-index :: <integer> = 0,
end: end-index :: false-or(<integer>),
case-sensitive :: <boolean> = #f,
groups :: false-or(<sequence>))
=> (#rest group-strings :: false-or(<string>));
if (~groups)
- error("Mandatory keyword groups: not used in call to regexp-matches");
+ error("Mandatory keyword groups: not used in call to regex-matches");
end if;
let (#rest marks)
- = regexp-position(big, regexp, start: start-index, end: end-index,
+ = regex-position(big, regex, start: start-index, end: end-index,
case-sensitive: case-sensitive);
let return-val = make(<vector>, size: groups.size, fill: #f);
for (index from 0 below return-val.size)
@@ -278,23 +276,23 @@
*/
-// Functions based on regexp-position
+// Functions based on regex-position
-define function regexp-replace
- (input :: <string>, regexp :: <string>, new-substring :: <string>,
+define function regex-replace
+ (input :: <string>, regex :: <string>, new-substring :: <string>,
#key count = #f, case-sensitive = #f, start = 0, end: input-end = #f)
=> changed-string :: <string>;
let positioner
- = make-regexp-positioner(regexp, case-sensitive: case-sensitive);
+ = make-regex-positioner(regex, case-sensitive: case-sensitive);
do-replacement(positioner, new-substring, input, start,
input-end, count, #t);
-end function regexp-replace;
+end function regex-replace;
-define inline function make-regexp-replacer
- (regexp :: <string>, #key replace-with, case-sensitive = #f)
+define inline function make-regex-replacer
+ (regex :: <string>, #key replace-with, case-sensitive = #f)
=> replacer :: <function>;
let positioner
- = make-regexp-positioner(regexp, case-sensitive: case-sensitive);
+ = make-regex-positioner(regex, case-sensitive: case-sensitive);
if (replace-with)
method (input :: <string>, #key count: count,
start = 0, end: input-end = #f)
@@ -310,141 +308,7 @@
start, input-end, count, #t);
end method;
end if;
-end function make-regexp-replacer;
-
-// equivalent of Perl's tr. Does a character by character translation.
-//
-define open generic translate
- (input :: <string>, from-set :: <string>, to-set :: <string>,
- #key delete, start, end: the-end)
- => output :: <string>;
-
-//The existing methods only work on byte-strings.
-//
-define method translate
- (input :: <byte-string>, from-set :: <byte-string>,
- to-set :: <byte-string>,
- #key delete: delete = #f, start = 0, end: input-end = #f)
- => output :: <byte-string>;
- let table = make-translation-table(from-set, to-set, delete: delete);
- run-translator(input, table, start, input-end | size(input));
-end method translate;
-
-define open generic make-translator
- (from-set :: <string>, to-set :: <string>, #key delete)
- => translator :: <function>;
-
-// Again, only byte-strings handled here
-//
-define method make-translator
- (from-set :: <byte-string>, to-set :: <byte-string>,
- #key delete: delete = #f)
- => translator :: <function>;
- let table = make-translation-table(from-set, to-set, delete: delete);
- method (input :: <byte-string>, #key start = 0, end: input-end = #f)
- => output :: <byte-string>;
- run-translator(input, table, start, input-end | size(input));
- end method;
-end method make-translator;
-
-// Used by translate. Not exported.
-//
-define function make-translation-table
- (from-set :: <byte-string>, to-set :: <byte-string>,
- #key delete: delete = #f)
- => table :: <byte-character-table>;
- let from-index = 0;
- let to-index = 0;
- let previous-from = #f;
- let previous-to = #f;
-
- // These local methods are identical except for the
- // choice of variable names and next-from-character signals end of
- // string rather than repeating the last character indefinitely like
- // next-to-character does.
- local method next-from-character ()
- if (from-index >= size(from-set))
- #f;
- elseif (from-set[from-index] = '\\')
- from-index := from-index + 2;
- previous-from := from-set[from-index - 1];
- elseif (from-set[from-index] = '-')
- if (previous-from = from-set[from-index + 1])
- from-index := from-index + 1;
- from-set[from-index];
- else
- previous-from := successor(previous-from);
- // and return that value
- end if;
- else
- from-index := from-index + 1;
- previous-from := from-set[from-index - 1];
- end if;
- end method next-from-character;
-
- local method next-to-character ()
- if (to-index >= size(to-set))
- if (delete) #f else last(to-set) end;
- elseif (to-set[to-index] = '\\')
- to-index := to-index + 2;
- previous-to := to-set[to-index - 1];
- elseif (to-set[to-index] = '-')
- if (previous-to = to-set[to-index + 1])
- to-index := to-index + 1;
- to-set[to-index];
- else
- previous-to := successor(previous-to);
- // and return that value
- end if;
- else
- to-index := to-index + 1;
- previous-to := to-set[to-index - 1];
- end if;
- end method next-to-character;
-
- let table = make(<byte-character-table>);
- // Wish I had keyed-by
- let (state, limit, next, done?, cur-key, cur-elem)
- = forward-iteration-protocol(table);
- for (st = state then next(table, st), until: done?(table, st, limit))
- let c = cur-key(table, st);
- table[c] := c;
- end for;
-
- for (from-char = next-from-character() then next-from-character(),
- to-char = next-to-character() then next-to-character(),
- until: from-char = #f)
- table[from-char] := to-char;
- end for;
-
- table;
-end function make-translation-table;
-
-// Used by translate. Not exported.
-//
-define function run-translator
- (source :: <byte-string>, table :: <byte-character-table>,
- start-index :: <integer>, end-index :: <integer>)
- => output :: <byte-string>;
- let dest-string = copy-sequence(source);
- let dest-index = start-index;
- for (source-index from start-index below end-index)
- let char = source[source-index];
- if (table[char] ~== #f)
- dest-string[dest-index] := table[char];
- dest-index := dest-index + 1;
- end if;
- end for;
-
- // Now resize dest-string, because deleting characters in the
- // translation would make dest-string shorter than we've
- // allocated.
- if (dest-index = end-index)
- dest-string;
- else
- replace-subsequence!(dest-string, "", start: dest-index, end: end-index);
- end if;
-end function run-translator;
+end function make-regex-replacer;
// Like Perl's split function
//
@@ -452,14 +316,14 @@
(pattern :: <string>, input :: <string>,
#key count = #f, remove-empty-items = #t, start = 0, end: input-end = #f)
=> (#rest whole-bunch-of-strings :: <string>);
- let positioner = make-regexp-positioner(pattern);
+ let positioner = make-regex-positioner(pattern);
split-string(positioner, input, start, input-end | size(input),
count, remove-empty-items);
end function split;
define inline function make-splitter
(pattern :: <string>) => splitter :: <function>;
- let positioner = make-regexp-positioner(pattern);
+ let positioner = make-regex-positioner(pattern);
method (string :: <string>, #key count = #f,
remove-empty-items = #t, start = 0, end: input-end = #f)
=> (#rest whole-bunch-of-strings :: <string>);
@@ -542,3 +406,5 @@
end if;
big-string;
end function join;
+
+
Modified: trunk/libraries/regular-expressions/match.dylan
==============================================================================
--- trunk/libraries/regular-expressions/match.dylan (original)
+++ trunk/libraries/regular-expressions/match.dylan Sun Dec 2 15:16:47 2007
@@ -1,4 +1,4 @@
-module: regular-expressions-impl
+module: regular-expressions
author: Nick Kramer (nkramer at cs.cmu.edu)
synopsis: This takes a parsed regular expression and tries to find a match
for it.
@@ -36,7 +36,7 @@
// Details of match:
// This whole thing is rather hairy. Basically, it creates a "path"
-// through the regexp parse tree that corresponds to a match of the
+// through the regex parse tree that corresponds to a match of the
// string. A path is a round trip through a parse tree that starts
// and ends at the root. The part of the path already travelled is the
// call stack, and hints about the untravelled part of the path are
@@ -58,10 +58,10 @@
// its own non-local exit so that it can try to match its part
// differently.
-// As an example, a <union> is "regexp #1 or regexp #2". When
+// As an example, a <union> is "regex #1 or regex #2". When
// descend-re(<union>...) is called, it'll set up a non-local exit and
-// then descend-re on regexp #1. If someone backtracks out of regexp
-// #1, descend-re(<union>) will try regexp #2. If someone backtracks
+// then descend-re on regex #1. If someone backtracks out of regex
+// #1, descend-re(<union>) will try regex #2. If someone backtracks
// out of that, descend-re(<union>) will give up and backtrack itself.
// When this chain of functions completes a match, it'll stumble upon
@@ -80,7 +80,7 @@
// Match-root?: Set things up and call descend-re.
//
define method match-root?
- (re :: <parsed-regexp>, target :: <substring>,
+ (re :: <parsed-regex>, target :: <substring>,
case-sensitive? :: <boolean>, num-groups :: <integer>,
searcher :: false-or(<function>))
=> (answer :: <boolean>, marks :: <sequence>);
@@ -107,7 +107,7 @@
block (fail)
descend-re(re, target, case-sensitive?, index,
marks, fail, list(up-proc));
- error("A regexp should either match or not match. Why did it "
+ error("A regex should either match or not match. Why did it "
"reach this piece of code?");
end block;
end for;
@@ -117,7 +117,7 @@
block (fail)
descend-re(re, target, case-sensitive?, index,
marks, fail, list(up-proc));
- error("A regexp should either match or not match. Why did "
+ error("A regex should either match or not match. Why did "
"it reach this piece of code?");
end block;
end for;
@@ -131,7 +131,7 @@
// starts with "^".
//
define method anchored-match-root?
- (re :: <parsed-regexp>, target :: <substring>,
+ (re :: <parsed-regex>, target :: <substring>,
case-sensitive? :: <boolean>, num-groups :: <integer>,
searcher :: false-or(<function>))
=> (answer :: <boolean>, marks :: <sequence>);
@@ -147,7 +147,7 @@
block (fail)
descend-re(re, target, case-sensitive?, target.start-index,
marks, fail, list(up-proc));
- error("A regexp should either match or not match. Why did it "
+ error("A regex should either match or not match. Why did it "
"reach this piece of code?");
end block;
values(#f); // Failure
@@ -156,7 +156,7 @@
end method anchored-match-root?;
define generic descend-re
- (re :: false-or(<parsed-regexp>), target :: <substring>,
+ (re :: false-or(<parsed-regex>), target :: <substring>,
case-sensitive? :: <boolean>, start-index :: <integer>,
marks :: <mutable-sequence>, backtrack-past-me :: <non-local-exit>,
up-list :: <list> /* of <non-local-exit> */) => ();
@@ -269,7 +269,7 @@
marks :: <mutable-sequence>, backtrack-past-me :: <non-local-exit>,
up-list :: <list>) => ();
local method descend-and-quantify (min :: <integer>, max,
- re :: <parsed-regexp>, index :: <integer>,
+ re :: <parsed-regex>, index :: <integer>,
backtrack-past-me :: <non-local-exit>,
up-list)
Modified: trunk/libraries/regular-expressions/od-library.dylan
==============================================================================
--- trunk/libraries/regular-expressions/od-library.dylan (original)
+++ trunk/libraries/regular-expressions/od-library.dylan Sun Dec 2 15:16:47 2007
@@ -31,52 +31,19 @@
//
//======================================================================
-
-// Added regex module with new API. --cgay, June 2007
+// Revamped. --cgay Dec 2007
define library regular-expressions
use common-dylan;
use string-extensions;
export
- regexp, // new API
- regular-expressions; // old API
-end library regular-expressions;
-
-define module regexp // new API module
- create
- compile-regexp,
- regexp-search,
- <regexp>,
- <regexp-error>,
- <invalid-regexp>,
- regexp-pattern,
- <regexp-match>, // results of a successful search
- regexp-match-group,
- regexp-match-groups,
- group-start,
- group-end,
- group-text,
- <invalid-match-group>;
-end module regexp;
-
-define module regular-expressions // old API module
- create
- regexp-position, make-regexp-positioner,
- regexp-match,
- regexp-replace, make-regexp-replacer,
- regexp-group-count,
- translate, make-translator,
- split, make-splitter,
- join,
- <illegal-regexp>;
- create
- split-string;
-end module regular-expressions;
+ regular-expressions;
+end;
-define module regular-expressions-impl
+define module regular-expressions
use common-dylan,
exclude: {
- split
+ split // todo -- just add a method to this one
};
use string-conversions;
use character-type;
@@ -84,7 +51,32 @@
use %do-replacement;
use %parse-string;
use substring-search;
- use regular-expressions; // old API module
- use regexp; // new API module
-end module regular-expressions-impl;
+ export
+ compile-regex,
+ <regex>,
+ regex-search,
+ regex-group-count,
+ regex-position,
+ make-regex-positioner,
+ regex-match, // todo -- rename to regex-search-strings?
+ regex-replace,
+ make-regex-replacer,
+ <regex-error>,
+ <invalid-regex>,
+ regex-pattern,
+ <regex-match>, // results of a successful search
+ <match-group>,
+ match-group,
+ match-groups,
+ group-start,
+ group-end,
+ group-text,
+ <invalid-match-group>,
+
+ split,
+ make-splitter,
+ join;
+ export
+ split-string; // ???
+end module regular-expressions;
Modified: trunk/libraries/regular-expressions/parse.dylan
==============================================================================
--- trunk/libraries/regular-expressions/parse.dylan (original)
+++ trunk/libraries/regular-expressions/parse.dylan Sun Dec 2 15:16:47 2007
@@ -1,4 +1,4 @@
-module: regular-expressions-impl
+module: regular-expressions
author: Nick Kramer (nkramer at cs.cmu.edu)
copyright: see below
@@ -31,7 +31,7 @@
// This is a program to parse regular expressions. The grammar I'm using is:
//
-// <regexp> ::= <alternative> | <alternative>|<regexp>
+// <regex> ::= <alternative> | <alternative>|<regex>
//
// <alternative> ::= <quantified-atom> | <quantified-atom><alternative>
//
@@ -42,7 +42,7 @@
//
// <atom> ::= <subpattern> | <extended-character>
//
-// <subpattern> ::= (<options> <regexp>)
+// <subpattern> ::= (<options> <regex>)
//
// <options> ::= ?: | ?P<name> | ?P=name | ?# | etc
//
@@ -53,51 +53,51 @@
// expression component. Match.dylan could go into an infinite loop
// if given this.
-define abstract class <parsed-regexp> (<object>)
-end class <parsed-regexp>;
+define abstract class <parsed-regex> (<object>)
+end class <parsed-regex>;
-define class <mark> (<parsed-regexp>)
- slot child :: <parsed-regexp>, required-init-keyword: #"child";
+define class <mark> (<parsed-regex>)
+ slot child :: <parsed-regex>, required-init-keyword: #"child";
constant slot group-number :: <integer>, required-init-keyword: #"group";
end class <mark>;
-// The root of the parsed regexp, i.e., this is what's returned by the parser.
-define class <regexp> (<mark>)
+// The root of the parsed regex, i.e., this is what's returned by the parser.
+define class <regex> (<mark>)
// exported
- constant slot regexp-pattern :: <string>,
+ constant slot regex-pattern :: <string>,
required-init-keyword: pattern:;
// exported
- constant slot regexp-group-count :: <integer>,
+ constant slot regex-group-count :: <integer>,
required-init-keyword: group-count:;
- // internal. This is only needed when making a <regexp-match> after
+ // internal. This is only needed when making a <regex-match> after
// a successful search.
constant slot group-number-to-name :: <table>,
required-init-keyword: group-number-to-name:;
-end class <regexp>;
+end class <regex>;
-define class <union> (<parsed-regexp>) // |
- slot left :: <parsed-regexp>, required-init-keyword: #"left";
- slot right :: <parsed-regexp>, required-init-keyword: #"right";
+define class <union> (<parsed-regex>) // |
+ slot left :: <parsed-regex>, required-init-keyword: #"left";
+ slot right :: <parsed-regex>, required-init-keyword: #"right";
end class <union>;
-define class <alternative> (<parsed-regexp>) // concatenation
- slot left :: <parsed-regexp>, required-init-keyword: #"left";
- slot right :: <parsed-regexp>, required-init-keyword: #"right";
+define class <alternative> (<parsed-regex>) // concatenation
+ slot left :: <parsed-regex>, required-init-keyword: #"left";
+ slot right :: <parsed-regex>, required-init-keyword: #"right";
end class <alternative>;
-define class <parsed-assertion> (<parsed-regexp>)
+define class <parsed-assertion> (<parsed-regex>)
constant slot asserts :: <symbol>, required-init-keyword: #"assertion";
end class <parsed-assertion>;
-define class <quantified-atom> (<parsed-regexp>)
- slot atom :: <parsed-regexp>, required-init-keyword: #"atom";
+define class <quantified-atom> (<parsed-regex>)
+ slot atom :: <parsed-regex>, required-init-keyword: #"atom";
constant slot min-matches :: <integer>, init-value: 0,
init-keyword: #"min";
constant slot max-matches :: false-or(<integer>), init-value: #f,
init-keyword: #"max";
end class <quantified-atom>;
-define abstract class <parsed-atom> (<parsed-regexp>)
+define abstract class <parsed-atom> (<parsed-regex>)
end class <parsed-atom>;
define class <parsed-character> (<parsed-atom>)
@@ -117,21 +117,21 @@
end class <parsed-backreference>;
// Note: I'm pretty sure <simple-error> won't work in GD. --cgay
-define class <regexp-error> (<simple-error>)
-end class <regexp-error>;
+define class <regex-error> (<simple-error>)
+end class <regex-error>;
-define class <illegal-regexp> (<regexp-error>)
- constant slot regexp-pattern :: <string>,
+define class <illegal-regex> (<regex-error>)
+ constant slot regex-pattern :: <string>,
required-init-keyword: #"pattern";
-end class <illegal-regexp>;
+end class <illegal-regex>;
-define sealed domain make (singleton(<illegal-regexp>));
-define sealed domain initialize (<illegal-regexp>);
+define sealed domain make (singleton(<illegal-regex>));
+define sealed domain initialize (<illegal-regex>);
define function parse-error
(pattern :: <string>, format-string :: <string>, #rest format-args)
let msg = apply(format-to-string, format-string, format-args);
- signal(make(<illegal-regexp>,
+ signal(make(<illegal-regex>,
format-string: "Invalid regular expression: %=. %s",
format-arguments: list(pattern, msg),
pattern: pattern));
@@ -143,16 +143,16 @@
else
thing
end;
- signal(make(<regexp-error>,
+ signal(make(<regex-error>,
format-string: "The %s is not yet implemented.",
format-arguments: list(thing)));
end;
-// <parse-info> contains some information about the current regexp
+// <parse-info> contains some information about the current regex
// being parsed.
//
define class <parse-info> (<object>)
- // Whether or not the function includes \1, \2, etc in the regexp.
+ // Whether or not the function includes \1, \2, etc in the regex.
// Name this has-backreferences, for consistency with the other slots.
// Add ? to all the has-* slots. --cgay
// Also, not sure why anyone cares about these three things.
@@ -184,36 +184,29 @@
define method has-named-group?
(info :: <parse-info>, name :: <string>)
- block (return)
- for (group-name in info.group-number-to-name)
- if (name = group-name)
- return(#t)
- end;
- end;
- #f
- end;
+ member?(name, info.group-number-to-name, test: \=)
end;
define method parse
- (regexp :: <string>, parse-info :: <parse-info>)
- => (parsed-regexp :: <parsed-regexp>,
+ (regex :: <string>, parse-info :: <parse-info>)
+ => (parsed-regex :: <parsed-regex>,
last-group :: <integer>,
backrefs? :: <boolean>,
alternatives? :: <boolean>,
quantifiers? :: <boolean>)
- let parse-string = make(<parse-string>, string: regexp);
- let child = parse-regexp(parse-string, parse-info);
- let parse-tree = make(<regexp>,
- pattern: regexp,
+ let parse-string = make(<parse-string>, string: regex);
+ let child = parse-regex(parse-string, parse-info);
+ let parse-tree = make(<regex>,
+ pattern: regex,
group: 0,
group-count: parse-info.current-group-number + 1,
group-number-to-name: parse-info.group-number-to-name,
child: child);
- let optimized-regexp = optimize(parse-tree);
- if (optimized-regexp.pathological?)
- parse-error(regexp, "A subpattern that matches the empty string was quantified.");
+ let optimized-regex = optimize(parse-tree);
+ if (optimized-regex.pathological?)
+ parse-error(regex, "A subpattern that matches the empty string was quantified.");
else
- values(optimized-regexp,
+ values(optimized-regex,
parse-info.current-group-number,
parse-info.backreference-used,
parse-info.has-alternatives,
@@ -221,22 +214,22 @@
end if;
end method parse;
-define method parse-regexp (s :: <parse-string>, info :: <parse-info>)
- => parsed-regexp :: <parsed-regexp>;
+define method parse-regex (s :: <parse-string>, info :: <parse-info>)
+ => parsed-regex :: <parsed-regex>;
let alternative = parse-alternative(s, info);
if (~alternative)
parse-error(s.parse-string, "");
elseif (lookahead(s) = '|')
info.has-alternatives := #t;
- make(<union>, left: alternative, right: parse-regexp(consume(s), info));
+ make(<union>, left: alternative, right: parse-regex(consume(s), info));
else
alternative;
end if;
-end method parse-regexp;
+end method parse-regex;
define method parse-alternative
(s :: <parse-string>, info :: <parse-info>)
- => (re :: false-or(<parsed-regexp>))
+ => (re :: false-or(<parsed-regex>))
let term = parse-quantified-atom(s, info);
if (member?(lookahead(s), #(#f, '|', ')')))
term;
@@ -246,7 +239,7 @@
end method parse-alternative;
define method parse-quantified-atom (s :: <parse-string>, info :: <parse-info>)
- => (result :: false-or(<parsed-regexp>))
+ => (result :: false-or(<parsed-regex>))
// I think this breaks when parse-atom returns #f and then we quantify that.
// I added some regexes to regression-tests.txt starting with /a()b/ that I
// hope will exercise that case. --cgay
@@ -281,7 +274,7 @@
// {m,n}, {m,}, {,n}, {m}, {}, and {,} are all valid.
// m defaults to 0 and n defaults to #f (unlimited).
define method parse-minmax-quantifier
- (atom :: <parsed-regexp>, s :: <parse-string>) => (qatom :: <quantified-atom>)
+ (atom :: <parsed-regex>, s :: <parse-string>) => (qatom :: <quantified-atom>)
local method parse-integer () => (int :: false-or(<integer>))
let digits = make(<deque>);
while (lookahead(s) & digit?(lookahead(s)))
@@ -308,7 +301,7 @@
end method parse-minmax-quantifier;
define method parse-atom (s :: <parse-string>, info :: <parse-info>)
- => (regexp :: false-or(<parsed-regexp>))
+ => (regex :: false-or(<parsed-regex>))
let char = lookahead(s);
select (char)
'(' =>
@@ -359,7 +352,7 @@
//
define inline function parse-group
(str :: <parse-string>, info :: <parse-info>)
- => (mark :: false-or(<parsed-regexp>))
+ => (mark :: false-or(<parsed-regex>))
let char = lookahead(str);
if (char == '?')
consume(str);
@@ -373,7 +366,7 @@
//
define inline function parse-extended-group
(str :: <parse-string>, info :: <parse-info>)
- => (mark :: false-or(<parsed-regexp>))
+ => (mark :: false-or(<parsed-regex>))
let char = lookahead(str);
consume(str);
select (char)
@@ -398,18 +391,18 @@
consume(str);
end;
if (~ lookahead(str))
- parse-error(str.parse-string, "Unterminated (?# comment.");
+ parse-error(str.parse-string, "Unterminated subpattern commend (?#....");
else
#f
end;
otherwise =>
// See the Python re docs for what all these do.
- if (member?(char, "iLmsux#=!<("))
- not-yet-implemented("'(?%c' subpattern construct", lookahead(str));
+ if (member?(char, "iLmsux=!<("))
+ not-yet-implemented("'(?%c' subpattern construct", char);
else
- parse-error(str.parse-string, "Invalid (? construct at index %s.",
- str.parse-index);
+ parse-error(str.parse-string, "Invalid subpattern construct (?%c...) at index %s.",
+ char, str.parse-index);
end;
end select
end function parse-extended-group;
@@ -440,18 +433,18 @@
info :: <parse-info>,
save-group? :: <boolean>,
group-name :: false-or(<string>))
- => (mark :: false-or(<parsed-regexp>))
+ => (mark :: false-or(<parsed-regex>))
if (save-group?)
info.current-group-number := info.current-group-number + 1;
end;
- let regexp = parse-regexp(str, info);
+ let regex = parse-regex(str, info);
if (lookahead(str) ~== ')')
- parse-error(str.parse-string, "Unbalanced parens in regexp (index = %s).",
+ parse-error(str.parse-string, "Unbalanced parens in regex (index = %s).",
str.parse-index);
else
consume(str);
if (~ save-group?)
- regexp
+ regex
else
if (group-name)
if (has-named-group?(info, group-name))
@@ -462,7 +455,7 @@
info.group-number-to-name[info.current-group-number] := group-name;
end;
end;
- make(<mark>, child: regexp, group: info.current-group-number)
+ make(<mark>, child: regex, group: info.current-group-number)
end
end
end function parse-simple-group;
@@ -544,7 +537,7 @@
//
define method parse-escaped-character
(s :: <parse-string>, info :: <parse-info>)
- => parsed-regexp :: <parsed-regexp>;
+ => parsed-regex :: <parsed-regex>;
let next-char = lookahead(s);
if (~next-char)
parse-error(s.parse-string,
@@ -583,36 +576,36 @@
end select;
end method parse-escaped-character;
-define method is-anchored? (regexp :: <parsed-regexp>)
+define method is-anchored? (regex :: <parsed-regex>)
=> (result :: <boolean>);
- select (regexp by instance?)
- <mark> => is-anchored?(regexp.child);
- <alternative> => is-anchored?(regexp.left);
- <parsed-assertion> => regexp.asserts == #"beginning-of-string";
+ select (regex by instance?)
+ <mark> => is-anchored?(regex.child);
+ <alternative> => is-anchored?(regex.left);
+ <parsed-assertion> => regex.asserts == #"beginning-of-string";
otherwise => #f;
end select;
end method is-anchored?;
-define method initial-substring (regexp :: <parsed-regexp>)
+define method initial-substring (regex :: <parsed-regex>)
=> (result :: <string>);
let result = make(<deque>);
- local method init (regexp :: <parsed-regexp>, result :: <deque>)
- select (regexp by instance?)
+ local method init (regex :: <parsed-regex>, result :: <deque>)
+ select (regex by instance?)
<alternative> =>
- init(regexp.left, result) & init(regexp.right, result);
+ init(regex.left, result) & init(regex.right, result);
<parsed-character> =>
- push-last(result, regexp.character);
+ push-last(result, regex.character);
<parsed-string> =>
- for (ch in regexp.string) push-last(result, ch) end for;
+ for (ch in regex.string) push-last(result, ch) end for;
<mark> =>
- init(regexp.child, result);
+ init(regex.child, result);
<parsed-assertion> =>
#t;
otherwise =>
#f;
end select;
end method init;
- init(regexp, result);
+ init(regex, result);
as(<byte-string>, result);
end method initial-substring;
@@ -620,17 +613,17 @@
// Currently the only optimization is merging adjacent characters into
// a string.
//
-define method optimize (regexp :: <parsed-regexp>)
- => (regexp :: <parsed-regexp>);
- select (regexp by instance?)
+define method optimize (regex :: <parsed-regex>)
+ => (regex :: <parsed-regex>);
+ select (regex by instance?)
<mark> =>
- regexp.child := optimize(regexp.child);
- regexp;
+ regex.child := optimize(regex.child);
+ regex;
<alternative> =>
- if (instance?(regexp.left, <parsed-character>))
+ if (instance?(regex.left, <parsed-character>))
let result-str = make(<deque>);
- push-last(result-str, regexp.left.character);
- for (next = regexp.right then next.right,
+ push-last(result-str, regex.left.character);
+ for (next = regex.right then next.right,
while: (instance?(next, <alternative>)
& instance?(next.left, <parsed-character>)))
push-last(result-str, next.left.character)
@@ -639,8 +632,8 @@
push-last(result-str, next.character);
make(<parsed-string>, string: as(<string>, result-str));
elseif (result-str.size = 1)
- regexp.right := optimize(regexp.right);
- regexp;
+ regex.right := optimize(regex.right);
+ regex;
else
make(<alternative>,
left: make(<parsed-string>, string: as(<string>, result-str)),
@@ -648,36 +641,36 @@
end if;
end for;
else
- regexp.left := optimize(regexp.left);
- regexp.right := optimize(regexp.right);
- regexp;
+ regex.left := optimize(regex.left);
+ regex.right := optimize(regex.right);
+ regex;
end if;
<union> =>
- regexp.left := optimize(regexp.left);
- regexp.right := optimize(regexp.right);
- regexp;
+ regex.left := optimize(regex.left);
+ regex.right := optimize(regex.right);
+ regex;
<quantified-atom> =>
- regexp.atom := optimize(regexp.atom);
- regexp;
+ regex.atom := optimize(regex.atom);
+ regex;
otherwise =>
- regexp;
+ regex;
end select;
end method optimize;
// We have to somehow deal with pathological regular expressions like
// ".**". Perl simply signals an error in this case. We *could* in
-// fact match these pathological regexps using the formulation below,
+// fact match these pathological regexs using the formulation below,
// but it doesn't seem worth the trouble. Frankly, I doubt anyone has
-// ever tried to use such a pathological regexp and *not* have done it
+// ever tried to use such a pathological regex and *not* have done it
// by mistake. But in case I'm wrong, here's how to fix a
-// pathological regexp:
+// pathological regex:
//
-// First, realize that pathological regexps stem from infinitely
+// First, realize that pathological regexs stem from infinitely
// quantifying subpatterns that could match the empty string. So what
// we do is find this subpattern, and perform the following
// transformation:
//
-// case (type of regexp)
+// case (type of regex)
// r1r2 => r1'r2|r2'
// r1|r2 => r1'|r2'
// r1{0,n} => r1'{1,n}
@@ -685,90 +678,90 @@
// atom => atom
// assertion => can't be done
//
-// This transformation turns a might-match-emptystring regexp into a
-// regexp that matches the same set of strings minus the empty string.
+// This transformation turns a might-match-emptystring regex into a
+// regex that matches the same set of strings minus the empty string.
// If this transformation can't be done, remember that "$*" is
// equivalent to "always true and consumes no input".
-define generic matches-empty-string? (regexp :: <parsed-regexp>)
+define generic matches-empty-string? (regex :: <parsed-regex>)
=> answer :: <boolean>;
-define method matches-empty-string? (regexp :: <parsed-atom>)
+define method matches-empty-string? (regex :: <parsed-atom>)
=> answer :: <boolean>;
#f;
end method matches-empty-string?;
-define method matches-empty-string? (regexp :: <parsed-assertion>)
+define method matches-empty-string? (regex :: <parsed-assertion>)
=> answer :: <boolean>;
#t;
end method matches-empty-string?;
-define method matches-empty-string? (regexp :: <mark>)
+define method matches-empty-string? (regex :: <mark>)
=> answer :: <boolean>;
- regexp.child.matches-empty-string?;
+ regex.child.matches-empty-string?;
end method matches-empty-string?;
-define method matches-empty-string? (regexp :: <union>)
+define method matches-empty-string? (regex :: <union>)
=> answer :: <boolean>;
- regexp.left.matches-empty-string? | regexp.right.matches-empty-string?;
+ regex.left.matches-empty-string? | regex.right.matches-empty-string?;
end method matches-empty-string?;
-define method matches-empty-string? (regexp :: <alternative>)
+define method matches-empty-string? (regex :: <alternative>)
=> answer :: <boolean>;
- regexp.left.matches-empty-string? & regexp.right.matches-empty-string?;
+ regex.left.matches-empty-string? & regex.right.matches-empty-string?;
end method matches-empty-string?;
-define method matches-empty-string? (regexp :: <quantified-atom>)
+define method matches-empty-string? (regex :: <quantified-atom>)
=> answer :: <boolean>;
- regexp.min-matches == 0 | regexp.atom.matches-empty-string?;
+ regex.min-matches == 0 | regex.atom.matches-empty-string?;
end method matches-empty-string?;
-define generic pathological? (regexp :: <parsed-regexp>)
+define generic pathological? (regex :: <parsed-regex>)
=> answer :: <boolean>;
-define method pathological? (regexp :: <parsed-atom>)
+define method pathological? (regex :: <parsed-atom>)
=> answer :: <boolean>;
#f;
end method pathological?;
-define method pathological? (regexp :: <parsed-assertion>)
+define method pathological? (regex :: <parsed-assertion>)
=> answer :: <boolean>;
#f;
end method pathological?;
-define method pathological? (regexp :: <mark>)
+define method pathological? (regex :: <mark>)
=> answer :: <boolean>;
- regexp.child.pathological?;
+ regex.child.pathological?;
end method pathological?;
-define method pathological? (regexp :: <union>)
+define method pathological? (regex :: <union>)
=> answer :: <boolean>;
- regexp.left.pathological? | regexp.right.pathological?;
+ regex.left.pathological? | regex.right.pathological?;
end method pathological?;
-define method pathological? (regexp :: <alternative>)
+define method pathological? (regex :: <alternative>)
=> answer :: <boolean>;
- regexp.left.pathological? | regexp.right.pathological?;
+ regex.left.pathological? | regex.right.pathological?;
end method pathological?;
-define method pathological? (regexp :: <quantified-atom>)
+define method pathological? (regex :: <quantified-atom>)
=> answer :: <boolean>;
- regexp.max-matches == #f & regexp.atom.matches-empty-string?;
+ regex.max-matches == #f & regex.atom.matches-empty-string?;
end method pathological?;
// Seals for file parse.dylan
-// <mark> -- subclass of <parsed-regexp>
+// <mark> -- subclass of <parsed-regex>
define sealed domain make(singleton(<mark>));
-// <union> -- subclass of <parsed-regexp>
+// <union> -- subclass of <parsed-regex>
define sealed domain make(singleton(<union>));
-// <alternative> -- subclass of <parsed-regexp>
+// <alternative> -- subclass of <parsed-regex>
define sealed domain make(singleton(<alternative>));
-// <parsed-assertion> -- subclass of <parsed-regexp>
+// <parsed-assertion> -- subclass of <parsed-regex>
define sealed domain make(singleton(<parsed-assertion>));
-// <quantified-atom> -- subclass of <parsed-regexp>
+// <quantified-atom> -- subclass of <parsed-regex>
define sealed domain make(singleton(<quantified-atom>));
// <parsed-character> -- subclass of <parsed-atom>
define sealed domain make(singleton(<parsed-character>));
Modified: trunk/libraries/regular-expressions/regex.dylan
==============================================================================
--- trunk/libraries/regular-expressions/regex.dylan (original)
+++ trunk/libraries/regular-expressions/regex.dylan Sun Dec 2 15:16:47 2007
@@ -1,4 +1,4 @@
-Module: regular-expressions-impl
+Module: regular-expressions
Author: Carl Gay
Synopsis: A new API for the regular-expressions library
@@ -7,7 +7,7 @@
-define constant <invalid-regexp> = <illegal-regexp>;
+define constant <invalid-regex> = <illegal-regex>;
// Compile the given string into an optimized regular expression.
@@ -26,77 +26,77 @@
// @param dot-matches-all -- Normally '.' matches any character except for
// newline. If this parameter is true '.' matches newline as well.
//
-// This function signals <invalid-regexp> if the regular expression is invalid.
+// This function signals <invalid-regex> if the regular expression is invalid.
//
-define sealed generic compile-regexp
+define sealed generic compile-regex
(string :: <string>,
#key case-sensitive :: <boolean> = #t,
verbose :: <boolean> = #f,
multi-line :: <boolean> = #f,
dot-matches-all :: <boolean> = #f)
- => (regexp :: <regexp>);
+ => (regex :: <regex>);
-define method compile-regexp
+define method compile-regex
(string :: <string>,
#key case-sensitive :: <boolean> = #t,
verbose :: <boolean> = #f,
multi-line :: <boolean> = #f,
dot-matches-all :: <boolean> = #f)
- => (regexp :: <regexp>)
+ => (regex :: <regex>)
parse(string,
make-parse-info(case-sensitive: case-sensitive,
verbose: verbose,
multi-line: multi-line,
dot-matches-all: dot-matches-all))
-end method compile-regexp;
+end method compile-regex;
-// Returns a <regexp-match> containing info about a successful match, or #f if
+// Returns a <regex-match> containing info about a successful match, or #f if
// no match was found.
//
// @param big -- The string in which to search.
-// @param pattern -- The pattern to search for. If not a <regexp>, it will be
-// compiled first with compile-regexp (implying that <invalid-regexp> may be
+// @param pattern -- The pattern to search for. If not a <regex>, it will be
+// compiled first with compile-regex (implying that <invalid-regex> may be
// signalled), using the defaults for the keyword arguments. If you wish
-// to override them, call compile-regexp directly.
+// to override them, call compile-regex directly.
// @param anchored -- Whether or not the search should be anchored at the start
// position. This is useful because "^..." will only match at the beginning
-// of a string, or after \n if the regexp was compiled with multi-line = #t.
+// of a string, or after \n if the regex was compiled with multi-line = #t.
// @param start -- Where to begin the search.
// @param end -- Where to stop searching.
//
// todo -- Should $ anchor at the provided end position or at the end of the string?
//
-define sealed generic regexp-search
+define sealed generic regex-search
(big :: <string>, pattern :: <object>,
#key anchored :: <boolean> = #f,
start :: <integer> = 0,
end: _end :: <integer> = big.size)
- => (match :: false-or(<regexp-match>));
+ => (match :: false-or(<regex-match>));
-define method regexp-search
+define method regex-search
(big :: <string>, pattern :: <string>,
#key anchored :: <boolean> = #f,
start :: <integer> = 0,
end: _end :: <integer> = big.size)
- => (match :: false-or(<regexp-match>))
- regexp-search(big, compile-regexp(pattern),
+ => (match :: false-or(<regex-match>))
+ regex-search(big, compile-regex(pattern),
anchored: anchored, start: start, end: _end)
-end method regexp-search;
+end method regex-search;
-define method regexp-search
- (big :: <string>, pattern :: <regexp>,
+define method regex-search
+ (big :: <string>, pattern :: <regex>,
#key anchored :: <boolean> = #f,
start :: <integer> = 0,
end: _end :: <integer> = big.size)
- => (match :: false-or(<regexp-match>))
- // Copied from regexp-position with some mods to match our interface.
- // Unlike regexp-position there is no caching. If you don't want to
- // recompile your regexp each time, compile it explicitly with compile-regexp
+ => (match :: false-or(<regex-match>))
+ // Copied from regex-position with some mods to match our interface.
+ // Unlike regex-position there is no caching. If you don't want to
+ // recompile your regex each time, compile it explicitly with compile-regex
// and save it.
let substring = make(<substring>, string: big, start: start, end: _end);
let case-sensitive? = #t;
- let num-groups = pattern.regexp-group-count;
+ let num-groups = pattern.regex-group-count;
let (matched?, marks)
= if (pattern.is-anchored?)
anchored-match-root?(pattern, substring, case-sensitive?, num-groups, #f);
@@ -107,7 +107,7 @@
match-root?(pattern, substring, case-sensitive?, num-groups, searcher);
end if;
if (matched?)
- let regexp-match = make(<regexp-match>, regular-expression: pattern);
+ let regex-match = make(<regex-match>, regular-expression: pattern);
let group-number-to-name :: <table> = pattern.group-number-to-name;
for (index from 0 below marks.size by 2)
let group-number = floor/(index, 2);
@@ -120,35 +120,27 @@
let text = copy-sequence(substring.entire-string,
start: substring.start-index + bpos,
end: substring.start-index + epos);
- add-group(regexp-match,
+ add-group(regex-match,
make(<match-group>, text: text, start: bpos, end: epos),
group-name);
else
// This group wasn't matched.
- add-group(regexp-match, #f, group-name);
+ add-group(regex-match, #f, group-name);
end;
end;
- regexp-match
+ regex-match
else
#f
end
-end method regexp-search;
-
-// This has methods for group :: <string> and group :: <integer>.
-// Group zero is always the entire match.
-define sealed generic regexp-match-group
- (match :: <regexp-match>, group :: <object>)
- => (text :: false-or(<string>),
- start-index :: false-or(<integer>),
- end-index :: false-or(<integer>));
+end method regex-search;
// Get the groups for the match. There will always be at least one; the entire match.
//
-define sealed generic regexp-match-groups
- (match :: <regexp-match>) => (groups :: <sequence>);
+define sealed generic match-groups
+ (match :: <regex-match>) => (groups :: <sequence>);
-define method regexp-match-groups
- (match :: <regexp-match>) => (groups :: <sequence>)
+define method match-groups
+ (match :: <regex-match>) => (groups :: <sequence>)
map-as(<simple-object-vector>, identity, match.groups-by-position)
end;
@@ -161,20 +153,20 @@
required-init-keyword: end:;
end class <match-group>;
-define sealed class <regexp-match> (<object>)
+define sealed class <regex-match> (<object>)
// Groups by position. Zero is the entire match.
constant slot groups-by-position :: <stretchy-vector> = make(<stretchy-vector>);
// Named groups, if any. Initial size 0 on the assumption that most regular
// expressions won't use named groups.
constant slot groups-by-name :: <string-table> = make(<string-table>, size: 0);
- constant slot regular-expression :: <regexp>, required-init-keyword: regular-expression:;
-end class <regexp-match>;
+ constant slot regular-expression :: <regex>, required-init-keyword: regular-expression:;
+end class <regex-match>;
define method add-group
- (match :: <regexp-match>,
+ (match :: <regex-match>,
group :: false-or(<match-group>),
name :: false-or(<string>))
- => (match :: <regexp-match>)
+ => (match :: <regex-match>)
add!(match.groups-by-position, group);
if (name)
match.groups-by-name[name] := group;
@@ -182,11 +174,19 @@
match
end;
-define sealed class <invalid-match-group> (<regexp-error>)
+define sealed class <invalid-match-group> (<regex-error>)
end class <invalid-match-group>;
-define method regexp-match-group
- (match :: <regexp-match>, group-number :: <integer>)
+// This has methods for group :: <string> and group :: <integer>.
+// Group zero is always the entire match.
+define sealed generic match-group
+ (match :: <regex-match>, group :: <object>)
+ => (text :: false-or(<string>),
+ start-index :: false-or(<integer>),
+ end-index :: false-or(<integer>));
+
+define method match-group
+ (match :: <regex-match>, group-number :: <integer>)
=> (text :: false-or(<string>),
start-index :: false-or(<integer>),
end-index :: false-or(<integer>))
@@ -202,17 +202,17 @@
signal(make(<invalid-match-group>,
format-string: "Group number %d is out of bounds for regex %s match. %s",
format-arguments: list(group-number,
- match.regular-expression.regexp-pattern,
+ match.regular-expression.regex-pattern,
if (ng == 1)
"There is only 1 group."
else
format-to-string("There are %d groups.", ng)
end)));
end;
-end method regexp-match-group;
+end method match-group;
-define method regexp-match-group
- (match :: <regexp-match>, group :: <string>)
+define method match-group
+ (match :: <regex-match>, group :: <string>)
=> (text :: false-or(<string>),
start-index :: false-or(<integer>),
end-index :: false-or(<integer>))
@@ -224,4 +224,4 @@
format-string: "There is no group named %=.",
format-arguments: list(group)));
end
-end method regexp-match-group;
+end method match-group;
Modified: trunk/libraries/regular-expressions/tests/library.dylan
==============================================================================
--- trunk/libraries/regular-expressions/tests/library.dylan (original)
+++ trunk/libraries/regular-expressions/tests/library.dylan Sun Dec 2 15:16:47 2007
@@ -6,11 +6,7 @@
import: {
streams
};
- use regular-expressions,
- import: {
- regular-expressions,
- regexp
- };
+ use regular-expressions;
use system,
import: {
file-system,
@@ -23,24 +19,12 @@
regular-expressions-test-suite;
end library regular-expressions-test-suite;
-define module old-api-test-suite
- use common-dylan,
- exclude: {
- split
- };
- use regular-expressions;
- use testworks;
- export
- //pcre-test-suite,
- old-api-test-suite;
-end module old-api-test-suite;
-
-define module new-api-test-suite
+define module regular-expressions-test-suite
use common-dylan,
exclude: {
split
};
- use regexp;
+ use regular-expressions;
use file-system;
use locators,
import: {
@@ -58,14 +42,6 @@
import: {
trim
};
- export
- new-api-test-suite;
-end module new-api-test-suite;
-
-define module regular-expressions-test-suite
- use testworks;
- use old-api-test-suite;
- use new-api-test-suite;
export regular-expressions-test-suite;
end module regular-expressions-test-suite;
Added: trunk/libraries/regular-expressions/tests/pcre.dylan
==============================================================================
--- (empty file)
+++ trunk/libraries/regular-expressions/tests/pcre.dylan Sun Dec 2 15:16:47 2007
@@ -0,0 +1,238 @@
+module: regular-expressions-test-suite
+
+//// Tests based on PCRE test output files
+
+// todo -- why does this fail to match anything if it has a leading space???
+define constant $group-regex = compile-regex("[0-9]: (.*)|No match");
+define constant $group-index-of-what-pcre-matched = 1;
+
+define function run-pcre-checks
+ (pathname :: <pathname>) => ()
+ with-open-file(stream = pathname, direction: #"input")
+ let line = #f;
+ let line-number = 0;
+ let lines = make(<stretchy-vector>);
+ while (line := read-line(stream, on-end-of-stream: #f))
+ line-number := line-number + 1;
+ line := trim(line, from: #"left");
+ if (empty?(line))
+ // A section break is always preceded by a group result or "No match".
+ // Some multi-line regular expression patterns have empty lines and we
+ // don't want to think that's the end of the section.
+ if (lines.size > 0
+ & regex-search(lines[lines.size - 1], $group-regex))
+ check-pcre-section(make(<section>,
+ lines: lines,
+ start-line-number: line-number - lines.size));
+ lines := make(<stretchy-vector>);
+ elseif (lines.size < 3)
+ // A section must have a regex, a test string and one group result to make
+ // any sense. Note we don't add the line to the section here.
+ test-output("Line %d: empty line in first 3 lines of a section.\n",
+ line-number);
+ else
+ add!(lines, line);
+ end;
+ else
+ add!(lines, line);
+ end;
+ end while;
+ end;
+end function run-pcre-checks;
+
+define class <section> (<object>)
+ constant slot section-lines :: <sequence>, required-init-keyword: #"lines";
+ constant slot start-line-number :: <integer>, required-init-keyword: #"start-line-number";
+ slot %index :: <integer> = 0;
+end class <section>;
+
+define method consume-line
+ (section :: <section>) => (line :: false-or(<string>))
+ let index = section.%index;
+ let lines = section.section-lines;
+ if (index < lines.size)
+ section.%index := index + 1;
+ lines[index]
+ end
+end method consume-line;
+
+/* Was going to be used for better check names.
+define method line-number
+ (section :: <section>) => (line-number :: <integer>)
+ section.start-line-number + section.%index
+end method line-number;
+*/
+
+define method peek-line
+ (section :: <section>) => (line :: false-or(<string>))
+ if (section.%index < section.section-lines.size)
+ section.section-lines[section.%index]
+ end
+end method peek-line;
+
+/* Example section:
+ /([\da-f:]+)$/i // pattern and flags
+ 0abc // test string
+ 0: 0abc // group 0
+ 1: 0abc // group 1
+ 0zzz // test string
+ No match // no match
+*/
+define function check-pcre-section
+ (section :: <section>)
+ let regex = parse-pcre-regex(section);
+ // If the section has fewer than 3 lines (a regex, a test string and at least
+ // one group result) then all we do is try to compile it (above).
+ if (section.section-lines.size >= 3)
+ while (peek-line(section))
+ let test-string = consume-line(section);
+ //test-output(" test string: %s\n", test-string);
+ let group-strings = make(<stretchy-vector>);
+ block (done-with-this-test-string)
+ while (#t)
+ let line = peek-line(section);
+ let match = line & regex-search(line, $group-regex);
+ if (match)
+ consume-line(section);
+ let group-text = match-group(match, $group-index-of-what-pcre-matched);
+ //test-output(" pcre group: %s\n", group-text | "No match");
+ if (group-text)
+ add!(group-strings, group-text);
+ else
+ assert(match-group(match, 0) = "No match",
+ "previous line was 'No match'");
+ done-with-this-test-string();
+ end;
+ else
+ done-with-this-test-string();
+ end;
+ end;
+ end;
+ if (regex)
+ check-no-errors(format-to-string("search for %s in %s",
+ test-string, regex.regex-pattern),
+ regex-search(test-string, regex));
+ let match = block ()
+ regex-search(test-string, regex)
+ exception (ex :: <error>)
+ #f
+ end;
+ if (match)
+ compare-to-pcre-results(regex.regex-pattern, test-string, match, group-strings);
+ end;
+ end if;
+ end while;
+ end if;
+end function check-pcre-section;
+
+define function parse-pcre-regex
+ (section :: <section>)
+ => (regex :: false-or(<regex>))
+ local method find-last (string, char)
+ // position(string, char, from-end: #t)
+ block (break)
+ for (i from string.size - 1 to 0 by -1)
+ if (string[i] == char)
+ break(i);
+ end;
+ end;
+ end;
+ end method find-last;
+ // This is imprecise. If the pattern spans multiple lines this will fail
+ // if any but the first and last lines contain the regex delimiter character.
+ // Might be nice to add a function to the regular-expressions module to read
+ // a perl regexp from a stream.
+ local method read-pattern-and-flags ()
+ let pnf = consume-line(section);
+ let delim = pnf[0];
+ if (find-last(pnf, delim) == 0)
+ // it's a multi-line regex
+ while(peek-line(section) & ~find-last(peek-line(section), delim))
+ pnf := concatenate(pnf, "\n", consume-line(section));
+ end;
+ if (peek-line(section))
+ pnf := concatenate(pnf, "\n", consume-line(section));
+ end;
+ end;
+ let end-delim = find-last(pnf, delim);
+ let flags = copy-sequence(pnf, start: end-delim + 1);
+ let pattern = copy-sequence(pnf, start: 1, end: end-delim);
+ values(pattern, flags)
+ end method read-pattern-and-flags;
+ let (pattern, flags) = read-pattern-and-flags();
+ //test-output("pattern: %s (flags = %s)\n", pattern, flags);
+ for (flag in flags)
+ check-true(format-to-string("For regex %s, flag %s is recognized",
+ pattern, flag),
+ member?(flag, "ixms"));
+ end for;
+ block ()
+ compile-regex(pattern,
+ case-sensitive: ~ member?('i', flags),
+ verbose: member?('x', flags),
+ multi-line: member?('m', flags),
+ dot-matches-all: member?('s', flags))
+ // Unfortunately we can't catch <regex-error> here because the charset
+ // parser is in string-extensions and signals <invalid-character-set-description>
+ // which isn't related to <regex-error> (and isn't even exported).
+ exception (ex :: <error>)
+ check-true(format-to-string("can compile regex %s", pattern), #f);
+ //test-output(" ERROR: %s\n", ex);
+ #f
+ end block
+end function parse-pcre-regex;
+
+/*
+ * pcre-groups is a sequence of strings where the nth element represents
+ * the nth group in the pcre regex match. If pcre-groups is empty then
+ * there was no match.
+ */
+define function compare-to-pcre-results
+ (pattern :: <string>,
+ test-string :: <string>,
+ match :: false-or(<regex-match>),
+ pcre-groups :: <sequence>)
+ => ()
+ if (match)
+ check-equal(format-to-string("Match %s against %s -- same # of groups",
+ test-string, pattern),
+ size(match-groups(match)),
+ pcre-groups.size);
+ for (group-number from 0,
+ pcre-group in pcre-groups)
+ // Adding block/exception here causes an infinite loop.
+ // Could it be related to using the Visual Studio 8 linker?
+ // The if also causes an infinite loop. Hmmm.
+ let our-group = /* if (group-number < size(match-groups(match))) */
+ match-group(match, group-number)
+ /* end */;
+ check-equal(format-to-string("Match %s against %s -- group %d is the same",
+ test-string, pattern, group-number),
+ our-group,
+ pcre-group);
+ end;
+ else
+ check-equal(format-to-string("Pattern %s doesn't match test string %s",
+ pattern, test-string),
+ 0,
+ pcre-groups.size);
+ end if;
+end function compare-to-pcre-results;
+
+define function make-pcre-locator
+ (filename :: <string>) => (locator :: <file-locator>)
+ let source-directory = environment-variable("OPEN_DYLAN_USER_SOURCES");
+ if (source-directory)
+ let dir = subdirectory-locator(as(<directory-locator>, source-directory),
+ "libraries",
+ "regular-expressions",
+ "tests");
+ make(<file-locator>, directory: dir, name: filename)
+ else
+ signal(make(<simple-error>,
+ format-string: "pcre-test requires the OPEN_DYLAN_USER_SOURCES environment "
+ "variable to be set to the root of your Dylan sources."));
+ end
+end function make-pcre-locator;
+
+
Modified: trunk/libraries/regular-expressions/tests/regular-expressions-test-suite.dylan
==============================================================================
--- trunk/libraries/regular-expressions/tests/regular-expressions-test-suite.dylan (original)
+++ trunk/libraries/regular-expressions/tests/regular-expressions-test-suite.dylan Sun Dec 2 15:16:47 2007
@@ -1,8 +1,84 @@
Module: regular-expressions-test-suite
Author: Carl Gay
+define function re/position (string, pattern, #rest args)
+ let (#rest marks) = apply(regex-position, string, pattern, args);
+ marks
+end function re/position;
+
+define test atom-test ()
+ // In current code the empty string is an illegal regex, but Python
+ // (and probably perl?) allow it, so I think we should consider that
+ // a bug. --cgay
+ check-no-errors("atom-0", re/position("", ""));
+ check-equal("atom-1", re/position("a", "a"), #[0, 1]);
+ check-equal("atom-2", re/position("a", "[a]"), #[0, 1]);
+ check-equal("atom-3", re/position("ab", "(a)b"), #[0, 2, 0, 1]);
+ check-equal("atom-4", re/position("a", "\\w"), #[0, 1]);
+ check-equal("atom-5", re/position("a", "."), #[0, 1]);
+ check-equal("atom-6", re/position("a", "a{0}"), #[0, 0]);
+ check-equal("atom-7", re/position("aa", "a{2}"), #[0, 2]);
+ check-equal("atom-8", re/position("aa", "a{1,}"), #[0, 2]);
+ check-equal("atom-9", re/position("aaa", "a{1,8}"), #[0, 3]);
+ check-equal("atom-A", re/position("", "a{,}"), #[0, 0]);
+ check-equal("atom-A1", re/position("aaaaaa", "a{,}"), #[0, 6]);
+ check-condition("atom-B", <invalid-regex>, re/position("", "a{m,n}"));
+ check-condition("atom-C", <invalid-regex>, re/position("", "a{m,}"));
+ check-condition("atom-D", <invalid-regex>, re/position("", "a{,n}"));
+ check-condition("atom-E", <invalid-regex>, re/position("", "a{m}"));
+ check-condition("atom-F", <invalid-regex>, re/position("", "a{,"));
+ check-condition("atom-G", <invalid-regex>, re/position("", "[a"));
+ check-condition("atom-H", <invalid-regex>, re/position("", "\\"));
+ //check-equal("atom-tan", "\<44>\<79>\<6c>\<61>\<6e>", "Dylan");
+end;
+
+// These should all compile.
+//
+define test good-regex-test ()
+ let patterns = #(
+ "",
+ "a()b",
+ "a(?#blah)b"
+ );
+ for (pattern in patterns)
+ check-no-errors(format-to-string("Regex '%s' compiles", pattern),
+ compile-regex(pattern));
+ end;
+end test good-regex-test;
+
+// All these regexes should signal <invalid-regex> on compilation.
+//
+define test bad-regex-test ()
+ let patterns = #(
+ "(?P<name>x)(?P<name>y)", // can't use same name twice
+ "(?@abc)" // invalid extended character '@'
+ );
+ for (pattern in patterns)
+ check-condition(format-to-string("Compiling '%s' gets an error", pattern),
+ <invalid-regex>,
+ compile-regex(pattern));
+ end;
+end test bad-regex-test;
+
+define test pcre-testoutput1 ()
+ run-pcre-checks(make-pcre-locator("pcre-testoutput1.txt"));
+end;
+
+define suite pcre-test-suite ()
+ test pcre-testoutput1;
+end;
+
+define test regressions-test ()
+ run-pcre-checks(make-pcre-locator("regression-tests.txt"));
+end;
define suite regular-expressions-test-suite ()
- suite old-api-test-suite;
- suite new-api-test-suite;
+ test atom-test;
+ test good-regex-test;
+ test bad-regex-test;
+ test regressions-test;
+ // For now leaving out the PCRE tests because they're too verbose.
+ // Once more basic stuff is working will start including them again.
+ // suite pcre-test-suite;
end;
+
Modified: trunk/libraries/regular-expressions/tests/regular-expressions-test-suite.lid
==============================================================================
--- trunk/libraries/regular-expressions/tests/regular-expressions-test-suite.lid (original)
+++ trunk/libraries/regular-expressions/tests/regular-expressions-test-suite.lid Sun Dec 2 15:16:47 2007
@@ -1,5 +1,4 @@
library: regular-expressions-test-suite
files: library
+ pcre
regular-expressions-test-suite
- old-api-test-suite
- new-api-test-suite
More information about the chatter
mailing list