[Gd-chatter] r11646 - in trunk/libraries/strings: . tests
cgay at gwydiondylan.org
cgay at gwydiondylan.org
Sun Jan 20 13:36:11 CET 2008
Author: cgay
Date: Sun Jan 20 13:36:09 2008
New Revision: 11646
Modified:
trunk/libraries/strings/library.dylan
trunk/libraries/strings/strings.dylan
trunk/libraries/strings/tests/strings-test-suite.dylan
Log:
job: minor
New split implementation, in "strings" library for now. Haven't tested
the regex code yet.
Modified: trunk/libraries/strings/library.dylan
==============================================================================
--- trunk/libraries/strings/library.dylan (original)
+++ trunk/libraries/strings/library.dylan Sun Jan 20 13:36:09 2008
@@ -58,7 +58,7 @@
create
substring,
join,
- split, splitf,
+ split,
trim,
replace,
replace!,
Modified: trunk/libraries/strings/strings.dylan
==============================================================================
--- trunk/libraries/strings/strings.dylan (original)
+++ trunk/libraries/strings/strings.dylan Sun Jan 20 13:36:09 2008
@@ -42,9 +42,6 @@
define open generic join
(items :: <sequence>, separator :: <string>, #key) => (new-string :: <string>);
-define open generic splitf
- (string :: <string>, separator :: <object>, #key) => (parts :: <sequence>);
-
define open generic replace
(original :: <string>, pattern :: <object>, replacement :: <string>, #key)
=> (new-string :: <string>, num-replacements :: <integer>);
@@ -817,119 +814,148 @@
end method join;
-define function split
- (string :: <string>,
- #key separator,
- start :: <integer> = 0,
- end: _end :: <integer> = string.size,
- max: _max :: <integer> = -1)
- => (strings :: <stretchy-object-vector>)
- local method is-white? (index)
- whitespace?(string[index])
- end;
- splitf(string, separator | is-white?, start: start, end: _end, max: _max)
-end function split;
-
-// <byte-string> separators
-define method splitf
- (string :: <byte-string>, separator :: <byte-string>, #rest kwargs, #key)
+// In common-dylan library...
+// Split a sequence into parts at each occurrence of the 'separator'
+// and return a sequence containing the parts. The sequence is
+// searched from beginning to end for the given 'separator'; the search
+// stops at the end of 'sequence' or once 'count' splits have been made
+// (giving at most 'count' + 1 parts). The meaning of the 'start' and
+// 'end' parameters may differ for different methods, but the intent
+// is that it be the same as if you passed in the subsequence delimited
+// by 'start' and 'end'. See the individual methods for details.
+//
+define generic split
+ (sequence :: <sequence>, separator :: <object>,
+ #key start :: <integer> = 0,
+ end: _end :: false-or(<integer>),
+ count :: false-or(<integer>))
+ => (parts :: <sequence>);
+
+// In common-dylan library
+
+// This is in some sense the most basic method, since others can be
+// implemented in terms of it. The 'separator' function must accept
+// three arguments: (1) the sequence in which to search for a
+// separator, (2) the start index in that sequence at which to begin
+// searching, and (3) the index at which to stop searching, or #f to
+// search the entire sequence. The 'separator' function must return
+// #f to indicate that no separator was found, or two values: the
+// start and end indices of the separator in the given sequence. The
+// initial start and end indices passed to the 'separator' function
+// are the same as the 'start' and 'end' arguments passed to this
+// method. The 'separator' function should stay within the given
+// bounds whenever possible. (In particular it may not always be
+// possible when the separator is a regex.)
+define method split
+ (seq :: <sequence>, separator :: <function>,
+ #key start :: <integer> = 0,
+ end: _end :: false-or(<integer>),
+ count :: false-or(<integer>))
=> (parts :: <sequence>)
- local method separator? (index)
- values(looking-at?(separator, string, index),
- separator.size)
- end;
- apply(next-method, string, separator?, kwargs)
-end method splitf;
-
-/* todo -- <character-set>
-define method splitf
- (string :: <byte-string>, separator :: <character-set>, #rest kwargs, #key)
+ let bpos = start;
+ let epos :: <integer> = _end | seq.size;
+ let parts = list(); // likely to be short
+ // The use of epos below is an efficiency hack, but having more than
+ // epos splits is impossible so it works.
+ let max-parts :: <integer> = count | epos;
+ let num-parts :: <integer> = 0;
+ let separator-end = #f;
+ while (bpos & bpos < epos & num-parts < max-parts)
+ let (sep-start, sep-end) = separator(seq, bpos, epos);
+ if (sep-start)
+ parts := add!(parts, copy-sequence(seq, start: bpos, end: sep-start));
+ separator-end := sep-end;
+ num-parts := num-parts + 1;
+ end;
+ bpos := sep-end; // may be #f and terminate loop
+ end while;
+ parts := add!(parts, if (separator-end)
+ copy-sequence(seq, start: separator-end, end: epos)
+ else
+ seq
+ end);
+ reverse!(parts)
+end method split;
+
+// In common-dylan library
+// Splits seq around occurrences of the separator subsequence.
+// Works for the relatively common case where seq and separator
+// are both <string>s.
+define method split
+ (seq :: <sequence>, separator :: <sequence>,
+ #key start :: <integer> = 0,
+ end: _end :: false-or(<integer>),
+ count :: false-or(<integer>))
=> (parts :: <sequence>)
- local method separator? (str, index)
- member?(str[index], separator)
+ local method find-string (seq :: <sequence>,
+ bpos :: <integer>,
+ epos :: false-or(<integer>))
+ // Note that this only splits on the separator sequence if it is
+ // entirely contained between the start and end positions.
+ let epos :: <integer> = epos | seq.size;
+ let max-separator-start :: <integer> = epos - separator.size;
+ block (exit-loop)
+ for (seq-index from bpos below max-separator-start)
+ if (looking-at?(separator, seq, seq-index))
+ exit-loop(seq-index, seq-index + separator.size);
+ end;
+ end;
+ #f // separator not found
+ end
end;
- apply(next-method, string, separator?, kwargs)
-end method splitf;
-*/
+ split(seq, find-string, start: start, end: _end, count: count);
+end method split;
-// <regexp> separators
-//
-// Due to limitations in the regular-expressions library a <regexp> separator
-// must be anchored (i.e., must start with ^) to be useful because there is no
-// way to request a match starting at a specific index.
-//
-// Ideally the regular-expressions library should "use strings;" and implement
-// this method.
-/* Not yet
-define method splitf
- (string :: <byte-string>, separator :: <regexp>, #rest kwargs, #key)
+// In common-dylan library
+// Split on a given object.
+// Covers the (<string>, <character>) case, for example.
+define method split
+ (seq :: <sequence>, separator :: <object>,
+ #key start :: <integer> = 0,
+ end: _end :: false-or(<integer>),
+ count :: false-or(<integer>))
=> (parts :: <sequence>)
- local method separator? (index)
- // todo -- pass end: arg to regexp-position
- // how to enforce the match to be anchored at index?
- let match = regexp-position(string, separator, start: index);
- values(match & #t,
- match & match.size)
- end;
- apply(next-method, string, separator?, kwargs)
-end method splitf;
-*/
+ local method find-pos (seq :: <sequence>,
+ bpos :: <integer>,
+ epos :: false-or(<integer>))
+ // Unfortunately common-dylan's position function doesn't accept
+ // start and end parameters so we have to write our own.
+ block (exit-loop)
+ for (i from bpos below epos)
+ // Should this use = or ==?
+ // How should we provide case-insensitive comparisons?
+ if (seq[i] = separator)
+ exit-loop(i, i + 1)
+ end;
+ end;
+ #f
+ end block
+ end method;
+ split(seq, find-pos, start: start, end: _end, count: count);
+end method split;
-// <function> separators (the most general)
-define method splitf
- (string :: <byte-string>, separator? :: <function>,
- #key start :: <integer> = 0,
- end: _end :: <integer> = string.size,
- max: _max :: <integer> = -1)
+// In regular-expressions library
+/*
+define method split
+ (seq :: <string>, separator :: <regex>,
+ #key start :: <integer> = 0,
+ end: _end :: false-or(<integer>),
+ count :: false-or(<integer>))
=> (parts :: <sequence>)
-
- // The separator? parameter accepts one argument, the index into the input
- // string, and returns two values: whether or not we're looking at a
- // separator and either #f or how long the separator is. If the second value
- // is #f, the separator is assumed to be of length 1. This is presumably the
- // common case, and this way you don't have to worry about it when writing a
- // separator? function.
-
- let parts :: <stretchy-vector> = make(<stretchy-vector>);
- let bpos :: <integer> = start;
- let curr :: <integer> = bpos;
- let num-splits :: <integer> = 0;
- let seen-non-separator? = #f;
- while (curr < _end & (_max == -1 | num-splits < _max))
- let (looking-at-separator?, sep-len) = separator?(curr);
- if (looking-at-separator?)
- if (seen-non-separator?)
- add!(parts, copy-sequence(string, start: bpos, end: curr));
- num-splits := num-splits + 1;
- end;
- if (sep-len)
- // The separator function told us how big the separator is.
- curr := curr + sep-len;
- else
- curr := curr + 1;
- while (curr < _end & separator?(curr))
- curr := curr + 1;
- end;
- end;
- bpos := curr;
- else // not looking at a separator
- if (~seen-non-separator?)
- // If all characters up to here have been separator characters, then
- // they should be ignored.
- seen-non-separator? := #t;
- bpos := curr;
- end;
- curr := curr + 1;
- end if;
- end while;
- // Stuff the rest of the string into the result.
- if (bpos < string.size)
- add!(parts, copy-sequence(string, start: bpos));
- end;
- parts
-end method splitf;
-
-//split("1,2,,4", separator: ",");
+ local method find-regex (seq :: <string>,
+ bpos :: <integer>,
+ epos :: false-or(<integer>))
+ let match = regex-search(separator, seq, start: bpos, end: epos);
+ if (match)
+ let group0 = match-group(match, 0);
+ values(group0.group-start, group0.group-end);
+ else
+ #f
+ end
+ end method find-regex;
+ split(seq, find-regex, start: start, end: _end, count: count);
+end method split;
+*/
// todo -- should this be exported?
@@ -970,18 +996,18 @@
define sealed method trim
(string :: <byte-string>,
#key test :: <function> = whitespace?,
- from :: one-of(#"left", #"right", #"both") = #"both",
+ side :: one-of(#"left", #"right", #"both") = #"both",
start :: <integer> = 0,
end: _end :: <integer> = string.size)
=> (trimmed-string :: <byte-string>)
let bpos :: <integer> = start;
let epos :: <integer> = _end;
- if (from == #"both" | from == #"left")
+ if (side == #"both" | side == #"left")
while (bpos < epos & test(string[bpos]))
bpos := bpos + 1;
end;
end;
- if (from == #"both" | from == #"right")
+ if (side == #"both" | side == #"right")
while (bpos < (epos - 1) & test(string[epos - 1]))
epos := epos - 1;
end;
Modified: trunk/libraries/strings/tests/strings-test-suite.dylan
==============================================================================
--- trunk/libraries/strings/tests/strings-test-suite.dylan (original)
+++ trunk/libraries/strings/tests/strings-test-suite.dylan Sun Jan 20 13:36:09 2008
@@ -59,8 +59,7 @@
(<sequence>, <string>, #"key", #"conjunction") => (<string>);
function split
- (<string>, #"key", #"separator", #"start", #"end", #"trim?", #"max",
- #"allow-empty-strings?") => (<sequence>);
+ (<string>, #"key", #"separator", #"start", #"end", #"count") => (<sequence>);
open generic-function trim
(<string>, #"key", #"test", #"side", #"start", #"end") => (<string>);
@@ -231,10 +230,41 @@
end function-test digit-to-integer;
define strings function-test split ()
- check-equal("split empty string", #[], split(""));
- check-equal("whitespace trimmed? 1", #[], split(" "));
- check-equal("whitespace trimmed? 2", #["."], split(" . "));
- check-equal("split \"a b c\"", #["a", "b", "c"], split("a b c"));
+ // Tests for basic functionality with no keyword args
+ check-equal("split empty string with another string",
+ split("", "-"),
+ #[""]);
+ check-equal("split empty sequence",
+ split(#(), #t),
+ #[#()]);
+ check-equal("basic split on string separator",
+ split("a b c", " "),
+ #["a", "b", "c"]);
+ check-equal("basic split on object separator",
+ split("a b c", ' '),
+ #["a", "b", "c"]);
+ check-equal("back-to-back separators",
+ split("a b", ' '),
+ #["a", "", "b"]);
+ check-equal("separators on the ends",
+ split(" x ", ' '),
+ #["", "x", ""]);
+
+ // Tests for the count argument.
+ check-equal("basic count test",
+ split("a,b,c,d", ',', count: 1),
+ #["a", "b,c,d"]);
+ check-equal("basic count test",
+ split("a,b,c,d", ',', count: 2),
+ #["a", "b", "c,d"]);
+
+ // Tests for the start and end arguments
+ check-equal("basic start/end test",
+ split("a b c d", ' ', start: 1, end: 6),
+ #["", "b", "c", ""]);
+ check-equal("basic start/end test",
+ split("a b c d", ' ', start: 1),
+ #["", "b", "c", "d"]);
end function-test split;
define function replacement-test (mutating?)
More information about the chatter
mailing list