[Gd-chatter] r11646 - in trunk/libraries/strings: . tests

cgay at gwydiondylan.org cgay at gwydiondylan.org
Sun Jan 20 13:36:11 CET 2008


Author: cgay
Date: Sun Jan 20 13:36:09 2008
New Revision: 11646

Modified:
   trunk/libraries/strings/library.dylan
   trunk/libraries/strings/strings.dylan
   trunk/libraries/strings/tests/strings-test-suite.dylan
Log:
job: minor
New split implementation, in "strings" library for now.  Haven't tested
the regex code yet.

Modified: trunk/libraries/strings/library.dylan
==============================================================================
--- trunk/libraries/strings/library.dylan	(original)
+++ trunk/libraries/strings/library.dylan	Sun Jan 20 13:36:09 2008
@@ -58,7 +58,7 @@
   create
     substring,
     join,
-    split, splitf,
+    split,
     trim,
     replace,
     replace!,

Modified: trunk/libraries/strings/strings.dylan
==============================================================================
--- trunk/libraries/strings/strings.dylan	(original)
+++ trunk/libraries/strings/strings.dylan	Sun Jan 20 13:36:09 2008
@@ -42,9 +42,6 @@
 define open generic join
     (items :: <sequence>, separator :: <string>, #key) => (new-string :: <string>);
 
-define open generic splitf
-    (string :: <string>, separator :: <object>, #key) => (parts :: <sequence>);
-
 define open generic replace
     (original :: <string>, pattern :: <object>, replacement :: <string>, #key)
  => (new-string :: <string>, num-replacements :: <integer>);
@@ -817,119 +814,148 @@
 end method join;
 
 
-define function split
-    (string :: <string>,
-     #key separator,
-          start :: <integer> = 0,
-          end: _end :: <integer> = string.size,
-          max: _max :: <integer> = -1)
- => (strings :: <stretchy-object-vector>)
-  local method is-white? (index)
-          whitespace?(string[index])
-        end;
-  splitf(string, separator | is-white?, start: start, end: _end, max: _max)
-end function split;
-
-// <byte-string> separators
-define method splitf
-    (string :: <byte-string>, separator :: <byte-string>, #rest kwargs, #key)
+// In common-dylan library...
+// Split a sequence into parts at each occurrence of the 'separator'
+// and return a sequence containing the parts.  The sequence is
+// searched from beginning to end for the given 'separator'; the search
+// stops when it reaches the end of 'sequence' or when the size of the
+// result reaches 'count' elements.  The meaning of the 'start' and
+// 'end' parameters may differ for different methods, but the intent
+// is that it be the same as if you passed in the subsequence delimited
+// by 'start' and 'end'.  See the individual methods for details.
+//
+define generic split
+    (sequence :: <sequence>, separator :: <object>,
+     #key start :: <integer> = 0,
+          end: _end :: false-or(<integer>),
+          count :: false-or(<integer>))
+ => (parts :: <sequence>);
+
+// In common-dylan library
+
+// This is in some sense the most basic method, since others can be
+// implemented in terms of it.  The 'separator' function must accept
+// three arguments: (1) the sequence in which to search for a
+// separator, (2) the start index in that sequence at which to begin
+// searching, and (3) the index at which to stop searching, or #f to
+// search the entire sequence.  The 'separator' function must return
+// #f to indicate that no separator was found, or two values: the
+// start and end indices of the separator in the given sequence.  The
+// initial start and end indices passed to the 'separator' function
+// are the same as the 'start' and 'end' arguments passed to this
+// method.  The 'separator' function should stay within the given
+// bounds whenever possible.  (In particular it may not always be
+// possible when the separator is a regex.)
+define method split
+    (seq :: <sequence>, separator :: <function>,
+     #key start :: <integer> = 0,
+          end: _end :: false-or(<integer>),
+          count :: false-or(<integer>))
  => (parts :: <sequence>)
-  local method separator? (index)
-          values(looking-at?(separator, string, index),
-                 separator.size)
-        end;
-  apply(next-method, string, separator?, kwargs)
-end method splitf;
-
-/* todo -- <character-set>
-define method splitf
-    (string :: <byte-string>, separator :: <character-set>, #rest kwargs, #key)
+  let bpos = start;
+  let epos :: <integer> = _end | seq.size;
+  let parts = list();           // likely to be short
+  // The use of epos below is an efficiency hack, but having more than
+  // epos splits is impossible so it works.
+  let max-parts :: <integer> = count | epos;
+  let num-parts :: <integer> = 0;
+  let separator-end = #f;
+  while (bpos & bpos < epos & num-parts < max-parts)
+    let (sep-start, sep-end) = separator(seq, bpos, epos);
+    if (sep-start)
+      parts := add!(parts, copy-sequence(seq, start: bpos, end: sep-start));
+      separator-end := sep-end;
+      num-parts := num-parts + 1;
+    end;
+    bpos := sep-end;  // may be #f and terminate loop
+  end while;
+  parts := add!(parts, if (separator-end)
+                         copy-sequence(seq, start: separator-end, end: epos)
+                       else
+                         seq
+                       end);
+  reverse!(parts)
+end method split;
+
+// In common-dylan library
+// Splits seq around occurrences of the separator subsequence.
+// Works for the relatively common case where seq and separator
+// are both <string>s.
+define method split
+    (seq :: <sequence>, separator :: <sequence>,
+     #key start :: <integer> = 0,
+          end: _end :: false-or(<integer>),
+          count :: false-or(<integer>))
  => (parts :: <sequence>)
-  local method separator? (str, index)
-          member?(str[index], separator)
+  local method find-string (seq :: <sequence>,
+                            bpos :: <integer>,
+                            epos :: false-or(<integer>))
+          // Note that this only splits on the separator sequence if it is
+          // entirely contained between the start and end positions.
+          let epos :: <integer> = epos | seq.size;
+          let max-separator-start :: <integer> = epos - separator.size;
+          block (exit-loop)
+            for (seq-index from bpos below max-separator-start)
+              if (looking-at?(separator, seq, seq-index))
+                exit-loop(seq-index, seq-index + separator.size);
+              end;
+            end;
+            #f      // separator not found
+          end
         end;
-  apply(next-method, string, separator?, kwargs)
-end method splitf;
-*/
+  split(seq, find-string, start: start, end: _end, count: count);
+end method split;
 
-// <regexp> separators
-//
-// Due to limitations in the regular-expressions library a <regexp> separator
-// must be anchored (i.e., must start with ^) to be useful because there is no
-// way to request a match starting at a specific index.
-//
-// Ideally the regular-expressions library should "use strings;" and implement
-// this method.  
-/* Not yet
-define method splitf
-    (string :: <byte-string>, separator :: <regexp>, #rest kwargs, #key)
+// In common-dylan library
+// Split on a given object.
+// Covers the (<string>, <character>) case, for example.
+define method split
+    (seq :: <sequence>, separator :: <object>,
+     #key start :: <integer> = 0,
+          end: _end :: false-or(<integer>),
+          count :: false-or(<integer>))
  => (parts :: <sequence>)
-  local method separator? (index)
-          // todo -- pass end: arg to regexp-position
-          //         how to enforce the match to be anchored at index?
-          let match = regexp-position(string, separator, start: index);
-          values(match & #t,
-                 match & match.size)
-        end;
-  apply(next-method, string, separator?, kwargs)
-end method splitf;
-*/
+  local method find-pos (seq :: <sequence>,
+                         bpos :: <integer>,
+                         epos :: false-or(<integer>))
+          // Unfortunately common-dylan's position function doesn't accept
+          // start and end parameters so we have to write our own.
+          block (exit-loop)
+            for (i from bpos below epos)
+              // Should this use = or ==?
+              // How should we provide case-insensitive comparisons?
+              if (seq[i] = separator)
+                exit-loop(i, i + 1)
+              end;
+            end;
+            #f
+          end block
+        end method;
+  split(seq, find-pos, start: start, end: _end, count: count);
+end method split;
 
-// <function> separators (the most general)
-define method splitf
-    (string :: <byte-string>, separator? :: <function>,
-     #key start :: <integer> = 0, 
-          end: _end :: <integer> = string.size,
-          max: _max :: <integer> = -1)
+// In regular-expressions library
+/*
+define method split
+    (seq :: <string>, separator :: <regex>,
+     #key start :: <integer> = 0,
+          end: _end :: false-or(<integer>),
+          count :: false-or(<integer>))
  => (parts :: <sequence>)
-
-  // The separator? parameter accepts one argument, the index into the input
-  // string, and returns two values: whether or not we're looking at a
-  // separator and either #f or how long the separator is.  If the second value
-  // is #f, the separator is assumed to be of length 1.  This is presumably the
-  // common case, and this way you don't have to worry about it when writing a
-  // separator? function.
-
-  let parts :: <stretchy-vector> = make(<stretchy-vector>);
-  let bpos :: <integer> = start;
-  let curr :: <integer> = bpos;
-  let num-splits :: <integer> = 0;
-  let seen-non-separator? = #f;
-  while (curr < _end & (_max == -1 | num-splits < _max))
-    let (looking-at-separator?, sep-len) = separator?(curr);
-    if (looking-at-separator?)
-      if (seen-non-separator?)
-        add!(parts, copy-sequence(string, start: bpos, end: curr));
-        num-splits := num-splits + 1;
-      end;
-      if (sep-len)
-        // The separator function told us how big the separator is.
-        curr := curr + sep-len;
-      else
-        curr := curr + 1;
-        while (curr < _end & separator?(curr))
-          curr := curr + 1;
-        end;
-      end;
-      bpos := curr;
-    else // not looking at a separator
-      if (~seen-non-separator?)
-        // If all characters up to here have been separator characters, then
-        // they should be ignored.
-        seen-non-separator? := #t;
-        bpos := curr;
-      end;
-      curr := curr + 1;
-    end if;
-  end while;
-  // Stuff the rest of the string into the result.
-  if (bpos < string.size)
-    add!(parts, copy-sequence(string, start: bpos));
-  end;
-  parts
-end method splitf;
-
-//split("1,2,,4", separator: ",");
+  local method find-regex (seq :: <string>,
+                           bpos :: <integer>,
+                           epos :: false-or(<integer>))
+          let match = regex-search(separator, seq, start: bpos, end: epos);
+          if (match)
+            let group0 = match-group(match, 0);
+            values(group0.group-start, group0.group-end);
+          else
+            #f
+          end
+        end method find-regex;
+  split(seq, find-regex, start: start, end: _end, count: count);
+end method split;
+*/
 
 
 // todo -- should this be exported?
@@ -970,18 +996,18 @@
 define sealed method trim
     (string :: <byte-string>,
      #key test :: <function> = whitespace?,
-          from :: one-of(#"left", #"right", #"both") = #"both",
+          side :: one-of(#"left", #"right", #"both") = #"both",
           start :: <integer> = 0,
           end: _end :: <integer> = string.size)
  => (trimmed-string :: <byte-string>)
   let bpos :: <integer> = start;
   let epos :: <integer> = _end;
-  if (from == #"both" | from == #"left")
+  if (side == #"both" | side == #"left")
     while (bpos < epos & test(string[bpos]))
       bpos := bpos + 1;
     end;
   end;
-  if (from == #"both" | from == #"right")
+  if (side == #"both" | side == #"right")
     while (bpos < (epos - 1) & test(string[epos - 1]))
       epos := epos - 1;
     end;

Modified: trunk/libraries/strings/tests/strings-test-suite.dylan
==============================================================================
--- trunk/libraries/strings/tests/strings-test-suite.dylan	(original)
+++ trunk/libraries/strings/tests/strings-test-suite.dylan	Sun Jan 20 13:36:09 2008
@@ -59,8 +59,7 @@
     (<sequence>, <string>, #"key", #"conjunction") => (<string>);
 
   function split
-    (<string>, #"key", #"separator", #"start", #"end", #"trim?", #"max",
-     #"allow-empty-strings?") => (<sequence>);
+    (<string>, #"key", #"separator", #"start", #"end", #"count") => (<sequence>);
 
   open generic-function trim
     (<string>, #"key", #"test", #"side", #"start", #"end") => (<string>);
@@ -231,10 +230,41 @@
 end function-test digit-to-integer;
 
 define strings function-test split ()
-    check-equal("split empty string", #[], split(""));
-    check-equal("whitespace trimmed? 1", #[], split(" "));
-    check-equal("whitespace trimmed? 2", #["."], split(" . "));
-    check-equal("split \"a b c\"", #["a", "b", "c"], split("a b c"));
+  // Tests for basic functionality with no keyword args
+  check-equal("split empty string with another string",
+              split("", "-"),
+              #[""]);
+  check-equal("split empty sequence",
+              split(#(), #t),
+              #[#()]);
+  check-equal("basic split on string separator",
+              split("a b c", " "),
+              #["a", "b", "c"]);
+  check-equal("basic split on object separator",
+              split("a b c", ' '),
+              #["a", "b", "c"]);
+  check-equal("back-to-back separators",
+              split("a  b", ' '),
+              #["a", "", "b"]);
+  check-equal("separators on the ends",
+              split(" x ", ' '),
+              #["", "x", ""]);
+
+  // Tests for the count argument.
+  check-equal("basic count test",
+              split("a,b,c,d", ',', count: 1),
+              #["a", "b,c,d"]);
+  check-equal("basic count test",
+              split("a,b,c,d", ',', count: 2),
+              #["a", "b", "c,d"]);
+
+  // Tests for the start and end arguments
+  check-equal("basic start/end test",
+              split("a b c d", ' ', start: 1, end: 6),
+              #["", "b", "c", ""]);
+  check-equal("basic start/end test",
+              split("a b c d", ' ', start: 1),
+              #["", "b", "c", "d"]);
 end function-test split;
 
 define function replacement-test (mutating?)



More information about the chatter mailing list