Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 14 Jul 2016 09:19:54 +0000 (UTC)
From:      "Andrey A. Chernov" <ache@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r302825 - head/usr.bin/tr
Message-ID:  <201607140919.u6E9JsDO024614@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: ache
Date: Thu Jul 14 09:19:53 2016
New Revision: 302825
URL: https://svnweb.freebsd.org/changeset/base/302825

Log:
  Back out non-collating [a-z] ranges (r302594).
  Instead of changing the whole course to another POSIX-permitted way
  for consistency and uniformity, I decided to completely ignore the missing
  regex functionality and focus on fixing bugs in what we have now;
  there are too many small obstacles in choosing the other way, counting ports.
  Corresponding libc changes are backed out in r302824.

Modified:
  head/usr.bin/tr/str.c
  head/usr.bin/tr/tr.1
  head/usr.bin/tr/tr.c

Modified: head/usr.bin/tr/str.c
==============================================================================
--- head/usr.bin/tr/str.c	Thu Jul 14 09:07:25 2016	(r302824)
+++ head/usr.bin/tr/str.c	Thu Jul 14 09:19:53 2016	(r302825)
@@ -53,7 +53,7 @@ static int      backslash(STR *, int *);
 static int	bracket(STR *);
 static void	genclass(STR *);
 static void	genequiv(STR *);
-static int	genrange(STR *);
+static int      genrange(STR *, int);
 static void	genseq(STR *);
 
 wint_t
@@ -93,7 +93,7 @@ next(STR *s)
 		}
 
 		/* We can start a range at any time. */
-		if (s->str[0] == '-' && genrange(s))
+		if (s->str[0] == '-' && genrange(s, is_octal))
 			return (next(s));
 		return (1);
 	case RANGE:
@@ -237,16 +237,18 @@ genequiv(STR *s)
 }
 
 static int
-genrange(STR *s)
+genrange(STR *s, int was_octal)
 {
-	int stopval;
+	int stopval, octal;
 	char *savestart;
+	int n, cnt, *p;
 	size_t clen;
 	wchar_t wc;
 
+	octal = 0;
 	savestart = s->str;
 	if (*++s->str == '\\')
-		stopval = backslash(s, NULL);
+		stopval = backslash(s, &octal);
 	else {
 		clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
 		if (clen == (size_t)-1 || clen == (size_t)-2)
@@ -254,13 +256,37 @@ genrange(STR *s)
 		stopval = wc;
 		s->str += clen;
 	}
-	if (stopval < s->lastch) {
+	/*
+	 * XXX Characters are not ordered according to collating sequence in
+	 * multibyte locales.
+	 */
+	if (octal || was_octal || MB_CUR_MAX > 1) {
+		if (stopval < s->lastch) {
+			s->str = savestart;
+			return (0);
+		}
+		s->cnt = stopval - s->lastch + 1;
+		s->state = RANGE;
+		--s->lastch;
+		return (1);
+	}
+	if (charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0) {
 		s->str = savestart;
 		return (0);
 	}
-	s->cnt = stopval - s->lastch + 1;
-	s->state = RANGE;
-	--s->lastch;
+	if ((s->set = p = malloc((NCHARS_SB + 1) * sizeof(int))) == NULL)
+		err(1, "genrange() malloc");
+	for (cnt = 0; cnt < NCHARS_SB; cnt++)
+		if (charcoll((const void *)&cnt, (const void *)&(s->lastch)) >= 0 &&
+		    charcoll((const void *)&cnt, (const void *)&stopval) <= 0)
+			*p++ = cnt;
+	*p = OOBCH;
+	n = p - s->set;
+
+	s->cnt = 0;
+	s->state = SET;
+	if (n > 1)
+		mergesort(s->set, n, sizeof(*(s->set)), charcoll);
 	return (1);
 }
 

Modified: head/usr.bin/tr/tr.1
==============================================================================
--- head/usr.bin/tr/tr.1	Thu Jul 14 09:07:25 2016	(r302824)
+++ head/usr.bin/tr/tr.1	Thu Jul 14 09:19:53 2016	(r302825)
@@ -164,6 +164,14 @@ as defined by the collation sequence.
 If either or both of the range endpoints are octal sequences, it
 represents the range of specific coded values between the
 range endpoints, inclusive.
+.Pp
+.Bf Em
+See the
+.Sx COMPATIBILITY
+section below for an important note regarding
+differences in the way the current
+implementation interprets range expressions differently from
+previous implementations.
 .Ef
 .It [:class:]
 Represents all characters belonging to the defined character class.
@@ -299,16 +307,22 @@ Remove diacritical marks from all accent
 .Pp
 .Dl "tr \*q[=e=]\*q \*qe\*q"
 .Sh COMPATIBILITY
+Previous
 .Fx
 implementations of
 .Nm
 did not order characters in range expressions according to the current
-locale's collation order, making it possible to convert accented Latin
-characters from upper to lower case using
+locale's collation order, making it possible to convert unaccented Latin
+characters (esp.\& as found in English text) from upper to lower case using
 the traditional
 .Ux
 idiom of
 .Dq Li "tr A-Z a-z" .
+Since
+.Nm
+now obeys the locale's collation order, this idiom may not produce
+correct results when there is not a 1:1 mapping between lower and
+upper case, or when the order of characters within the two cases differs.
 As noted in the
 .Sx EXAMPLES
 section above, the character class expressions
@@ -320,9 +334,6 @@ should be used instead of explicit chara
 and
 .Dq Li A-Z .
 .Pp
-.Dq Li [=equiv=]
-expression is implemented for single byte locales only.
-.Pp
 System V has historically implemented character ranges using the syntax
 .Dq Li [c-c]
 instead of the

Modified: head/usr.bin/tr/tr.c
==============================================================================
--- head/usr.bin/tr/tr.c	Thu Jul 14 09:07:25 2016	(r302824)
+++ head/usr.bin/tr/tr.c	Thu Jul 14 09:19:53 2016	(r302825)
@@ -68,8 +68,10 @@ static void usage(void);
 int
 main(int argc, char **argv)
 {
+	static int carray[NCHARS_SB];
 	struct cmap *map;
 	struct cset *delete, *squeeze;
+	int n, *p;
 	int Cflag, cflag, dflag, sflag, isstring2;
 	wint_t ch, cnt, lastch;
 
@@ -252,7 +254,7 @@ main(int argc, char **argv)
 		(void)next(&s2);
 	}
 endloop:
-	if (cflag || Cflag) {
+	if (cflag || (Cflag && MB_CUR_MAX > 1)) {
 		/*
 		 * This is somewhat tricky: since the character set is
 		 * potentially huge, we need to avoid allocating a map
@@ -270,11 +272,10 @@ endloop:
 			if (Cflag && !iswrune(cnt))
 				continue;
 			if (cmap_lookup(map, cnt) == OOBCH) {
-				if (next(&s2)) {
+				if (next(&s2))
 					cmap_add(map, cnt, s2.lastch);
-					if (sflag)
-						cset_add(squeeze, s2.lastch);
-				}
+				if (sflag)
+					cset_add(squeeze, s2.lastch);
 			} else
 				cmap_add(map, cnt, cnt);
 			if ((s2.state == EOS || s2.state == INFINITE) &&
@@ -282,6 +283,30 @@ endloop:
 				break;
 		}
 		cmap_default(map, s2.lastch);
+	} else if (Cflag) {
+		for (p = carray, cnt = 0; cnt < NCHARS_SB; cnt++) {
+			if (cmap_lookup(map, cnt) == OOBCH && iswrune(cnt))
+				*p++ = cnt;
+			else
+				cmap_add(map, cnt, cnt);
+		}
+		n = p - carray;
+		if (Cflag && n > 1)
+			(void)mergesort(carray, n, sizeof(*carray), charcoll);
+
+		s2.str = argv[1];
+		s2.state = NORMAL;
+		for (cnt = 0; cnt < n; cnt++) {
+			(void)next(&s2);
+			cmap_add(map, carray[cnt], s2.lastch);
+			/*
+			 * Chars taken from s2 can be different this time
+			 * due to lack of complex upper/lower processing,
+			 * so fill string2 again to not miss some.
+			 */
+			if (sflag)
+				cset_add(squeeze, s2.lastch);
+		}
 	}
 
 	cset_cache(squeeze);
@@ -326,6 +351,16 @@ setup(char *arg, STR *str, int cflag, in
 	return (cs);
 }
 
+int
+charcoll(const void *a, const void *b)
+{
+	static char sa[2], sb[2];
+
+	sa[0] = *(const int *)a;
+	sb[0] = *(const int *)b;
+	return (strcoll(sa, sb));
+}
+
 static void
 usage(void)
 {



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201607140919.u6E9JsDO024614>