Skip site navigation (1) | Skip section navigation (2)
Date:      Sat, 30 Jan 2010 15:58:03 GMT
From:      Mikolaj Golub <to.my.trociny@gmail.com>
To:        freebsd-gnats-submit@FreeBSD.org
Subject:   bin/143373: [patch] awk(1) tolower/toupper functions don't support multibyte charsets
Message-ID:  <201001301558.o0UFw3wr013544@www.freebsd.org>
Resent-Message-ID: <201001301600.o0UG06DB036273@freefall.freebsd.org>

next in thread | raw e-mail | index | archive | help

>Number:         143373
>Category:       bin
>Synopsis:       [patch] awk(1) tolower/toupper functions don't support multibyte charsets
>Confidential:   no
>Severity:       non-critical
>Priority:       low
>Responsible:    freebsd-bugs
>State:          open
>Quarter:        
>Keywords:       
>Date-Required:
>Class:          sw-bug
>Submitter-Id:   current-users
>Arrival-Date:   Sat Jan 30 16:00:06 UTC 2010
>Closed-Date:
>Last-Modified:
>Originator:     Mikolaj Golub
>Release:        8.0-STABLE, 7.2-STABLE
>Organization:
>Environment:
FreeBSD zhuzha.ua1 8.0-STABLE FreeBSD 8.0-STABLE #6: Sun Jan 24 21:36:17 EET 2010     root@zhuzha.ua1:/usr/obj/usr/src/sys/GENERIC  i386
>Description:
awk(1) tolower/toupper functions don't support multibyte charsets. This problem has already been addressed in NetBSD:

http://www.netbsd.org/cgi-bin/query-pr-single.pl?number=36394

It would be nice to have this fixed in FreeBSD too.
>How-To-Repeat:
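In a locale that uses a multibyte charset (e.g. UTF-8), run either command below on input containing non-ASCII letters; those letters are not case-converted correctly:

awk '{print tolower($0);}'
awk '{print toupper($0);}'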
>Fix:
See the attached patch, adapted from NetBSD. It adds support for multibyte charsets in the "tolower" and "toupper" awk functions (code contributed by Aleksey Cheusov in NetBSD PR#36394).

Patch attached with submission follows:

diff -ru contrib/one-true-awk.orig/proto.h contrib/one-true-awk/proto.h
--- contrib/one-true-awk.orig/proto.h	2002-12-13 06:59:47.000000000 +0200
+++ contrib/one-true-awk/proto.h	2010-01-30 17:26:20.000000000 +0200
@@ -110,6 +110,7 @@
 extern	char	*getsval(Cell *);
 extern	char	*getpssval(Cell *);     /* for print */
 extern	char	*tostring(const char *);
+extern	char	*tostringN(const char *, size_t n);
 extern	char	*qstring(const char *, int);
 
 extern	void	recinit(unsigned int);
Only in contrib/one-true-awk: proto.h.orig
diff -ru contrib/one-true-awk.orig/run.c contrib/one-true-awk/run.c
--- contrib/one-true-awk.orig/run.c	2007-06-05 18:33:51.000000000 +0300
+++ contrib/one-true-awk/run.c	2010-01-30 17:43:38.000000000 +0200
@@ -25,6 +25,8 @@
 #define DEBUG
 #include <stdio.h>
 #include <ctype.h>
+#include <wchar.h>
+#include <wctype.h>
 #include <setjmp.h>
 #include <limits.h>
 #include <math.h>
@@ -1466,10 +1468,12 @@
 	Cell *x, *y;
 	Awkfloat u;
 	int t;
-	char *p, *buf;
+	char *buf;
 	Node *nextarg;
 	FILE *fp;
 	void flush_all(void);
+	char *nawk_toupper(const char *);
+	char *nawk_tolower(const char *);
 
 	t = ptoi(a[0]);
 	x = execute(a[1]);
@@ -1521,16 +1525,10 @@
 		break;
 	case FTOUPPER:
 	case FTOLOWER:
-		buf = tostring(getsval(x));
-		if (t == FTOUPPER) {
-			for (p = buf; *p; p++)
-				if (islower((uschar) *p))
-					*p = toupper((uschar)*p);
-		} else {
-			for (p = buf; *p; p++)
-				if (isupper((uschar) *p))
-					*p = tolower((uschar)*p);
-		}
+		if (t == FTOUPPER)
+			buf = nawk_toupper(getsval(x));
+		else
+			buf = nawk_tolower(getsval(x));
 		tempfree(x);
 		x = gettemp();
 		setsval(x, buf);
@@ -1740,6 +1738,65 @@
 			fflush(files[i].fp);
 }
 
+char *nawk_toXXX(const char *s,
+			int (*fun_c)(int),
+			wint_t (*fun_wc)(wint_t))
+{
+	char *buf      = NULL;
+	char *pbuf     = NULL;
+	const char *ps = NULL;
+	size_t n       = 0;
+	mbstate_t mbs, mbs2;
+	wchar_t wc;
+	size_t sz = MB_CUR_MAX;
+
+	if (sz == 1) {
+		buf = tostring(s);
+
+		for (pbuf = buf; *pbuf; pbuf++)
+			*pbuf = fun_c((uschar)*pbuf);
+
+		return buf;
+	} else {
+		/* upper/lower character may be shorter/longer */
+		buf = tostringN(s, strlen(s) * sz + 1);
+
+		memset(&mbs,  0, sizeof(mbs));
+		memset(&mbs2, 0, sizeof(mbs2));
+
+		ps   = s;
+		pbuf = buf;
+		while (n = mbrtowc(&wc, ps, sz, &mbs),
+		       n > 0 && n != (size_t)-1 && n != (size_t)-2)
+		{
+			ps += n;
+
+			n = wcrtomb(pbuf, fun_wc(wc), &mbs2);
+			if (n == (size_t)-1 || n == (size_t)-2)
+				FATAL("illegal wide character %s", s);
+
+			pbuf += n;
+		}
+
+		*pbuf = 0;
+
+		if (n)
+			FATAL("illegal byte sequence %s", s);
+
+		return buf;
+	}
+}
+
+char *nawk_toupper(const char *s)
+{
+	return nawk_toXXX(s, toupper, towupper);
+}
+
+char *nawk_tolower(const char *s)
+{
+	return nawk_toXXX(s, tolower, towlower);
+}
+
 void backsub(char **pb_ptr, char **sptr_ptr);
 
 Cell *sub(Node **a, int nnn)	/* substitute command */
diff -ru contrib/one-true-awk.orig/tran.c contrib/one-true-awk/tran.c
--- contrib/one-true-awk.orig/tran.c	2007-10-25 15:38:02.000000000 +0300
+++ contrib/one-true-awk/tran.c	2010-01-30 17:26:20.000000000 +0200
@@ -407,6 +407,17 @@
 	return(p);
 }
 
+char *tostringN(const char *s, size_t n)	/* make a copy of string s */
+{
+	char *p;
+
+	p = malloc(n);
+	if (p == NULL)
+		FATAL("out of space in tostring on %s", s);
+	strcpy(p, s);
+	return(p);
+}
+
 char *qstring(const char *is, int delim)	/* collect string up to next delim */
 {
 	const char *os = is;


>Release-Note:
>Audit-Trail:
>Unformatted:



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201001301558.o0UFw3wr013544>