Date: Sat, 30 Jan 2010 15:58:03 GMT From: Mikolaj Golub <to.my.trociny@gmail.com> To: freebsd-gnats-submit@FreeBSD.org Subject: bin/143373: [patch] awk(1) tolower/toupper functions don't support multibyte charsets Message-ID: <201001301558.o0UFw3wr013544@www.freebsd.org> Resent-Message-ID: <201001301600.o0UG06DB036273@freefall.freebsd.org>
next in thread | raw e-mail | index | archive | help
>Number: 143373 >Category: bin >Synopsis: [patch] awk(1) tolower/toupper functions don't support multibyte charsets >Confidential: no >Severity: non-critical >Priority: low >Responsible: freebsd-bugs >State: open >Quarter: >Keywords: >Date-Required: >Class: sw-bug >Submitter-Id: current-users >Arrival-Date: Sat Jan 30 16:00:06 UTC 2010 >Closed-Date: >Last-Modified: >Originator: Mikolaj Golub >Release: 8.0-STABLE, 7.2-STABLE >Organization: >Environment: FreeBSD zhuzha.ua1 8.0-STABLE FreeBSD 8.0-STABLE #6: Sun Jan 24 21:36:17 EET 2010 root@zhuzha.ua1:/usr/obj/usr/src/sys/GENERIC i386 >Description: awk(1) tolower/toupper functions don't support multibyte charsets. This problem has already been addressed in NetBSD: http://www.netbsd.org/cgi-bin/query-pr-single.pl?number=36394 It would be nice to have this fixed in FreeBSD too. >How-To-Repeat: awk '{print tolower($0);}' awk '{print toupper($0);}' >Fix: See the attached patch adapted from NetBSD (Add support for multibyte charsets in the "tolower" and "toupper" awk functions. Code contributed by Aleksey Cheusov in PR#36394). 
Patch attached with submission follows: diff -ru contrib/one-true-awk.orig/proto.h contrib/one-true-awk/proto.h --- contrib/one-true-awk.orig/proto.h 2002-12-13 06:59:47.000000000 +0200 +++ contrib/one-true-awk/proto.h 2010-01-30 17:26:20.000000000 +0200 @@ -110,6 +110,7 @@ extern char *getsval(Cell *); extern char *getpssval(Cell *); /* for print */ extern char *tostring(const char *); +extern char *tostringN(const char *, size_t n); extern char *qstring(const char *, int); extern void recinit(unsigned int); Only in contrib/one-true-awk: proto.h.orig diff -ru contrib/one-true-awk.orig/run.c contrib/one-true-awk/run.c --- contrib/one-true-awk.orig/run.c 2007-06-05 18:33:51.000000000 +0300 +++ contrib/one-true-awk/run.c 2010-01-30 17:43:38.000000000 +0200 @@ -25,6 +25,8 @@ #define DEBUG #include <stdio.h> #include <ctype.h> +#include <wchar.h> +#include <wctype.h> #include <setjmp.h> #include <limits.h> #include <math.h> @@ -1466,10 +1468,12 @@ Cell *x, *y; Awkfloat u; int t; - char *p, *buf; + char *buf; Node *nextarg; FILE *fp; void flush_all(void); + char *nawk_toupper(const char *); + char *nawk_tolower(const char *); t = ptoi(a[0]); x = execute(a[1]); @@ -1521,16 +1525,10 @@ break; case FTOUPPER: case FTOLOWER: - buf = tostring(getsval(x)); - if (t == FTOUPPER) { - for (p = buf; *p; p++) - if (islower((uschar) *p)) - *p = toupper((uschar)*p); - } else { - for (p = buf; *p; p++) - if (isupper((uschar) *p)) - *p = tolower((uschar)*p); - } + if (t == FTOUPPER) + buf = nawk_toupper(getsval(x)); + else + buf = nawk_tolower(getsval(x)); tempfree(x); x = gettemp(); setsval(x, buf); @@ -1740,6 +1738,65 @@ fflush(files[i].fp); } +char *nawk_toXXX(const char *s, + int (*fun_c)(int), + wint_t (*fun_wc)(wint_t)) +{ + char *buf = NULL; + char *pbuf = NULL; + const char *ps = NULL; + size_t n = 0; + mbstate_t mbs, mbs2; + wchar_t wc; + size_t sz = MB_CUR_MAX; + + if (sz == 1) { + buf = tostring(s); + + for (pbuf = buf; *pbuf; pbuf++) + *pbuf = fun_c((uschar)*pbuf); + + 
return buf; + } else { + /* upper/lower character may be shorter/longer */ + buf = tostringN(s, strlen(s) * sz + 1); + + memset(&mbs, 0, sizeof(mbs)); + memset(&mbs2, 0, sizeof(mbs2)); + + ps = s; + pbuf = buf; + while (n = mbrtowc(&wc, ps, sz, &mbs), + n > 0 && n != (size_t)-1 && n != (size_t)-2) + { + ps += n; + + n = wcrtomb(pbuf, fun_wc(wc), &mbs2); + if (n == (size_t)-1 || n == (size_t)-2) + FATAL("illegal wide character %s", s); + + pbuf += n; + } + + *pbuf = 0; + + if (n) + FATAL("illegal byte sequence %s", s); + + return buf; + } +} + +char *nawk_toupper(const char *s) +{ + return nawk_toXXX(s, toupper, towupper); +} + +char *nawk_tolower(const char *s) +{ + return nawk_toXXX(s, tolower, towlower); +} + void backsub(char **pb_ptr, char **sptr_ptr); Cell *sub(Node **a, int nnn) /* substitute command */ diff -ru contrib/one-true-awk.orig/tran.c contrib/one-true-awk/tran.c --- contrib/one-true-awk.orig/tran.c 2007-10-25 15:38:02.000000000 +0300 +++ contrib/one-true-awk/tran.c 2010-01-30 17:26:20.000000000 +0200 @@ -407,6 +407,17 @@ return(p); } +char *tostringN(const char *s, size_t n) /* make a copy of string s */ +{ + char *p; + + p = malloc(n); + if (p == NULL) + FATAL("out of space in tostring on %s", s); + strcpy(p, s); + return(p); +} + char *qstring(const char *is, int delim) /* collect string up to next delim */ { const char *os = is; >Release-Note: >Audit-Trail: >Unformatted:
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201001301558.o0UFw3wr013544>