From owner-svn-src-all@FreeBSD.ORG Sun May 8 11:32:20 2011 Return-Path: Delivered-To: svn-src-all@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id 6EF92106566C; Sun, 8 May 2011 11:32:20 +0000 (UTC) (envelope-from jilles@FreeBSD.org) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:4f8:fff6::2c]) by mx1.freebsd.org (Postfix) with ESMTP id 6098B8FC0A; Sun, 8 May 2011 11:32:20 +0000 (UTC) Received: from svn.freebsd.org (localhost [127.0.0.1]) by svn.freebsd.org (8.14.4/8.14.4) with ESMTP id p48BWKV2043928; Sun, 8 May 2011 11:32:20 GMT (envelope-from jilles@svn.freebsd.org) Received: (from jilles@localhost) by svn.freebsd.org (8.14.4/8.14.4/Submit) id p48BWKEk043924; Sun, 8 May 2011 11:32:20 GMT (envelope-from jilles@svn.freebsd.org) Message-Id: <201105081132.p48BWKEk043924@svn.freebsd.org> From: Jilles Tjoelker Date: Sun, 8 May 2011 11:32:20 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org X-SVN-Group: head MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: Subject: svn commit: r221646 - in head: bin/sh tools/regression/bin/sh/builtins tools/regression/bin/sh/expansion X-BeenThere: svn-src-all@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: "SVN commit messages for the entire src tree \(except for " user" and " projects" \)" List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Sun, 08 May 2011 11:32:20 -0000 Author: jilles Date: Sun May 8 11:32:20 2011 New Revision: 221646 URL: http://svn.freebsd.org/changeset/base/221646 Log: sh: Add UTF-8 support to pattern matching. ?, [...] patterns match codepoints instead of bytes. They do not match invalid sequences. [...] patterns must not contain invalid sequences otherwise they will not match anything. This is so that ${var#?} removes the first codepoint, not the first byte, without putting UTF-8 knowledge into the ${var#pattern} code. However, * continues to match any string and an invalid sequence matches an identical invalid sequence. (This differs from fnmatch(3).) Added: head/tools/regression/bin/sh/builtins/case5.0 (contents, props changed) head/tools/regression/bin/sh/expansion/trim8.0 (contents, props changed) Modified: head/bin/sh/expand.c Modified: head/bin/sh/expand.c ============================================================================== --- head/bin/sh/expand.c Sun May 8 11:20:27 2011 (r221645) +++ head/bin/sh/expand.c Sun May 8 11:32:20 2011 (r221646) @@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include /* * Routines to expand arguments to commands. We have to deal with @@ -111,16 +112,16 @@ static void addfname(char *); static struct strlist *expsort(struct strlist *); static struct strlist *msort(struct strlist *, int); static char *cvtnum(int, char *); -static int collate_range_cmp(int, int); +static int collate_range_cmp(wchar_t, wchar_t); static int -collate_range_cmp(int c1, int c2) +collate_range_cmp(wchar_t c1, wchar_t c2) { - static char s1[2], s2[2]; + static wchar_t s1[2], s2[2]; s1[0] = c1; s2[0] = c2; - return (strcoll(s1, s2)); + return (wcscoll(s1, s2)); } /* @@ -1377,6 +1378,23 @@ msort(struct strlist *list, int len) +static wchar_t +get_wc(const char **p) +{ + wchar_t c; + int chrlen; + + chrlen = mbtowc(&c, *p, 4); + if (chrlen == 0) + return 0; + else if (chrlen == -1) + c = 0; + else + *p += chrlen; + return c; +} + + /* * Returns true if the pattern matches the string. */ @@ -1386,6 +1404,7 @@ patmatch(const char *pattern, const char { const char *p, *q; char c; + wchar_t wc, wc2; p = pattern; q = string; @@ -1404,7 +1423,11 @@ patmatch(const char *pattern, const char case '?': if (squoted && *q == CTLESC) q++; - if (*q++ == '\0') + if (localeisutf8) + wc = get_wc(&q); + else + wc = *q++; + if (wc == '\0') return 0; break; case '*': @@ -1434,7 +1457,7 @@ patmatch(const char *pattern, const char case '[': { const char *endp; int invert, found; - char chr; + wchar_t chr; endp = p; if (*endp == '!' || *endp == '^') @@ -1455,8 +1478,11 @@ patmatch(const char *pattern, const char p++; } found = 0; - chr = *q++; - if (squoted && chr == CTLESC) + if (squoted && *q == CTLESC) + q++; + if (localeisutf8) + chr = get_wc(&q); + else chr = *q++; if (chr == '\0') return 0; @@ -1466,19 +1492,31 @@ patmatch(const char *pattern, const char continue; if (c == CTLESC) c = *p++; + if (localeisutf8 && c & 0x80) { + p--; + wc = get_wc(&p); + if (wc == 0) /* bad utf-8 */ + return 0; + } else + wc = c; if (*p == '-' && p[1] != ']') { p++; while (*p == CTLQUOTEMARK) p++; if (*p == CTLESC) p++; - if ( collate_range_cmp(chr, c) >= 0 - && collate_range_cmp(chr, *p) <= 0 + if (localeisutf8) { + wc2 = get_wc(&p); + if (wc2 == 0) /* bad utf-8 */ + return 0; + } else + wc2 = *p++; + if ( collate_range_cmp(chr, wc) >= 0 + && collate_range_cmp(chr, wc2) <= 0 ) found = 1; - p++; } else { - if (chr == c) + if (chr == wc) found = 1; } } while ((c = *p++) != ']'); Added: head/tools/regression/bin/sh/builtins/case5.0 ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ head/tools/regression/bin/sh/builtins/case5.0 Sun May 8 11:32:20 2011 (r221646) @@ -0,0 +1,57 @@ +# $FreeBSD$ + +unset LC_ALL +LC_CTYPE=en_US.UTF-8 +export LC_CTYPE + +c1=e +# a umlaut +c2=$(printf '\303\244') +# euro sign +c3=$(printf '\342\202\254') +# some sort of 't' outside BMP +c4=$(printf '\360\235\225\245') + +ok=0 +case $c1$c2$c3$c4 in +*) ok=1 ;; +esac +if [ $ok = 0 ]; then + echo wrong at $LINENO + exit 3 +fi + +case $c1$c2$c3$c4 in +$c1$c2$c3$c4) ;; +*) echo wrong at $LINENO ;; +esac + +case $c1$c2$c3$c4 in +"$c1$c2$c3$c4") ;; +*) echo wrong at $LINENO ;; +esac + +case $c1$c2$c3$c4 in +????) ;; +*) echo wrong at $LINENO ;; +esac + +case $c1.$c2.$c3.$c4 in +?.?.?.?) ;; +*) echo wrong at $LINENO ;; +esac + +case $c1$c2$c3$c4 in +[!a][!b][!c][!d]) ;; +*) echo wrong at $LINENO ;; +esac + +case $c1$c2$c3$c4 in +[$c1][$c2][$c3][$c4]) ;; +*) echo wrong at $LINENO ;; +esac + +case $c1$c2$c3$c4 in +["$c1"]["$c2"]["$c3"]["$c4"]) ;; +*) echo wrong at $LINENO ;; +esac Added: head/tools/regression/bin/sh/expansion/trim8.0 ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ head/tools/regression/bin/sh/expansion/trim8.0 Sun May 8 11:32:20 2011 (r221646) @@ -0,0 +1,75 @@ +# $FreeBSD$ + +unset LC_ALL +LC_CTYPE=en_US.UTF-8 +export LC_CTYPE + +c1=e +# a umlaut +c2=$(printf '\303\244') +# euro sign +c3=$(printf '\342\202\254') +# some sort of 't' outside BMP +c4=$(printf '\360\235\225\245') + +s=$c1$c2$c3$c4 + +testcase() { + code="$1" + expected="$2" + oIFS="$IFS" + eval "$code" + IFS='|' + result="$#|$*" + IFS="$oIFS" + if [ "x$result" = "x$expected" ]; then + ok=x$ok + else + failures=x$failures + echo "For $code, expected $expected actual $result" + fi +} + +testcase 'set -- "$s"' "1|$s" +testcase 'set -- "${s#$c2}"' "1|$s" +testcase 'set -- "${s#*}"' "1|$s" +testcase 'set -- "${s#$c1}"' "1|$c2$c3$c4" +testcase 'set -- "${s#$c1$c2}"' "1|$c3$c4" +testcase 'set -- "${s#$c1$c2$c3}"' "1|$c4" +testcase 'set -- "${s#$c1$c2$c3$c4}"' "1|" +testcase 'set -- "${s#?}"' "1|$c2$c3$c4" +testcase 'set -- "${s#??}"' "1|$c3$c4" +testcase 'set -- "${s#???}"' "1|$c4" +testcase 'set -- "${s#????}"' "1|" +testcase 'set -- "${s#*$c3}"' "1|$c4" +testcase 'set -- "${s%$c4}"' "1|$c1$c2$c3" +testcase 'set -- "${s%$c3$c4}"' "1|$c1$c2" +testcase 'set -- "${s%$c2$c3$c4}"' "1|$c1" +testcase 'set -- "${s%$c1$c2$c3$c4}"' "1|" +testcase 'set -- "${s%?}"' "1|$c1$c2$c3" +testcase 'set -- "${s%??}"' "1|$c1$c2" +testcase 'set -- "${s%???}"' "1|$c1" +testcase 'set -- "${s%????}"' "1|" +testcase 'set -- "${s%$c2*}"' "1|$c1" +testcase 'set -- "${s##$c2}"' "1|$s" +testcase 'set -- "${s##*}"' "1|" +testcase 'set -- "${s##$c1}"' "1|$c2$c3$c4" +testcase 'set -- "${s##$c1$c2}"' "1|$c3$c4" +testcase 'set -- "${s##$c1$c2$c3}"' "1|$c4" +testcase 'set -- "${s##$c1$c2$c3$c4}"' "1|" +testcase 'set -- "${s##?}"' "1|$c2$c3$c4" +testcase 'set -- "${s##??}"' "1|$c3$c4" +testcase 'set -- "${s##???}"' "1|$c4" +testcase 'set -- "${s##????}"' "1|" +testcase 'set -- "${s##*$c3}"' "1|$c4" +testcase 'set -- "${s%%$c4}"' "1|$c1$c2$c3" +testcase 'set -- "${s%%$c3$c4}"' "1|$c1$c2" +testcase 'set -- "${s%%$c2$c3$c4}"' "1|$c1" +testcase 'set -- "${s%%$c1$c2$c3$c4}"' "1|" +testcase 'set -- "${s%%?}"' "1|$c1$c2$c3" +testcase 'set -- "${s%%??}"' "1|$c1$c2" +testcase 'set -- "${s%%???}"' "1|$c1" +testcase 'set -- "${s%%????}"' "1|" +testcase 'set -- "${s%%$c2*}"' "1|$c1" + +test "x$failures" = x