From owner-freebsd-current@FreeBSD.ORG Wed Sep 19 12:10:30 2007 Return-Path: Delivered-To: current@FreeBSD.ORG Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id 01D9716A41B; Wed, 19 Sep 2007 12:10:30 +0000 (UTC) (envelope-from ache@nagual.pp.ru) Received: from nagual.pp.ru (nagual.pp.ru [194.87.13.69]) by mx1.freebsd.org (Postfix) with ESMTP id 27F7313C48A; Wed, 19 Sep 2007 12:10:28 +0000 (UTC) (envelope-from ache@nagual.pp.ru) Received: from nagual.pp.ru (ache@localhost [127.0.0.1]) by nagual.pp.ru (8.14.1/8.14.1) with ESMTP id l8JCARcc081816; Wed, 19 Sep 2007 16:10:27 +0400 (MSD) (envelope-from ache@nagual.pp.ru) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=nagual.pp.ru; s=default; t=1190203827; bh=JL/Hv73FefsaA0z0rex6B157SVZKWSsrmb+tZ8R XV8w=; l=16576; h=Date:From:To:Subject:Message-ID:Mail-Followup-To: References:MIME-Version:Content-Type:Content-Disposition: In-Reply-To:User-Agent; b=dOXshvrv1dLWo+BK2XW++iPMZ58WM3dskWeXo21d 8s7nyfGQwmk7szxhyX3ffJonzcos5fYr2Sae2ykso/Twir/w5iArvqLchquBqasyJCL S8Ml9vfgb2xmaKsgRnacEZ4LdDC3YWrjPypjBmmUgcTkZ3lp/G5hLxZmH1FkTRe4= Received: (from ache@localhost) by nagual.pp.ru (8.14.1/8.14.1/Submit) id l8JCAPpd081812; Wed, 19 Sep 2007 16:10:25 +0400 (MSD) (envelope-from ache) Date: Wed, 19 Sep 2007 16:10:24 +0400 From: Andrey Chernov To: Taku YAMAMOTO , Petr Hroudn?? , current@FreeBSD.ORG, perky@FreeBSD.ORG, i18n@FreeBSD.ORG Message-ID: <20070919121024.GA81606@nagual.pp.ru> Mail-Followup-To: Andrey Chernov , Taku YAMAMOTO , Petr Hroudn?? 
, current@FreeBSD.ORG, perky@FreeBSD.ORG, i18n@FreeBSD.ORG References: <20070916192924.GA12678@nagual.pp.ru> <20070917092130.GA24424@nagual.pp.ru> <20070918020100.d43beb0b.taku@tackymt.homeip.net> <20070917171633.GA31179@nagual.pp.ru> <20070919111207.f37653fc.taku@tackymt.homeip.net> <20070919022555.GA70617@nagual.pp.ru> <20070919023625.GA70891@nagual.pp.ru> <20070919051830.GA72429@nagual.pp.ru> MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="0OAP2g/MAC+5xKAE" Content-Disposition: inline In-Reply-To: <20070919051830.GA72429@nagual.pp.ru> User-Agent: Mutt/1.5.16 (2007-06-09) Cc: Subject: Re: Ctype patch for review X-BeenThere: freebsd-current@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: Discussions about the use of FreeBSD-current List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 19 Sep 2007 12:10:30 -0000 --0OAP2g/MAC+5xKAE Content-Type: text/plain; charset=us-ascii Content-Disposition: inline On Wed, Sep 19, 2007 at 09:18:30AM +0400, Andrey Chernov wrote: > I change my mind again, now I use new __mb_bit8_override flag specific to > UTF-8 encoding (other bit8 overriding encodings could use it too). New > patch attached. Improved version. Introduce a general __mb_sch_limit parameter instead, for all locales, specifying the upper limit of the single char range. It also allows fixing the bug when ctype(3) functions are called with arg > 0xFF for wide character locales, and simplifies all checks. New patch is attached. Here is the updated rationale again: ------------------------------------------------------------------------- The problem is: currently our single byte ctype(3) functions are broken for wide character locales in the argument range >= 0x80 - they may return false positives. 
Example 1: for the UTF-8 locale we currently have: iswspace(0xA0)==1 and isspace(0xA0)==1 (because iswspace() and isspace() are the same code) but must have iswspace(0xA0)==1 and isspace(0xA0)==0 (because neither this character nor any other in the range 0x80..0xff exists as a single byte character in the UTF-8 locale; it keeps only ASCII in the single byte range because our internal wchar_t representation for UTF-8 is UCS-4). Example 2: for all wide character locales isalpha(arg) when arg > 0xFF may return false positives (must be 0). (because iswalpha() and isalpha() are the same code) The attached patch addresses this issue and also fixes iswascii() (currently iswascii() is broken for arguments > 0xFF). This patch is 100% binary compatible with old binaries. -- http://ache.pp.ru/ --0OAP2g/MAC+5xKAE Content-Type: text/x-diff; charset=us-ascii Content-Disposition: attachment; filename="ctype.patch" --- _ctype.h.old 2007-09-16 21:13:59.000000000 +0400 +++ _ctype.h 2007-09-19 15:29:41.000000000 +0400 @@ -63,6 +63,7 @@ #define _CTYPE_I 0x00080000L /* Ideogram */ #define _CTYPE_T 0x00100000L /* Special */ #define _CTYPE_Q 0x00200000L /* Phonogram */ +#define _CTYPE_WID 0x10000000L /* wide character function */ #define _CTYPE_SW0 0x20000000L /* 0 width character */ #define _CTYPE_SW1 0x40000000L /* 1 width character */ #define _CTYPE_SW2 0x80000000L /* 2 width character */ @@ -87,6 +88,8 @@ #define __inline #endif +extern int __mb_sch_limit; + /* * Use inline functions if we are allowed to and the compiler supports them. */ @@ -98,8 +101,11 @@ static __inline int __maskrune(__ct_rune_t _c, unsigned long _f) { - return ((_c < 0 || _c >= _CACHED_RUNES) ? ___runetype(_c) : + return (_c < 0 || (!(_f & _CTYPE_WID) && _c >= __mb_sch_limit)) ? 0 : + (_c >= _CACHED_RUNES ? ___runetype(_c) : _CurrentRuneLocale->__runetype[_c]) & _f; + /* We never set _CTYPE_WID in the locale data, */ + /* so can skip ... & (_f & ~_CTYPE_WID). 
*/ } static __inline int @@ -111,7 +117,7 @@ static __inline int __isctype(__ct_rune_t _c, unsigned long _f) { - return (_c < 0 || _c >= _CACHED_RUNES) ? 0 : + return (_c < 0 || _c >= __mb_sch_limit) ? 0 : !!(_DefaultRuneLocale.__runetype[_c] & _f); } @@ -129,6 +135,20 @@ _CurrentRuneLocale->__maplower[_c]; } +static __inline __ct_rune_t +__tosupper(__ct_rune_t _c) +{ + return (_c < 0 || _c >= __mb_sch_limit) ? _c : + _CurrentRuneLocale->__mapupper[_c]; +} + +static __inline __ct_rune_t +__toslower(__ct_rune_t _c) +{ + return (_c < 0 || _c >= __mb_sch_limit) ? _c : + _CurrentRuneLocale->__maplower[_c]; +} + static __inline int __wcwidth(__ct_rune_t _c) { @@ -150,6 +170,8 @@ int __isctype(__ct_rune_t, unsigned long); __ct_rune_t __toupper(__ct_rune_t); __ct_rune_t __tolower(__ct_rune_t); +__ct_rune_t __tosupper(__ct_rune_t); +__ct_rune_t __toslower(__ct_rune_t); int __wcwidth(__ct_rune_t); __END_DECLS #endif /* using inlines */ --- big5.c.old 2007-09-19 08:48:55.000000000 +0400 +++ big5.c 2007-09-19 15:41:26.000000000 +0400 @@ -49,6 +49,8 @@ #include #include "mblocal.h" +extern int __mb_sch_limit; + static size_t _BIG5_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t, mbstate_t * __restrict); static int _BIG5_mbsinit(const mbstate_t *); @@ -68,6 +70,7 @@ __mbsinit = _BIG5_mbsinit; _CurrentRuneLocale = rl; __mb_cur_max = 2; + __mb_sch_limit = 256; return (0); } --- ctype.h.old 2007-09-16 22:03:55.000000000 +0400 +++ ctype.h 2007-09-16 22:56:10.000000000 +0400 @@ -97,8 +97,8 @@ #define isspace(c) __istype((c), _CTYPE_S) #define isupper(c) __istype((c), _CTYPE_U) #define isxdigit(c) __isctype((c), _CTYPE_X) /* ANSI -- locale independent */ -#define tolower(c) __tolower(c) -#define toupper(c) __toupper(c) +#define tolower(c) __toslower(c) +#define toupper(c) __tosupper(c) #if __XSI_VISIBLE /* @@ -112,8 +112,8 @@ * * XXX isascii() and toascii() should similarly be undocumented. 
*/ -#define _tolower(c) __tolower(c) -#define _toupper(c) __toupper(c) +#define _tolower(c) __toslower(c) +#define _toupper(c) __tosupper(c) #define isascii(c) (((c) & ~0x7F) == 0) #define toascii(c) ((c) & 0x7F) #endif @@ -128,7 +128,7 @@ #define isideogram(c) __istype((c), _CTYPE_I) #define isnumber(c) __istype((c), _CTYPE_D) #define isphonogram(c) __istype((c), _CTYPE_Q) -#define isrune(c) __istype((c), 0xFFFFFF00L) +#define isrune(c) __istype((c), 0xFFFFFF00L & ~_CTYPE_WID) #define isspecial(c) __istype((c), _CTYPE_T) #endif --- euc.c.old 2007-09-19 08:50:57.000000000 +0400 +++ euc.c 2007-09-19 15:41:26.000000000 +0400 @@ -49,6 +49,8 @@ #include #include "mblocal.h" +extern int __mb_sch_limit; + static size_t _EUC_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t, mbstate_t * __restrict); static int _EUC_mbsinit(const mbstate_t *); @@ -116,6 +118,7 @@ __mbrtowc = _EUC_mbrtowc; __wcrtomb = _EUC_wcrtomb; __mbsinit = _EUC_mbsinit; + __mb_sch_limit = 256; return (0); } --- gb18030.c.old 2007-09-19 08:59:01.000000000 +0400 +++ gb18030.c 2007-09-19 15:41:26.000000000 +0400 @@ -39,6 +39,8 @@ #include #include "mblocal.h" +extern int __mb_sch_limit; + static size_t _GB18030_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t, mbstate_t * __restrict); static int _GB18030_mbsinit(const mbstate_t *); @@ -59,6 +61,7 @@ __mbsinit = _GB18030_mbsinit; _CurrentRuneLocale = rl; __mb_cur_max = 4; + __mb_sch_limit = 256; return (0); } --- gb2312.c.old 2007-09-19 09:00:35.000000000 +0400 +++ gb2312.c 2007-09-19 15:41:26.000000000 +0400 @@ -35,6 +35,8 @@ #include #include "mblocal.h" +extern int __mb_sch_limit; + static size_t _GB2312_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t, mbstate_t * __restrict); static int _GB2312_mbsinit(const mbstate_t *); @@ -55,6 +57,7 @@ __wcrtomb = _GB2312_wcrtomb; __mbsinit = _GB2312_mbsinit; __mb_cur_max = 2; + __mb_sch_limit = 256; return (0); } --- gbk.c.old 2007-09-19 09:01:33.000000000 +0400 +++ gbk.c 
2007-09-19 15:41:26.000000000 +0400 @@ -42,6 +42,8 @@ #include #include "mblocal.h" +extern int __mb_sch_limit; + static size_t _GBK_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t, mbstate_t * __restrict); static int _GBK_mbsinit(const mbstate_t *); @@ -61,6 +63,7 @@ __mbsinit = _GBK_mbsinit; _CurrentRuneLocale = rl; __mb_cur_max = 2; + __mb_sch_limit = 256; return (0); } --- isctype.c.old 2007-09-16 22:31:26.000000000 +0400 +++ isctype.c 2007-09-16 22:37:54.000000000 +0400 @@ -168,7 +168,7 @@ isrune(c) int c; { - return (__istype(c, 0xFFFFFF00L)); + return (__istype(c, 0xFFFFFF00L & ~_CTYPE_WID)); } #undef isspace @@ -216,7 +216,7 @@ tolower(c) int c; { - return (__tolower(c)); + return (__toslower(c)); } #undef toupper @@ -224,6 +224,6 @@ toupper(c) int c; { - return (__toupper(c)); + return (__tosupper(c)); } --- iswctype.c.old 2007-09-16 22:31:30.000000000 +0400 +++ iswctype.c 2007-09-19 15:45:26.000000000 +0400 @@ -45,7 +45,7 @@ iswalnum(wc) wint_t wc; { - return (__istype(wc, _CTYPE_A|_CTYPE_D)); + return (__istype(wc, _CTYPE_A|_CTYPE_D|_CTYPE_WID)); } #undef iswalpha @@ -53,7 +53,7 @@ iswalpha(wc) wint_t wc; { - return (__istype(wc, _CTYPE_A)); + return (__istype(wc, _CTYPE_A|_CTYPE_WID)); } #undef iswascii @@ -61,7 +61,7 @@ iswascii(wc) wint_t wc; { - return ((wc & ~0x7F) == 0); + return (wc < 0x80); } #undef iswblank @@ -69,7 +69,7 @@ iswblank(wc) wint_t wc; { - return (__istype(wc, _CTYPE_B)); + return (__istype(wc, _CTYPE_B|_CTYPE_WID)); } #undef iswcntrl @@ -77,7 +77,7 @@ iswcntrl(wc) wint_t wc; { - return (__istype(wc, _CTYPE_C)); + return (__istype(wc, _CTYPE_C|_CTYPE_WID)); } #undef iswdigit @@ -93,7 +93,7 @@ iswgraph(wc) wint_t wc; { - return (__istype(wc, _CTYPE_G)); + return (__istype(wc, _CTYPE_G|_CTYPE_WID)); } #undef iswhexnumber @@ -101,7 +101,7 @@ iswhexnumber(wc) wint_t wc; { - return (__istype(wc, _CTYPE_X)); + return (__istype(wc, _CTYPE_X|_CTYPE_WID)); } #undef iswideogram @@ -109,7 +109,7 @@ iswideogram(wc) wint_t wc; { - 
return (__istype(wc, _CTYPE_I)); + return (__istype(wc, _CTYPE_I|_CTYPE_WID)); } #undef iswlower @@ -117,7 +117,7 @@ iswlower(wc) wint_t wc; { - return (__istype(wc, _CTYPE_L)); + return (__istype(wc, _CTYPE_L|_CTYPE_WID)); } #undef iswnumber @@ -125,7 +125,7 @@ iswnumber(wc) wint_t wc; { - return (__istype(wc, _CTYPE_D)); + return (__istype(wc, _CTYPE_D|_CTYPE_WID)); } #undef iswphonogram @@ -133,7 +133,7 @@ iswphonogram(wc) wint_t wc; { - return (__istype(wc, _CTYPE_Q)); + return (__istype(wc, _CTYPE_Q|_CTYPE_WID)); } #undef iswprint @@ -141,7 +141,7 @@ iswprint(wc) wint_t wc; { - return (__istype(wc, _CTYPE_R)); + return (__istype(wc, _CTYPE_R|_CTYPE_WID)); } #undef iswpunct @@ -149,7 +149,7 @@ iswpunct(wc) wint_t wc; { - return (__istype(wc, _CTYPE_P)); + return (__istype(wc, _CTYPE_P|_CTYPE_WID)); } #undef iswrune @@ -157,7 +157,7 @@ iswrune(wc) wint_t wc; { - return (__istype(wc, 0xFFFFFF00L)); + return (__istype(wc, 0xFFFFFF00L)); /* already have _CTYPE_WID */ } #undef iswspace @@ -165,7 +165,7 @@ iswspace(wc) wint_t wc; { - return (__istype(wc, _CTYPE_S)); + return (__istype(wc, _CTYPE_S|_CTYPE_WID)); } #undef iswspecial @@ -173,7 +173,7 @@ iswspecial(wc) wint_t wc; { - return (__istype(wc, _CTYPE_T)); + return (__istype(wc, _CTYPE_T|_CTYPE_WID)); } #undef iswupper @@ -181,7 +181,7 @@ iswupper(wc) wint_t wc; { - return (__istype(wc, _CTYPE_U)); + return (__istype(wc, _CTYPE_U|_CTYPE_WID)); } #undef iswxdigit --- mskanji.c.old 2007-09-19 09:02:56.000000000 +0400 +++ mskanji.c 2007-09-19 15:41:26.000000000 +0400 @@ -47,6 +47,8 @@ #include #include "mblocal.h" +extern int __mb_sch_limit; + static size_t _MSKanji_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t, mbstate_t * __restrict); static int _MSKanji_mbsinit(const mbstate_t *); @@ -66,6 +68,7 @@ __mbsinit = _MSKanji_mbsinit; _CurrentRuneLocale = rl; __mb_cur_max = 2; + __mb_sch_limit = 256; return (0); } --- none.c.old 2007-09-19 08:56:40.000000000 +0400 +++ none.c 2007-09-19 
15:51:44.000000000 +0400 @@ -58,6 +58,11 @@ static size_t _none_wcsnrtombs(char * __restrict, const wchar_t ** __restrict, size_t, size_t, mbstate_t * __restrict); +/* setup defaults */ + +int __mb_cur_max = 1; +int __mb_sch_limit = 256; /* Expected to be <= _CACHED_RUNES */ + int _none_init(_RuneLocale *rl) { @@ -69,6 +74,7 @@ __wcsnrtombs = _none_wcsnrtombs; _CurrentRuneLocale = rl; __mb_cur_max = 1; + __mb_sch_limit = 256; return(0); } @@ -176,7 +182,6 @@ /* setup defaults */ -int __mb_cur_max = 1; size_t (*__mbrtowc)(wchar_t * __restrict, const char * __restrict, size_t, mbstate_t * __restrict) = _none_mbrtowc; int (*__mbsinit)(const mbstate_t *) = _none_mbsinit; --- setrunelocale.c.old 2007-09-19 09:03:59.000000000 +0400 +++ setrunelocale.c 2007-09-19 15:41:26.000000000 +0400 @@ -45,6 +45,8 @@ #include "mblocal.h" #include "setlocale.h" +extern int __mb_sch_limit; + extern _RuneLocale *_Read_RuneMagi(FILE *); static int __setrunelocale(const char *); @@ -59,6 +61,7 @@ static char ctype_encoding[ENCODING_LEN + 1]; static _RuneLocale *CachedRuneLocale; static int Cached__mb_cur_max; + static int Cached__mb_sch_limit; static size_t (*Cached__mbrtowc)(wchar_t * __restrict, const char * __restrict, size_t, mbstate_t * __restrict); static size_t (*Cached__wcrtomb)(char * __restrict, wchar_t, @@ -85,6 +88,7 @@ strcmp(encoding, ctype_encoding) == 0) { _CurrentRuneLocale = CachedRuneLocale; __mb_cur_max = Cached__mb_cur_max; + __mb_sch_limit = Cached__mb_sch_limit; __mbrtowc = Cached__mbrtowc; __mbsinit = Cached__mbsinit; __mbsnrtowcs = Cached__mbsnrtowcs; @@ -147,6 +151,7 @@ } CachedRuneLocale = _CurrentRuneLocale; Cached__mb_cur_max = __mb_cur_max; + Cached__mb_sch_limit = __mb_sch_limit; Cached__mbrtowc = __mbrtowc; Cached__mbsinit = __mbsinit; Cached__mbsnrtowcs = __mbsnrtowcs; --- utf8.c.old 2007-09-19 08:18:40.000000000 +0400 +++ utf8.c 2007-09-19 15:55:35.000000000 +0400 @@ -35,6 +35,8 @@ #include #include "mblocal.h" +extern int __mb_sch_limit; + static size_t 
_UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t, mbstate_t * __restrict); static int _UTF8_mbsinit(const mbstate_t *); @@ -63,6 +65,7 @@ __wcsnrtombs = _UTF8_wcsnrtombs; _CurrentRuneLocale = rl; __mb_cur_max = 6; + __mb_sch_limit = 128; return (0); } --- wctype.h.old 2007-09-16 21:59:37.000000000 +0400 +++ wctype.h 2007-09-19 15:31:40.000000000 +0400 @@ -89,30 +89,30 @@ #endif __END_DECLS -#define iswalnum(wc) __istype((wc), _CTYPE_A|_CTYPE_D) -#define iswalpha(wc) __istype((wc), _CTYPE_A) -#define iswblank(wc) __istype((wc), _CTYPE_B) -#define iswcntrl(wc) __istype((wc), _CTYPE_C) -#define iswctype(wc, charclass) __istype((wc), (charclass)) +#define iswalnum(wc) __istype((wc), _CTYPE_A|_CTYPE_D|_CTYPE_WID) +#define iswalpha(wc) __istype((wc), _CTYPE_A|_CTYPE_WID) +#define iswblank(wc) __istype((wc), _CTYPE_B|_CTYPE_WID) +#define iswcntrl(wc) __istype((wc), _CTYPE_C|_CTYPE_WID) +#define iswctype(wc, charclass) __istype((wc), (charclass)|_CTYPE_WID) #define iswdigit(wc) __isctype((wc), _CTYPE_D) -#define iswgraph(wc) __istype((wc), _CTYPE_G) -#define iswlower(wc) __istype((wc), _CTYPE_L) -#define iswprint(wc) __istype((wc), _CTYPE_R) -#define iswpunct(wc) __istype((wc), _CTYPE_P) -#define iswspace(wc) __istype((wc), _CTYPE_S) -#define iswupper(wc) __istype((wc), _CTYPE_U) +#define iswgraph(wc) __istype((wc), _CTYPE_G|_CTYPE_WID) +#define iswlower(wc) __istype((wc), _CTYPE_L|_CTYPE_WID) +#define iswprint(wc) __istype((wc), _CTYPE_R|_CTYPE_WID) +#define iswpunct(wc) __istype((wc), _CTYPE_P|_CTYPE_WID) +#define iswspace(wc) __istype((wc), _CTYPE_S|_CTYPE_WID) +#define iswupper(wc) __istype((wc), _CTYPE_U|_CTYPE_WID) #define iswxdigit(wc) __isctype((wc), _CTYPE_X) #define towlower(wc) __tolower(wc) #define towupper(wc) __toupper(wc) #if __BSD_VISIBLE -#define iswascii(wc) (((wc) & ~0x7F) == 0) -#define iswhexnumber(wc) __istype((wc), _CTYPE_X) -#define iswideogram(wc) __istype((wc), _CTYPE_I) -#define iswnumber(wc) __istype((wc), _CTYPE_D) -#define 
iswphonogram(wc) __istype((wc), _CTYPE_Q) -#define iswrune(wc) __istype((wc), 0xFFFFFF00L) -#define iswspecial(wc) __istype((wc), _CTYPE_T) +#define iswascii(wc) ((wc) < 0x80) +#define iswhexnumber(wc) __istype((wc), _CTYPE_X|_CTYPE_WID) +#define iswideogram(wc) __istype((wc), _CTYPE_I|_CTYPE_WID) +#define iswnumber(wc) __istype((wc), _CTYPE_D|_CTYPE_WID) +#define iswphonogram(wc) __istype((wc), _CTYPE_Q|_CTYPE_WID) +#define iswrune(wc) __istype((wc), 0xFFFFFF00L) /* already have _CTYPE_WID */ +#define iswspecial(wc) __istype((wc), _CTYPE_T|_CTYPE_WID) #endif #endif /* _WCTYPE_H_ */ --0OAP2g/MAC+5xKAE--