Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 19 Sep 2007 09:18:30 +0400
From:      Andrey Chernov <ache@nagual.pp.ru>
To:        Taku YAMAMOTO <taku@tackymt.homeip.net>, Petr Hroudný <petr.hroudny@gmail.com>, current@FreeBSD.ORG, perky@FreeBSD.ORG, i18n@FreeBSD.ORG
Subject:   Re: Ctype patch for review
Message-ID:  <20070919051830.GA72429@nagual.pp.ru>
In-Reply-To: <20070919023625.GA70891@nagual.pp.ru>
References:  <20070916192924.GA12678@nagual.pp.ru> <ab8fc7f50709170129p6f436069iffaf697e83a34e3c@mail.gmail.com> <20070917092130.GA24424@nagual.pp.ru> <20070918020100.d43beb0b.taku@tackymt.homeip.net> <20070917171633.GA31179@nagual.pp.ru> <20070919111207.f37653fc.taku@tackymt.homeip.net> <20070919022555.GA70617@nagual.pp.ru> <20070919023625.GA70891@nagual.pp.ru>

next in thread | previous in thread | raw e-mail | index | archive | help

--OgqxwSJOaUobr8KG
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

On Wed, Sep 19, 2007 at 06:36:25AM +0400, Andrey Chernov wrote:
> only UTF-8.src not following the rules. I'll send regenerated UTF-8.src 
> a bit later.

I changed my mind again: I now use a new __mb_bit8_override flag, currently
set only for the UTF-8 encoding (other encodings that override the 8th bit
could use it too). A new patch is attached.

-- 
http://ache.pp.ru/

--OgqxwSJOaUobr8KG
Content-Type: text/x-diff; charset=us-ascii
Content-Disposition: attachment; filename="ctype.patch"

--- _ctype.h.old	2007-09-16 21:13:59.000000000 +0400
+++ _ctype.h	2007-09-19 08:46:35.000000000 +0400
@@ -63,6 +63,7 @@
 #define	_CTYPE_I	0x00080000L		/* Ideogram */
 #define	_CTYPE_T	0x00100000L		/* Special */
 #define	_CTYPE_Q	0x00200000L		/* Phonogram */
+#define	_CTYPE_WID	0x10000000L		/* wide character function */
 #define	_CTYPE_SW0	0x20000000L		/* 0 width character */
 #define	_CTYPE_SW1	0x40000000L		/* 1 width character */
 #define	_CTYPE_SW2	0x80000000L		/* 2 width character */
@@ -87,6 +88,8 @@
 #define	__inline
 #endif
 
+extern int __mb_bit8_override;
+
 /*
  * Use inline functions if we are allowed to and the compiler supports them.
  */
@@ -98,8 +101,11 @@
 static __inline int
 __maskrune(__ct_rune_t _c, unsigned long _f)
 {
-	return ((_c < 0 || _c >= _CACHED_RUNES) ? ___runetype(_c) :
+	return __mb_bit8_override && !(_f & _CTYPE_WID) && (_c >= 0x80) ? 0 :
+	       ((_c < 0 || _c >= _CACHED_RUNES) ? ___runetype(_c) :
 		_CurrentRuneLocale->__runetype[_c]) & _f;
+		/* We never set _CTYPE_WID in the locale data, */
+		/* so can skip ... & (_f & ~_CTYPE_WID).       */
 }
 
 static __inline int
@@ -111,8 +117,11 @@
 static __inline int
 __isctype(__ct_rune_t _c, unsigned long _f)
 {
-	return (_c < 0 || _c >= _CACHED_RUNES) ? 0 :
+	return  __mb_bit8_override && !(_f & _CTYPE_WID) && (_c >= 0x80) ? 0 :
+	       (_c < 0 || _c >= _CACHED_RUNES) ? 0 :
 	       !!(_DefaultRuneLocale.__runetype[_c] & _f);
+		  /* We never set _CTYPE_WID in the locale data, */
+		  /* so can skip ... & (_f & ~_CTYPE_WID).	 */
 }
 
 static __inline __ct_rune_t
@@ -129,6 +138,22 @@
 	       _CurrentRuneLocale->__maplower[_c];
 }
 
+static __inline __ct_rune_t
+__tosupper(__ct_rune_t _c)
+{
+	return  __mb_bit8_override && (_c >= 0x80) ? _c :
+	       (_c < 0 || _c >= _CACHED_RUNES) ? ___toupper(_c) :
+	       _CurrentRuneLocale->__mapupper[_c];
+}
+
+static __inline __ct_rune_t
+__toslower(__ct_rune_t _c)
+{
+	return  __mb_bit8_override && (_c >= 0x80) ? _c :
+	       (_c < 0 || _c >= _CACHED_RUNES) ? ___tolower(_c) :
+	       _CurrentRuneLocale->__maplower[_c];
+}
+
 static __inline int
 __wcwidth(__ct_rune_t _c)
 {
@@ -150,6 +175,8 @@
 int		__isctype(__ct_rune_t, unsigned long);
 __ct_rune_t	__toupper(__ct_rune_t);
 __ct_rune_t	__tolower(__ct_rune_t);
+__ct_rune_t	__tosupper(__ct_rune_t);
+__ct_rune_t	__toslower(__ct_rune_t);
 int		__wcwidth(__ct_rune_t);
 __END_DECLS
 #endif /* using inlines */
--- big5.c.old	2007-09-19 08:48:55.000000000 +0400
+++ big5.c	2007-09-19 08:56:12.000000000 +0400
@@ -49,6 +49,8 @@
 #include <wchar.h>
 #include "mblocal.h"
 
+extern int __mb_bit8_override;
+
 static size_t	_BIG5_mbrtowc(wchar_t * __restrict, const char * __restrict,
 		    size_t, mbstate_t * __restrict);
 static int	_BIG5_mbsinit(const mbstate_t *);
@@ -68,6 +70,7 @@
 	__mbsinit = _BIG5_mbsinit;
 	_CurrentRuneLocale = rl;
 	__mb_cur_max = 2;
+	__mb_bit8_override = 0;
 	return (0);
 }
 
--- ctype.h.old	2007-09-16 22:03:55.000000000 +0400
+++ ctype.h	2007-09-16 22:56:10.000000000 +0400
@@ -97,8 +97,8 @@
 #define	isspace(c)	__istype((c), _CTYPE_S)
 #define	isupper(c)	__istype((c), _CTYPE_U)
 #define	isxdigit(c)	__isctype((c), _CTYPE_X) /* ANSI -- locale independent */
-#define	tolower(c)	__tolower(c)
-#define	toupper(c)	__toupper(c)
+#define	tolower(c)	__toslower(c)
+#define	toupper(c)	__tosupper(c)
 
 #if __XSI_VISIBLE
 /*
@@ -112,8 +112,8 @@
  *
  * XXX isascii() and toascii() should similarly be undocumented.
  */
-#define	_tolower(c)	__tolower(c)
-#define	_toupper(c)	__toupper(c)
+#define	_tolower(c)	__toslower(c)
+#define	_toupper(c)	__tosupper(c)
 #define	isascii(c)	(((c) & ~0x7F) == 0)
 #define	toascii(c)	((c) & 0x7F)
 #endif
@@ -128,7 +128,7 @@
 #define	isideogram(c)	__istype((c), _CTYPE_I)
 #define	isnumber(c)	__istype((c), _CTYPE_D)
 #define	isphonogram(c)	__istype((c), _CTYPE_Q)
-#define	isrune(c)	__istype((c), 0xFFFFFF00L)
+#define	isrune(c)	__istype((c), 0xFFFFFF00L & ~_CTYPE_WID)
 #define	isspecial(c)	__istype((c), _CTYPE_T)
 #endif
 
--- euc.c.old	2007-09-19 08:50:57.000000000 +0400
+++ euc.c	2007-09-19 08:56:12.000000000 +0400
@@ -49,6 +49,8 @@
 #include <wchar.h>
 #include "mblocal.h"
 
+extern int __mb_bit8_override;
+
 static size_t	_EUC_mbrtowc(wchar_t * __restrict, const char * __restrict,
 		    size_t, mbstate_t * __restrict);
 static int	_EUC_mbsinit(const mbstate_t *);
@@ -116,6 +118,7 @@
 	__mbrtowc = _EUC_mbrtowc;
 	__wcrtomb = _EUC_wcrtomb;
 	__mbsinit = _EUC_mbsinit;
+	__mb_bit8_override = 0;
 	return (0);
 }
 
--- gb18030.c.old	2007-09-19 08:59:01.000000000 +0400
+++ gb18030.c	2007-09-19 09:00:10.000000000 +0400
@@ -39,6 +39,8 @@
 #include <wchar.h>
 #include "mblocal.h"
 
+extern int __mb_bit8_override;
+
 static size_t	_GB18030_mbrtowc(wchar_t * __restrict, const char * __restrict,
 		    size_t, mbstate_t * __restrict);
 static int	_GB18030_mbsinit(const mbstate_t *);
@@ -59,6 +61,7 @@
 	__mbsinit = _GB18030_mbsinit;
 	_CurrentRuneLocale = rl;
 	__mb_cur_max = 4;
+	__mb_bit8_override = 0;
 
 	return (0);
 }
--- gb2312.c.old	2007-09-19 09:00:35.000000000 +0400
+++ gb2312.c	2007-09-19 09:01:05.000000000 +0400
@@ -35,6 +35,8 @@
 #include <wchar.h>
 #include "mblocal.h"
 
+extern int __mb_bit8_override;
+
 static size_t	_GB2312_mbrtowc(wchar_t * __restrict, const char * __restrict,
 		    size_t, mbstate_t * __restrict);
 static int	_GB2312_mbsinit(const mbstate_t *);
@@ -55,6 +57,7 @@
 	__wcrtomb = _GB2312_wcrtomb;
 	__mbsinit = _GB2312_mbsinit;
 	__mb_cur_max = 2;
+	__mb_bit8_override = 0;
 	return (0);
 }
 
--- gbk.c.old	2007-09-19 09:01:33.000000000 +0400
+++ gbk.c	2007-09-19 09:02:03.000000000 +0400
@@ -42,6 +42,8 @@
 #include <wchar.h>
 #include "mblocal.h"
 
+extern int __mb_bit8_override;
+
 static size_t	_GBK_mbrtowc(wchar_t * __restrict, const char * __restrict,
 		    size_t, mbstate_t * __restrict);
 static int	_GBK_mbsinit(const mbstate_t *);
@@ -61,6 +63,7 @@
 	__mbsinit = _GBK_mbsinit;
 	_CurrentRuneLocale = rl;
 	__mb_cur_max = 2;
+	__mb_bit8_override = 0;
 	return (0);
 }
 
--- isctype.c.old	2007-09-16 22:31:26.000000000 +0400
+++ isctype.c	2007-09-16 22:37:54.000000000 +0400
@@ -168,7 +168,7 @@
 isrune(c)
 	int c;
 {
-	return (__istype(c, 0xFFFFFF00L));
+	return (__istype(c, 0xFFFFFF00L & ~_CTYPE_WID));
 }
 
 #undef isspace
@@ -216,7 +216,7 @@
 tolower(c)
 	int c;
 {
-        return (__tolower(c));
+	return (__toslower(c));
 }
 
 #undef toupper
@@ -224,6 +224,6 @@
 toupper(c)
 	int c;
 {
-        return (__toupper(c));
+	return (__tosupper(c));
 }
 
--- iswctype.c.old	2007-09-16 22:31:30.000000000 +0400
+++ iswctype.c	2007-09-16 22:41:39.000000000 +0400
@@ -45,7 +45,7 @@
 iswalnum(wc)
 	wint_t wc;
 {
-	return (__istype(wc, _CTYPE_A|_CTYPE_D));
+	return (__istype(wc, _CTYPE_A|_CTYPE_D|_CTYPE_WID));
 }
 
 #undef iswalpha
@@ -53,7 +53,7 @@
 iswalpha(wc)
 	wint_t wc;
 {
-	return (__istype(wc, _CTYPE_A));
+	return (__istype(wc, _CTYPE_A|_CTYPE_WID));
 }
 
 #undef iswascii
@@ -61,7 +61,7 @@
 iswascii(wc)
 	wint_t wc;
 {
-	return ((wc & ~0x7F) == 0);
+	return (wc >= 0 && wc < 0x80);	/* negative wint_t (e.g. WEOF) is not ASCII */
 }
 
 #undef iswblank
@@ -69,7 +69,7 @@
 iswblank(wc)
 	wint_t wc;
 {
-	return (__istype(wc, _CTYPE_B));
+	return (__istype(wc, _CTYPE_B|_CTYPE_WID));
 }
 
 #undef iswcntrl
@@ -77,7 +77,7 @@
 iswcntrl(wc)
 	wint_t wc;
 {
-	return (__istype(wc, _CTYPE_C));
+	return (__istype(wc, _CTYPE_C|_CTYPE_WID));
 }
 
 #undef iswdigit
@@ -85,7 +85,7 @@
 iswdigit(wc)
 	wint_t wc;
 {
-	return (__isctype(wc, _CTYPE_D));
+	return (__isctype(wc, _CTYPE_D|_CTYPE_WID));
 }
 
 #undef iswgraph
@@ -93,7 +93,7 @@
 iswgraph(wc)
 	wint_t wc;
 {
-	return (__istype(wc, _CTYPE_G));
+	return (__istype(wc, _CTYPE_G|_CTYPE_WID));
 }
 
 #undef iswhexnumber 
@@ -101,7 +101,7 @@
 iswhexnumber(wc)
 	wint_t wc;
 {
-	return (__istype(wc, _CTYPE_X));
+	return (__istype(wc, _CTYPE_X|_CTYPE_WID));
 }
 
 #undef iswideogram
@@ -109,7 +109,7 @@
 iswideogram(wc)
 	wint_t wc;
 {
-	return (__istype(wc, _CTYPE_I));
+	return (__istype(wc, _CTYPE_I|_CTYPE_WID));
 }
 
 #undef iswlower
@@ -117,7 +117,7 @@
 iswlower(wc)
 	wint_t wc;
 {
-	return (__istype(wc, _CTYPE_L));
+	return (__istype(wc, _CTYPE_L|_CTYPE_WID));
 }
 
 #undef iswnumber
@@ -125,7 +125,7 @@
 iswnumber(wc)
 	wint_t wc;
 {
-	return (__istype(wc, _CTYPE_D));
+	return (__istype(wc, _CTYPE_D|_CTYPE_WID));
 }
 
 #undef iswphonogram	
@@ -133,7 +133,7 @@
 iswphonogram(wc)
 	wint_t wc;
 {
-	return (__istype(wc, _CTYPE_Q));
+	return (__istype(wc, _CTYPE_Q|_CTYPE_WID));
 }
 
 #undef iswprint
@@ -141,7 +141,7 @@
 iswprint(wc)
 	wint_t wc;
 {
-	return (__istype(wc, _CTYPE_R));
+	return (__istype(wc, _CTYPE_R|_CTYPE_WID));
 }
 
 #undef iswpunct
@@ -149,7 +149,7 @@
 iswpunct(wc)
 	wint_t wc;
 {
-	return (__istype(wc, _CTYPE_P));
+	return (__istype(wc, _CTYPE_P|_CTYPE_WID));
 }
 
 #undef iswrune
@@ -157,7 +157,7 @@
 iswrune(wc)
 	wint_t wc;
 {
-	return (__istype(wc, 0xFFFFFF00L));
+	return (__istype(wc, 0xFFFFFF00L)); /* already have _CTYPE_WID */
 }
 
 #undef iswspace
@@ -165,7 +165,7 @@
 iswspace(wc)
 	wint_t wc;
 {
-	return (__istype(wc, _CTYPE_S));
+	return (__istype(wc, _CTYPE_S|_CTYPE_WID));
 }
 
 #undef iswspecial
@@ -173,7 +173,7 @@
 iswspecial(wc)
 	wint_t wc;
 {
-	return (__istype(wc, _CTYPE_T));
+	return (__istype(wc, _CTYPE_T|_CTYPE_WID));
 }
 
 #undef iswupper
@@ -181,7 +181,7 @@
 iswupper(wc)
 	wint_t wc;
 {
-	return (__istype(wc, _CTYPE_U));
+	return (__istype(wc, _CTYPE_U|_CTYPE_WID));
 }
 
 #undef iswxdigit
@@ -189,7 +189,7 @@
 iswxdigit(wc)
 	wint_t wc;
 {
-	return (__isctype(wc, _CTYPE_X));
+	return (__isctype(wc, _CTYPE_X|_CTYPE_WID));
 }
 
 #undef towlower
--- mskanji.c.old	2007-09-19 09:02:56.000000000 +0400
+++ mskanji.c	2007-09-19 09:03:26.000000000 +0400
@@ -47,6 +47,8 @@
 #include <wchar.h>
 #include "mblocal.h"
 
+extern int __mb_bit8_override;
+
 static size_t	_MSKanji_mbrtowc(wchar_t * __restrict, const char * __restrict,
 		    size_t, mbstate_t * __restrict);
 static int	_MSKanji_mbsinit(const mbstate_t *);
@@ -66,6 +68,7 @@
 	__mbsinit = _MSKanji_mbsinit;
 	_CurrentRuneLocale = rl;
 	__mb_cur_max = 2;
+	__mb_bit8_override = 0;
 	return (0);
 }
 
--- none.c.old	2007-09-19 08:56:40.000000000 +0400
+++ none.c	2007-09-19 08:58:23.000000000 +0400
@@ -69,6 +69,7 @@
 	__wcsnrtombs = _none_wcsnrtombs;
 	_CurrentRuneLocale = rl;
 	__mb_cur_max = 1;
+	__mb_bit8_override = 0;
 	return(0);
 }
 
@@ -177,6 +178,7 @@
 /* setup defaults */
 
 int __mb_cur_max = 1;
+int __mb_bit8_override = 0;
 size_t (*__mbrtowc)(wchar_t * __restrict, const char * __restrict, size_t,
     mbstate_t * __restrict) = _none_mbrtowc;
 int (*__mbsinit)(const mbstate_t *) = _none_mbsinit;
--- setrunelocale.c.old	2007-09-19 09:03:59.000000000 +0400
+++ setrunelocale.c	2007-09-19 09:06:45.000000000 +0400
@@ -45,6 +45,8 @@
 #include "mblocal.h"
 #include "setlocale.h"
 
+extern int __mb_bit8_override;
+
 extern _RuneLocale	*_Read_RuneMagi(FILE *);
 
 static int		__setrunelocale(const char *);
@@ -59,6 +61,7 @@
 	static char ctype_encoding[ENCODING_LEN + 1];
 	static _RuneLocale *CachedRuneLocale;
 	static int Cached__mb_cur_max;
+	static int Cached__mb_bit8_override;
 	static size_t (*Cached__mbrtowc)(wchar_t * __restrict,
 	    const char * __restrict, size_t, mbstate_t * __restrict);
 	static size_t (*Cached__wcrtomb)(char * __restrict, wchar_t,
@@ -85,6 +88,7 @@
 	    strcmp(encoding, ctype_encoding) == 0) {
 		_CurrentRuneLocale = CachedRuneLocale;
 		__mb_cur_max = Cached__mb_cur_max;
+		__mb_bit8_override = Cached__mb_bit8_override;
 		__mbrtowc = Cached__mbrtowc;
 		__mbsinit = Cached__mbsinit;
 		__mbsnrtowcs = Cached__mbsnrtowcs;
@@ -147,6 +151,7 @@
 		}
 		CachedRuneLocale = _CurrentRuneLocale;
 		Cached__mb_cur_max = __mb_cur_max;
+		Cached__mb_bit8_override = __mb_bit8_override;
 		Cached__mbrtowc = __mbrtowc;
 		Cached__mbsinit = __mbsinit;
 		Cached__mbsnrtowcs = __mbsnrtowcs;
--- utf8.c.old	2007-09-19 08:18:40.000000000 +0400
+++ utf8.c	2007-09-19 08:56:12.000000000 +0400
@@ -35,6 +35,8 @@
 #include <wchar.h>
 #include "mblocal.h"
 
+extern int __mb_bit8_override;
+
 static size_t	_UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict,
 		    size_t, mbstate_t * __restrict);
 static int	_UTF8_mbsinit(const mbstate_t *);
@@ -63,6 +65,7 @@
 	__wcsnrtombs = _UTF8_wcsnrtombs;
 	_CurrentRuneLocale = rl;
 	__mb_cur_max = 6;
+	__mb_bit8_override = 1;
 
 	return (0);
 }
--- wctype.h.old	2007-09-16 21:59:37.000000000 +0400
+++ wctype.h	2007-09-16 22:56:44.000000000 +0400
@@ -89,30 +89,30 @@
 #endif
 __END_DECLS
 
-#define	iswalnum(wc)		__istype((wc), _CTYPE_A|_CTYPE_D)
-#define	iswalpha(wc)		__istype((wc), _CTYPE_A)
-#define	iswblank(wc)		__istype((wc), _CTYPE_B)
-#define	iswcntrl(wc)		__istype((wc), _CTYPE_C)
-#define	iswctype(wc, charclass)	__istype((wc), (charclass))
-#define	iswdigit(wc)		__isctype((wc), _CTYPE_D)
-#define	iswgraph(wc)		__istype((wc), _CTYPE_G)
-#define	iswlower(wc)		__istype((wc), _CTYPE_L)
-#define	iswprint(wc)		__istype((wc), _CTYPE_R)
-#define	iswpunct(wc)		__istype((wc), _CTYPE_P)
-#define	iswspace(wc)		__istype((wc), _CTYPE_S)
-#define	iswupper(wc)		__istype((wc), _CTYPE_U)
-#define	iswxdigit(wc)		__isctype((wc), _CTYPE_X)
+#define	iswalnum(wc)		__istype((wc), _CTYPE_A|_CTYPE_D|_CTYPE_WID)
+#define	iswalpha(wc)		__istype((wc), _CTYPE_A|_CTYPE_WID)
+#define	iswblank(wc)		__istype((wc), _CTYPE_B|_CTYPE_WID)
+#define	iswcntrl(wc)		__istype((wc), _CTYPE_C|_CTYPE_WID)
+#define	iswctype(wc, charclass)	__istype((wc), (charclass)|_CTYPE_WID)
+#define	iswdigit(wc)		__isctype((wc), _CTYPE_D|_CTYPE_WID)
+#define	iswgraph(wc)		__istype((wc), _CTYPE_G|_CTYPE_WID)
+#define	iswlower(wc)		__istype((wc), _CTYPE_L|_CTYPE_WID)
+#define	iswprint(wc)		__istype((wc), _CTYPE_R|_CTYPE_WID)
+#define	iswpunct(wc)		__istype((wc), _CTYPE_P|_CTYPE_WID)
+#define	iswspace(wc)		__istype((wc), _CTYPE_S|_CTYPE_WID)
+#define	iswupper(wc)		__istype((wc), _CTYPE_U|_CTYPE_WID)
+#define	iswxdigit(wc)		__isctype((wc), _CTYPE_X|_CTYPE_WID)
 #define	towlower(wc)		__tolower(wc)
 #define	towupper(wc)		__toupper(wc)
 
 #if __BSD_VISIBLE
-#define	iswascii(wc)		(((wc) & ~0x7F) == 0)
-#define	iswhexnumber(wc)	__istype((wc), _CTYPE_X)
-#define	iswideogram(wc)		__istype((wc), _CTYPE_I)
-#define	iswnumber(wc)		__istype((wc), _CTYPE_D)
-#define	iswphonogram(wc)	__istype((wc), _CTYPE_Q)
-#define	iswrune(wc)		__istype((wc), 0xFFFFFF00L)
-#define	iswspecial(wc)		__istype((wc), _CTYPE_T)
+#define	iswascii(wc)		((unsigned int)(wc) < 0x80) /* one evaluation; negatives (WEOF) are not ASCII */
+#define	iswhexnumber(wc)	__istype((wc), _CTYPE_X|_CTYPE_WID)
+#define	iswideogram(wc)		__istype((wc), _CTYPE_I|_CTYPE_WID)
+#define	iswnumber(wc)		__istype((wc), _CTYPE_D|_CTYPE_WID)
+#define	iswphonogram(wc)	__istype((wc), _CTYPE_Q|_CTYPE_WID)
+#define	iswrune(wc)		__istype((wc), 0xFFFFFF00L) /* already have _CTYPE_WID */
+#define	iswspecial(wc)		__istype((wc), _CTYPE_T|_CTYPE_WID)
 #endif
 
 #endif		/* _WCTYPE_H_ */

--OgqxwSJOaUobr8KG--



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20070919051830.GA72429>