Skip site navigation (1)Skip section navigation (2)
Date:      Fri, 25 Feb 2011 15:53:11 +0100
From:      Jilles Tjoelker <jilles@stack.nl>
To:        freebsd-hackers@freebsd.org, freebsd-i18n@freebsd.org
Subject:   Basic UTF-8 support for sh(1)
Message-ID:  <20110225145311.GA4423@stack.nl>

next in thread | raw e-mail | index | archive | help

--ikeVEW9yuYc//A+q
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

Here is a patch that adds basic UTF-8 support to sh(1). This is enabled
if the locale is set appropriately.

Features:
* ${#var} counts codepoints. (Really, bytes with (b & 0xc0) != 0x80.)
* ?, [...] patterns match codepoints instead of bytes. They do not match
  invalid sequences. This is so that ${var#?} removes the first
  codepoint, not the first byte. However, * continues to match any
  string and an invalid sequence matches an identical invalid sequence.
  (This differs from fnmatch(3).)

Internal:
* CTL* bytes are moved to bytes that cannot occur in UTF-8 so that
  mbrtowc(3) can be used directly. The new locations do occur in
  iso-8859-* encodings.

Limitations:
* Only UTF-8 support is added, not any other multibyte encodings. I do
  not want to bloat up sh with mbrtowc(3) and similar everywhere.
* Invalid sequences may not be handled as desired. It seems aborting on
  invalid UTF-8 sequences would break things, so they are let through.
  This also avoids bloating the code up with checking everywhere.
* There is no special treatment for combining characters, so accented
  letters may match ? or ?? or even more depending on normalization
  form. This matches other code in FreeBSD and is usually good enough
  because normalization forms that use as few codepoints as possible
  tend to be used.
* IFS remains byte-based as in ksh93 (but unlike bash and zsh).
* Our version of libedit does not support UTF-8 so sh will still be
  rather unpleasant to use interactively with characters not in
  us-ascii.

Is this useful and worth the (small) bloat?

A somewhat related feature is support for \uNNNN and \UNNNNNNNN
sequences in $'...' (this will be added to POSIX, see
http://austingroupbugs.net/view.php?id=249 and I plan to add it to sh).
Ideally, these would be converted using iconv(3), but as long as it is
not unconditionally available in base, or if it is not supposed to be used,
the codepoints can be encoded in UTF-8 for UTF-8 locales, leaving other
locales with question marks.

-- 
Jilles Tjoelker

--ikeVEW9yuYc//A+q
Content-Type: text/x-diff; charset=us-ascii
Content-Disposition: attachment; filename="sh-utf8.patch"

Index: parser.h
===================================================================
--- parser.h	(revision 218371)
+++ parser.h	(working copy)
@@ -34,16 +34,16 @@
  */
 
 /* control characters in argument strings */
-#define CTLESC '\201'
-#define CTLVAR '\202'
-#define CTLENDVAR '\203'
-#define CTLBACKQ '\204'
+#define CTLESC '\300'
+#define CTLVAR '\301'
+#define CTLENDVAR '\371'
+#define CTLBACKQ '\372'
 #define CTLQUOTE 01		/* ored with CTLBACKQ code if in quotes */
-/*	CTLBACKQ | CTLQUOTE == '\205' */
+/*	CTLBACKQ | CTLQUOTE == '\373' */
-#define	CTLARI	'\206'
-#define	CTLENDARI '\207'
-#define	CTLQUOTEMARK '\210'
-#define	CTLQUOTEEND '\211' /* only for ${v+-...} */
+#define	CTLARI	'\374'
+#define	CTLENDARI '\375'
+#define	CTLQUOTEMARK '\376'
+#define	CTLQUOTEEND '\377' /* only for ${v+-...} */
 
 /* variable substitution byte (follows CTLVAR) */
 #define VSTYPE		0x0f	/* type of variable substitution */
Index: sh.1
===================================================================
--- sh.1	(revision 218467)
+++ sh.1	(working copy)
@@ -2510,4 +2510,7 @@ was originally written by
 .Sh BUGS
 The
 .Nm
-utility does not recognize multibyte characters.
+utility does not recognize multibyte characters other than UTF-8.
+The line editing library
+.Xr editline 3
+does not recognize multibyte characters.
Index: expand.c
===================================================================
--- expand.c	(revision 218371)
+++ expand.c	(working copy)
@@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$");
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
+#include <wchar.h>
 
 /*
  * Routines to expand arguments to commands.  We have to deal with
@@ -111,16 +112,16 @@ static void addfname(char *);
 static struct strlist *expsort(struct strlist *);
 static struct strlist *msort(struct strlist *, int);
 static char *cvtnum(int, char *);
-static int collate_range_cmp(int, int);
+static int collate_range_cmp(wchar_t, wchar_t);
 
 static int
-collate_range_cmp(int c1, int c2)
+collate_range_cmp(wchar_t c1, wchar_t c2)
 {
-	static char s1[2], s2[2];
+	static wchar_t s1[2], s2[2];
 
 	s1[0] = c1;
 	s2[0] = c2;
-	return (strcoll(s1, s2));
+	return (wcscoll(s1, s2));
 }
 
 /*
@@ -665,6 +666,7 @@ evalvar(char *p, int flag)
 	int special;
 	int startloc;
 	int varlen;
+	int varlenb;
 	int easy;
 	int quotes = flag & (EXP_FULL | EXP_CASE | EXP_REDIR);
 
@@ -712,8 +714,15 @@ again: /* jump here after setting a variable with
 		if (special) {
 			varvalue(var, varflags & VSQUOTE, subtype, flag);
 			if (subtype == VSLENGTH) {
-				varlen = expdest - stackblock() - startloc;
-				STADJUST(-varlen, expdest);
+				varlenb = expdest - stackblock() - startloc;
+				varlen = varlenb;
+				if (localeisutf8) {
+					val = stackblock() + startloc;
+					for (;val != expdest; val++)
+						if ((*val & 0xC0) == 0x80)
+							varlen--;
+				}
+				STADJUST(-varlenb, expdest);
 			}
 		} else {
 			char const *syntax = (varflags & VSQUOTE) ? DQSYNTAX
@@ -721,7 +730,9 @@ again: /* jump here after setting a variable with
 
 			if (subtype == VSLENGTH) {
 				for (;*val; val++)
-					varlen++;
+					if (!localeisutf8 ||
+					    (*val & 0xC0) != 0x80)
+						varlen++;
 			}
 			else {
 				if (quotes)
@@ -1367,6 +1378,23 @@ msort(struct strlist *list, int len)
 
 
 
+static wchar_t
+get_wc(const char **p)
+{
+	wchar_t c;
+	int chrlen;
+
+	chrlen = mbtowc(&c, *p, 4);
+	if (chrlen == 0)
+		return 0;
+	else if (chrlen == -1)
+		c = *(*p)++;
+	else
+		*p += chrlen;
+	return c;
+}
+
+
 /*
  * Returns true if the pattern matches the string.
  */
@@ -1376,6 +1404,7 @@ patmatch(const char *pattern, const char *string,
 {
 	const char *p, *q;
 	char c;
+	wchar_t wc, wc2;
 
 	p = pattern;
 	q = string;
@@ -1394,7 +1423,11 @@ patmatch(const char *pattern, const char *string,
 		case '?':
 			if (squoted && *q == CTLESC)
 				q++;
-			if (*q++ == '\0')
+			if (localeisutf8)
+				wc = get_wc(&q);
+			else
+				wc = *q++;
+			if (wc == '\0')
 				return 0;
 			break;
 		case '*':
@@ -1424,7 +1457,7 @@ patmatch(const char *pattern, const char *string,
 		case '[': {
 			const char *endp;
 			int invert, found;
-			char chr;
+			wchar_t chr;
 
 			endp = p;
 			if (*endp == '!' || *endp == '^')
@@ -1445,8 +1478,11 @@ patmatch(const char *pattern, const char *string,
 				p++;
 			}
 			found = 0;
-			chr = *q++;
-			if (squoted && chr == CTLESC)
+			if (squoted && *q == CTLESC)
+				q++;
+			if (localeisutf8)
+				chr = get_wc(&q);
+			else
 				chr = *q++;
 			if (chr == '\0')
 				return 0;
@@ -1456,19 +1492,27 @@ patmatch(const char *pattern, const char *string,
 					continue;
 				if (c == CTLESC)
 					c = *p++;
+				if (localeisutf8 && c & 0x80) {
+					p--;
+					wc = get_wc(&p);
+				} else
+					wc = c;
 				if (*p == '-' && p[1] != ']') {
 					p++;
 					while (*p == CTLQUOTEMARK)
 						p++;
 					if (*p == CTLESC)
 						p++;
-					if (   collate_range_cmp(chr, c) >= 0
-					    && collate_range_cmp(chr, *p) <= 0
+					if (localeisutf8)
+						wc2 = get_wc(&p);
+					else
+						wc2 = *p++;
+					if (   collate_range_cmp(chr, wc) >= 0
+					    && collate_range_cmp(chr, wc2) <= 0
 					   )
 						found = 1;
-					p++;
 				} else {
-					if (chr == c)
+					if (chr == wc)
 						found = 1;
 				}
 			} while ((c = *p++) != ']');
Index: main.c
===================================================================
--- main.c	(revision 218371)
+++ main.c	(working copy)
@@ -76,6 +76,7 @@ __FBSDID("$FreeBSD$");
 int rootpid;
 int rootshell;
 struct jmploc main_handler;
+int localeisutf8;
 
 static void read_profile(const char *);
 static char *find_dot_file(char *);
@@ -96,6 +97,7 @@ main(int argc, char *argv[])
 	char *shinit;
 
 	(void) setlocale(LC_ALL, "");
+	updatecharset();
 	state = 0;
 	if (setjmp(main_handler.loc)) {
 		switch (exception) {
Index: var.c
===================================================================
--- var.c	(revision 218371)
+++ var.c	(working copy)
@@ -47,6 +47,7 @@ __FBSDID("$FreeBSD$");
  */
 
 #include <locale.h>
+#include <langinfo.h>
 
 #include "shell.h"
 #include "output.h"
@@ -361,6 +362,7 @@ setvareq(char *s, int flags)
 			if ((vp->flags & VEXPORT) && localevar(s)) {
 				change_env(s, 1);
 				(void) setlocale(LC_ALL, "");
+				updatecharset();
 			}
 			INTON;
 			return;
@@ -379,6 +381,7 @@ setvareq(char *s, int flags)
 	if ((vp->flags & VEXPORT) && localevar(s)) {
 		change_env(s, 1);
 		(void) setlocale(LC_ALL, "");
+		updatecharset();
 	}
 	INTON;
 }
@@ -480,6 +483,7 @@ bltinsetlocale(void)
 	if (loc != NULL) {
 		setlocale(LC_ALL, loc);
 		INTON;
+		updatecharset();
 		return;
 	}
 	locdef = bltinlookup("LANG", 0);
@@ -491,6 +495,7 @@ bltinsetlocale(void)
 			setlocale(locale_categories[i], loc);
 	}
 	INTON;
+	updatecharset();
 }
 
 /*
@@ -505,13 +510,25 @@ bltinunsetlocale(void)
 	for (lp = cmdenviron ; lp ; lp = lp->next) {
 		if (localevar(lp->text)) {
 			setlocale(LC_ALL, "");
+			updatecharset();
 			return;
 		}
 	}
 	INTON;
 }
 
+/*
+ * Update the localeisutf8 flag.
+ */
+void
+updatecharset(void)
+{
+	char *charset;
 
+	charset = nl_langinfo(CODESET);
+	localeisutf8 = !strcmp(charset, "UTF-8");
+}
+
 /*
  * Generate a list of exported variables.  This routine is used to construct
  * the third argument to execve when executing a program.
@@ -656,6 +673,7 @@ exportcmd(int argc, char **argv)
 						if ((vp->flags & VEXPORT) && localevar(vp->text)) {
 							change_env(vp->text, 1);
 							(void) setlocale(LC_ALL, "");
+							updatecharset();
 						}
 						goto found;
 					}
@@ -850,6 +868,7 @@ unsetvar(const char *s)
 			if ((vp->flags & VEXPORT) && localevar(vp->text)) {
 				change_env(s, 0);
 				setlocale(LC_ALL, "");
+				updatecharset();
 			}
 			vp->flags &= ~VEXPORT;
 			vp->flags |= VUNSET;
Index: var.h
===================================================================
--- var.h	(revision 218371)
+++ var.h	(working copy)
@@ -81,6 +81,8 @@ extern struct var vhistsize;
 extern struct var vterm;
 #endif
 
+extern int localeisutf8;
+
 /*
  * The following macros access the values of the above variables.
  * They have to skip over the name.  They return the null string
@@ -112,6 +114,7 @@ char *lookupvar(const char *);
 char *bltinlookup(const char *, int);
 void bltinsetlocale(void);
 void bltinunsetlocale(void);
+void updatecharset(void);
 char **environment(void);
 int showvarscmd(int, char **);
 int exportcmd(int, char **);

--ikeVEW9yuYc//A+q--



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20110225145311.GA4423>