Skip site navigation (1)Skip section navigation (2)
Date:      Sat, 3 Oct 2009 12:51:28 +0000 (UTC)
From:      Edwin Groothuis <edwin@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-user@freebsd.org
Subject:   svn commit: r197731 - in user/edwin/locale: . share usr.bin usr.bin/unicode2utf8
Message-ID:  <200910031251.n93CpSwk041131@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: edwin
Date: Sat Oct  3 12:51:28 2009
New Revision: 197731
URL: http://svn.freebsd.org/changeset/base/197731

Log:
  Add C version of unicode2utf8.
  
  Perl version: 45 seconds for 92 conversions.
  Initial C version: 25 seconds for 92 conversions.
  Current C version: 12 seconds for 92 conversions.

Added:
  user/edwin/locale/usr.bin/
  user/edwin/locale/usr.bin/unicode2utf8/
  user/edwin/locale/usr.bin/unicode2utf8/Makefile
  user/edwin/locale/usr.bin/unicode2utf8/unicode2utf8.c
Modified:
  user/edwin/locale/README.locale
  user/edwin/locale/share/Makefile.def.inc

Modified: user/edwin/locale/README.locale
==============================================================================
--- user/edwin/locale/README.locale	Sat Oct  3 12:22:12 2009	(r197730)
+++ user/edwin/locale/README.locale	Sat Oct  3 12:51:28 2009	(r197731)
@@ -94,15 +94,13 @@ Finished:
   share/numericdef, share/timedef.
 - Regression check.
 - Conversion of the Unicode definitions to the UTF-8 character-set.
+  It resides in usr.bin/unicode2utf8 and requires the file
+  posix/UTF-8.cm from the CLDR distribution.
 
 Pending:
 - Checking of the data with the CLDR (Common Locale Data Repository)
   for completeness of the current data.
 - Conversion of Makefiles for share/mklocale.
-- Conversion of the Unicode definitions to the UTF-8 character-set
-  in a C program or AWK script to make it self-hosting.  This is
-  right now a Perl script so it can't be part of the base OS build
-  yet. This tool for now lives in src/tools/tools/locale/.
 - Import of the file UTF-8.cm (from the CLDR project) and the file
   UnicodeData.txt (from the Unicode project) into the base operating
   system. These files for now live in src/tools/tools/locale/
@@ -145,7 +143,6 @@ Local configuration:
 - Add to /etc/make.conf (make sure they match your directory layout)
 	CLDRDIR=	/home/edwin/unicode/cldr/1.7.1
 	UNIDATADIR=	/home/edwin/unicode/UNIDATA/5.1.0
-	TOOLSDIR=	/home/edwin/svn/edwin/locale/cldr/tools/
 	LOCALE_DESTDIR=	/home/edwin/locale/new
 	LOCALE_SHAREOWN=edwin
 	LOCALE_SHAREGRP=edwin

Modified: user/edwin/locale/share/Makefile.def.inc
==============================================================================
--- user/edwin/locale/share/Makefile.def.inc	Sat Oct  3 12:22:12 2009	(r197730)
+++ user/edwin/locale/share/Makefile.def.inc	Sat Oct  3 12:51:28 2009	(r197731)
@@ -301,9 +301,8 @@ _TRANSLATIONAFTER_${cm}+=	 | awk '{ gsub
 # Normal makes don't need to convert from .unicode to .src
 .  if defined(FULL)
 ${ccln}.UTF-8.src: ${ccln}.unicode
-	${TOOLSDIR}/unicode2src.pl \
+	../../usr.bin/unicode2utf8/unicode2utf8 \
 	    --cldr=${CLDRDIR} \
-	    --unidata=${UNIDATADIR} \
 	    --input=${.ALLSRC} \
 	    --output=${.TARGET}
 

Added: user/edwin/locale/usr.bin/unicode2utf8/Makefile
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ user/edwin/locale/usr.bin/unicode2utf8/Makefile	Sat Oct  3 12:51:28 2009	(r197731)
@@ -0,0 +1,15 @@
+
+#	$NetBSD: Makefile,v 1.6 2009/04/20 16:05:30 drochner Exp $
+# Build the unicode2utf8 program from its single source file.
+PROG=		unicode2utf8
+SRCS=		unicode2utf8.c
+NO_MAN=		yes
+WARNS?=		6
+# Run the tool on nl_NL.unicode; note the hard-coded --cldr directory.
+test:
+	./unicode2utf8 \
+	    --cldr=/home/edwin/unicode/cldr/1.7.1/ \
+	    --input=nl_NL.unicode \
+	    --output=nl_NL.UTF-8.src
+
+.include <bsd.prog.mk>

Added: user/edwin/locale/usr.bin/unicode2utf8/unicode2utf8.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ user/edwin/locale/usr.bin/unicode2utf8/unicode2utf8.c	Sat Oct  3 12:51:28 2009	(r197731)
@@ -0,0 +1,217 @@
+#include <sys/param.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <getopt.h>
+#include <err.h>
+#include <errno.h>
+#include <sysexits.h>
+
+#define MAXBUF	512	/* size of the line and token buffers */
+/* One charmap entry; entries in the same hash bucket are chained via next. */
+struct utf8map {
+	char *uniname;		/* name as looked up, without the <> brackets */
+	char *utf8char;		/* decoded UTF-8 bytes */
+	int utf8len;		/* number of bytes in utf8char */
+	struct utf8map *next;	/* next entry in the same bucket */
+};
+/* Hash buckets, indexed by the last character of the entry's name. */
+struct utf8map *utf8map_head[256];
+
+void		 usage(void);
+struct utf8map	*get_utf8map(char *dir);
+struct utf8map	*find_utf8map(char *unidata);
+void		 translate(char *file_in, char *file_out);
+
+/* Entry point: --cldr, --input and --output must all be given. */
+int
+main(int argc, char **argv) {
+	char *cldr = NULL, *file_in = NULL, *file_out = NULL;
+	int ch;		/* getopt_long_only() returns int; char may never be -1 */
+	static struct option longopts[] = {
+		{ "cldr",	required_argument,	NULL,	1 },
+		{ "input",	required_argument,	NULL,	3 },
+		{ "output",	required_argument,	NULL,	4 },
+		{ NULL,		0,			NULL,	0 }
+	};
+
+	while ((ch = getopt_long_only(argc, argv, "", longopts, NULL)) != -1) {
+		switch (ch) {
+		case 1:
+			cldr = optarg;
+			break;
+		case 3:
+			file_in = optarg;
+			break;
+		case 4:
+			file_out = optarg;
+			break;
+		default:
+			usage();
+		}
+	}
+
+	/* Reject missing options before a NULL string reaches fopen(). */
+	if (cldr == NULL || file_in == NULL || file_out == NULL)
+		usage();
+	get_utf8map(cldr);
+	translate(file_in, file_out);
+	return 0;
+}
+
+/* Copy file_in to file_out, expanding each <NAME> token to its UTF-8 bytes. */
+/* Lines starting with '#' are copied verbatim; any error is fatal. */
+void
+translate(char *file_in, char *file_out) {
+	FILE *fin, *fout;
+	char line[MAXBUF];
+	char *p, *q1, *q2;
+	struct utf8map *map;
+
+	if ((fin = fopen(file_in, "r")) == NULL)
+		errx(EX_DATAERR, "Cannot open %s for reading.", file_in);
+	if ((fout = fopen(file_out, "w")) == NULL)
+		errx(EX_DATAERR, "Cannot open %s for writing.", file_out);
+
+	/* Loop on fgets() itself: a read error would leave feof() false. */
+	while (fgets(line, sizeof(line), fin) != NULL) {
+		if (line[0] == '#') {
+			fputs(line, fout);
+			continue;
+		}
+		for (p = line; *p != '\0'; ) {
+			if (*p != '<') {
+				fputc(*p++, fout);
+				continue;
+			}
+			/* An unterminated '<' must not reach the *q1 store. */
+			if ((q1 = strchr(p + 1, '>')) == NULL)
+				errx(EX_DATAERR, "Missing > after %s", p);
+			q2 = strchr(p + 1, '<');
+			if (q2 != NULL && q2 < q1)
+				errx(EX_DATAERR,
+				    "Unexpected < in line %s after %s",
+				    line, p);
+			*q1 = '\0';
+			if ((map = find_utf8map(p + 1)) == NULL)
+				errx(EX_DATAERR,
+				    "Cannot find translation for '%s'",
+				    p + 1);
+			*q1 = '>';
+			p = q1 + 1;
+			fwrite(map->utf8char, map->utf8len, 1, fout);
+		}
+	}
+
+	if (ferror(fin) || fclose(fin) != 0)
+		err(EX_IOERR, "Cannot read %s", file_in);
+	if (fclose(fout) != 0)
+		err(EX_IOERR, "Cannot write %s", file_out);
+}
+
+/* Return the charmap entry named uniname (without <>), or NULL if none. */
+struct utf8map *
+find_utf8map(char *uniname) {
+	struct utf8map *p;
+	size_t len = strlen(uniname);
+
+	/* An empty name ("<>" in the input) has no last character to hash. */
+	if (len == 0)
+		return NULL;
+	/* unsigned char: a byte >= 0x80 must not become a negative index. */
+	p = utf8map_head[(unsigned char)uniname[len - 1]];
+	while (p != NULL && strcmp(p->uniname, uniname) != 0)
+		p = p->next;
+	return p;
+}
+
+/*
+ * Load <dir>/posix/UTF-8.cm into the utf8map_head hash table.  Only the
+ * entries between the CHARMAP and END CHARMAP lines are read; underscores
+ * in names become spaces and "\xNN" sequences are decoded to raw bytes.
+ * Always returns NULL; malformed input terminates the program.
+ */
+struct utf8map *
+get_utf8map(char *dir) {
+	FILE *fin;
+	char filename[MAXPATHLEN];
+	char uniname[MAXBUF], utf8char[MAXBUF];
+	char *p, *end;
+	int len, i, hashindex, found = 0;
+	long byte;
+	struct utf8map *new;
+
+	len = snprintf(filename, sizeof(filename), "%s/posix/UTF-8.cm", dir);
+	if (len < 0 || (size_t)len >= sizeof(filename))
+		errx(EX_DATAERR, "Path too long: %s/posix/UTF-8.cm", dir);
+	if ((fin = fopen(filename, "r")) == NULL)
+		err(EX_DATAERR, "Cannot open %s", filename);
+
+	/* Skip everything up to and including the CHARMAP line. */
+	while (!found && fgets(uniname, sizeof(uniname), fin) != NULL)
+		found = (strncmp(uniname, "CHARMAP", 7) == 0);
+	if (!found)
+		errx(EX_DATAERR, "Didn't find initial CHARMAP magic cookie.");
+
+	/* The %511s widths keep both tokens inside their MAXBUF buffers. */
+	for (found = 0; fscanf(fin, "%511s %511s", uniname, utf8char) == 2; ) {
+		/* ^END CHARMAP$ */
+		if (strcmp(uniname, "END") == 0 &&
+		    strcmp(utf8char, "CHARMAP") == 0) {
+			found = 1;
+			break;
+		}
+
+		/* Get rid of the _'s in the name */
+		while ((p = strchr(uniname, '_')) != NULL)
+			*p = ' ';
+		/* Check '<' first so that p[-1] below stays inside uniname. */
+		if (uniname[0] != '<')
+			errx(EX_DATAERR, "No leading '<' for %s", uniname);
+		if ((p = strchr(uniname, '>')) == NULL)
+			errx(EX_DATAERR, "No trailing '>' for %s", uniname);
+		/* Bucket by the last name character, as find_utf8map() does. */
+		hashindex = (unsigned char)p[-1];
+		*p = '\0';
+
+		/* Decode "\xAA\xBB..." in place: 4 input chars per byte. */
+		len = strlen(utf8char);
+		if (len % 4 != 0)
+			errx(EX_DATAERR, "Wrong length: '%s'", utf8char);
+		len /= 4;
+		for (i = 0; i < len; i++) {
+			p = utf8char + 4 * i;
+			byte = strtol(p + 2, &end, 16);
+			/* Exactly two hex digits must follow the "\x". */
+			if (p[0] != '\\' || p[1] != 'x' || end != p + 4 ||
+			    byte < 0 || byte > 255)
+				errx(EX_DATAERR, "Bad \\xNN escape: '%s'", p);
+			utf8char[i] = (char)byte;
+		}
+		utf8char[len] = '\0';
+
+		/* memcpy, not strdup: decoded bytes may include a 0 byte. */
+		if ((new = malloc(sizeof(*new))) == NULL ||
+		    (new->uniname = strdup(uniname + 1)) == NULL ||
+		    (new->utf8char = malloc(len + 1)) == NULL)
+			err(EX_OSERR, "malloc");
+		memcpy(new->utf8char, utf8char, len + 1);
+		new->utf8len = len;
+		new->next = utf8map_head[hashindex];
+		utf8map_head[hashindex] = new;
+	}
+
+	if (!found)
+		errx(EX_DATAERR, "Didn't find final CHARMAP magic cookie.");
+
+	fclose(fin);
+	return NULL;
+}
+
+/* Print the command-line synopsis to stderr and exit with EX_USAGE. */
+void
+usage(void) {
+	fprintf(stderr, "Usage: unicode2utf8 --cldr=. --input=. --output=.\n");
+	exit(EX_USAGE);
+}
+



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200910031251.n93CpSwk041131>