From owner-svn-src-user@FreeBSD.ORG Sat Oct 3 12:51:28 2009 Return-Path: Delivered-To: svn-src-user@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id B9DB4106568B; Sat, 3 Oct 2009 12:51:28 +0000 (UTC) (envelope-from edwin@FreeBSD.org) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:4f8:fff6::2c]) by mx1.freebsd.org (Postfix) with ESMTP id A7A9D8FC13; Sat, 3 Oct 2009 12:51:28 +0000 (UTC) Received: from svn.freebsd.org (localhost [127.0.0.1]) by svn.freebsd.org (8.14.3/8.14.3) with ESMTP id n93CpSuL041136; Sat, 3 Oct 2009 12:51:28 GMT (envelope-from edwin@svn.freebsd.org) Received: (from edwin@localhost) by svn.freebsd.org (8.14.3/8.14.3/Submit) id n93CpSwk041131; Sat, 3 Oct 2009 12:51:28 GMT (envelope-from edwin@svn.freebsd.org) Message-Id: <200910031251.n93CpSwk041131@svn.freebsd.org> From: Edwin Groothuis Date: Sat, 3 Oct 2009 12:51:28 +0000 (UTC) To: src-committers@freebsd.org, svn-src-user@freebsd.org X-SVN-Group: user MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: Subject: svn commit: r197731 - in user/edwin/locale: . share usr.bin usr.bin/unicode2utf8 X-BeenThere: svn-src-user@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: "SVN commit messages for the experimental " user" src tree" List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Sat, 03 Oct 2009 12:51:28 -0000 Author: edwin Date: Sat Oct 3 12:51:28 2009 New Revision: 197731 URL: http://svn.freebsd.org/changeset/base/197731 Log: Add C version of unicode2utf8. Perl version: 45 seconds for 92 conversions. Initial C version: 25 seconds for 92 conversions. Current C version: 12 seconds for 92 conversions. Added: user/edwin/locale/usr.bin/ user/edwin/locale/usr.bin/unicode2utf8/ user/edwin/locale/usr.bin/unicode2utf8/Makefile user/edwin/locale/usr.bin/unicode2utf8/unicode2utf8.c Modified: user/edwin/locale/README.locale user/edwin/locale/share/Makefile.def.inc Modified: user/edwin/locale/README.locale ============================================================================== --- user/edwin/locale/README.locale Sat Oct 3 12:22:12 2009 (r197730) +++ user/edwin/locale/README.locale Sat Oct 3 12:51:28 2009 (r197731) @@ -94,15 +94,13 @@ Finished: share/numericdef, share/timedef. - Regression check. - Conversion of the Unicode definitions to the UTF-8 character-set. + It is residing in usr.bin/unicode2utf8 and requires the file + posix/UTF-8.cm from the CLDR distribution. Pending: - Checking of the data with the CLDR (Common Locale Data Repository) for completeness of the current data. - Conversion of Makefiles for share/mklocale. -- Conversion of the Unicode definitions to the UTF-8 character-set - in a C program or AWK script to make it self-hosting. This is - right now a Perl script so it can't be part of the base OS build - yet. This tool for now lives in src/tools/tools/locale/. - Import of the file UTF-8.cm (from the CLDR project) and the file UnicodeData.txt (from the Unicode project) into the base operating system. These files for now live in src/tools/tools/locale/ @@ -145,7 +143,6 @@ Local configuration: - Add to /etc/make.conf (make sure they match your directory layout) CLDRDIR= /home/edwin/unicode/cldr/1.7.1 UNIDATADIR= /home/edwin/unicode/UNIDATA/5.1.0 - TOOLSDIR= /home/edwin/svn/edwin/locale/cldr/tools/ LOCALE_DESTDIR= /home/edwin/locale/new LOCALE_SHAREOWN=edwin LOCALE_SHAREGRP=edwin Modified: user/edwin/locale/share/Makefile.def.inc ============================================================================== --- user/edwin/locale/share/Makefile.def.inc Sat Oct 3 12:22:12 2009 (r197730) +++ user/edwin/locale/share/Makefile.def.inc Sat Oct 3 12:51:28 2009 (r197731) @@ -301,9 +301,8 @@ _TRANSLATIONAFTER_${cm}+= | awk '{ gsub # Normal makes don't need to convert from .unicode to .src . if defined(FULL) ${ccln}.UTF-8.src: ${ccln}.unicode - ${TOOLSDIR}/unicode2src.pl \ + ../../usr.bin/unicode2utf8/unicode2utf8 \ --cldr=${CLDRDIR} \ - --unidata=${UNIDATADIR} \ --input=${.ALLSRC} \ --output=${.TARGET} Added: user/edwin/locale/usr.bin/unicode2utf8/Makefile ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ user/edwin/locale/usr.bin/unicode2utf8/Makefile Sat Oct 3 12:51:28 2009 (r197731) @@ -0,0 +1,15 @@ + +# $NetBSD: Makefile,v 1.6 2009/04/20 16:05:30 drochner Exp $ + +PROG= unicode2utf8 +SRCS= unicode2utf8.c +NO_MAN= yes +WARNS?= 6 + +test: + ./unicode2utf8 \ + --cldr=/home/edwin/unicode/cldr/1.7.1/ \ + --input=nl_NL.unicode \ + --output=nl_NL.UTF-8.src + +.include Added: user/edwin/locale/usr.bin/unicode2utf8/unicode2utf8.c ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ user/edwin/locale/usr.bin/unicode2utf8/unicode2utf8.c Sat Oct 3 12:51:28 2009 (r197731) @@ -0,0 +1,217 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAXBUF 512 + +struct utf8map { + char *uniname; + char *utf8char; + int utf8len; + struct utf8map *next; +}; + +struct utf8map *utf8map_head[256]; + +void usage(void); +struct utf8map *get_utf8map(char *dir); +struct utf8map *find_utf8map(char *unidata); +void translate(char *file_in, char *file_out); + +int +main(int argc, char **argv) { + char *cldr = NULL, *file_in = NULL, *file_out = NULL; + char ch; + + /* options descriptor */ + static struct option longopts[] = { + { "cldr", required_argument, NULL, 1 }, + { "input", required_argument, NULL, 3 }, + { "output", required_argument, NULL, 4 }, + { NULL, 0, NULL, 0 } + }; + + while ((ch = getopt_long_only(argc, argv, "", longopts, NULL)) != -1) { + switch (ch) { + case 1: + cldr = optarg; + break; + case 3: + file_in = optarg; + break; + case 4: + file_out = optarg; + break; + break; + default: + usage(); + } + } + argc -= optind; + argv += optind; + + get_utf8map(cldr); + translate(file_in, file_out); +} + +void +translate(char *file_in, char *file_out) { + FILE *fin, *fout; + char line[MAXBUF]; + char *p, *q1, *q2; + struct utf8map *map; + + if ((fin = fopen(file_in, "r")) == NULL) + errx(EX_DATAERR, "Cannot open %s for reading.", file_in); + if ((fout = fopen(file_out, "w")) == NULL) + errx(EX_DATAERR, "Cannot open %s for writing.", file_out); + + while (!feof(fin)) { + if (fgets(line, sizeof(line), fin) != NULL) { + if (line[0] == '#') { + fprintf(fout, "%s", line); + continue; + } + + p = line; + while (*p != '\0') { + if (*p != '<') { + fputc(*p, fout); + p++; + continue; + } + q1 = strchr(p + 1, '>'); + q2 = strchr(p + 1, '<'); + if (q2 != NULL && q2 < q1) + errx(EX_DATAERR, + "Unexpected < in line %s after %s", + line, p); + *q1 = '\0'; + if ((map = find_utf8map(p + 1)) ==NULL) + errx(EX_DATAERR, + "Cannot find translation for '%s'", + p + 1); + + *q1 = '>'; + p = q1 + 1; + fwrite(map->utf8char, map->utf8len, 1, fout); + } + + } + } + + fclose(fin); + fclose(fout); +} + +struct utf8map * +find_utf8map(char *uniname) { + struct utf8map *p; + int hashindex = uniname[strlen(uniname) - 1]; + + p = utf8map_head[hashindex]; + while (p != NULL) { + if (strcmp(p->uniname, uniname) == 0) + return p; + // printf("'%s' - '%s'\n", p->uniname, uniname); + p = p->next; + } + + return NULL; +} + +struct utf8map * +get_utf8map(char *dir) { + FILE *fin; + char filename[MAXPATHLEN]; + char uniname[MAXBUF], utf8char[MAXBUF]; + char *p; + int len, i; + struct utf8map *new; + int hashindex; + + sprintf(filename, "%s/posix/UTF-8.cm", dir); + + if ((fin = fopen(filename, "r")) == NULL) + errx(EX_DATAERR, "Cannot open UTF-8 in %s/posix", filename); + + while (!feof(fin)) { + if (fgets(uniname, sizeof(uniname), fin) != NULL) + if (strncmp(uniname, "CHARMAP", 7) == 0) + break; + } + if (feof(fin)) + errx(EX_DATAERR, "Didn't find initial CHARMAP magic cookie.\n"); + + while (!feof(fin)) { + if (fscanf(fin, "%s %s", uniname, utf8char) == 2) { + /* ^END CHARMAP$ */ + if (strcmp(uniname, "END") == 0 + && strcmp(utf8char, "CHARMAP") == 0) + break; + + /* Get rid of the _'s in the name */ + while ((p = strchr(uniname, '_')) != NULL) + *p = ' '; + if ((p = strchr(uniname, '>')) == NULL) + errx(EX_DATAERR, "No trailing '>' for %s", + uniname); + hashindex = p[-1]; + *p = '\0'; + if (uniname[0] != '<') + errx(EX_DATAERR, "No leading '<' for %s", + uniname); + + /* Translate hex strings into ascii-strings */ + len = strlen(utf8char); + if (len % 4 != 0) + errx(EX_DATAERR, "Wrong length: '%s'", + utf8char); + len /= 4; + for (i = 0; i < len; i++) { + /* + * Not setting will produce wrong results for + * the unicode string NULL + */ + errno = 0; + + /* "\xAA" -> "AA" -> chr(hextodec("AA")) */ + utf8char[i] = strtol(utf8char + 4 * i + 2, NULL, + 16); + if (utf8char[i] == 0 && errno != 0) + errx(errno, + "'%s' isn't a hex digit (%d)", + utf8char + 4 * i + 2, errno); + utf8char[len] = 0; + } + + // printf("-%s-%s-\n", uniname, utf8char); + new = (struct utf8map *)malloc(sizeof(struct utf8map)); + new->next = utf8map_head[hashindex]; + new->uniname = strdup(uniname + 1); + new->utf8char = strdup(utf8char); + new->utf8len = len; + utf8map_head[hashindex] = new; + } + } + + if (feof(fin)) + errx(EX_DATAERR, "Didn't find final CHARMAP magic cookie.\n"); + + fclose(fin); + + return NULL; +} + +void +usage(void) { + + printf("Usage: unicode2utf8 --cldr=. --input=. --output=.\n"); + exit(EX_USAGE); +} +