Date: Wed, 2 Sep 2009 09:51:34 +0000 (UTC) From: Edwin Groothuis <edwin@FreeBSD.org> To: src-committers@freebsd.org, svn-src-user@freebsd.org Subject: svn commit: r196755 - user/edwin/locale/cldr/tools Message-ID: <200909020951.n829pYTN088103@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
# Author: edwin
# Date: Wed Sep 2 09:51:34 2009
# New Revision: 196755
# URL: http://svn.freebsd.org/changeset/base/196755
#
# Log:
#   Tool to convert the Unicode syntax into UTF-8
#
# Added:
#   user/edwin/locale/cldr/tools/unicode2src.pl   (contents, props changed)
#
# Added: user/edwin/locale/cldr/tools/unicode2src.pl
# ==============================================================================
# --- /dev/null	00:00:00 1970	(empty, because file is newly added)
# +++ user/edwin/locale/cldr/tools/unicode2src.pl	Wed Sep 2 09:51:34 2009	(r196755)
# @@ -0,0 +1,144 @@

#!/usr/bin/perl -wC

#
# $FreeBSD$
#

#
# unicode2src.pl - replace every <CHARACTER NAME> token in a text file with
# the raw bytes that a CLDR POSIX charmap (posix/UTF-8.cm) assigns to that
# name.
#
# Usage:
#   unicode2src.pl --cldr=<cldrdir> --unidata=<unidatadir> --etc=<etcdir> \
#                  --input=<inputfile> --output=<outputfile>
#
# --etc and --type are accepted but nothing in this script uses them.
#

use strict;
use warnings;
use XML::Parser;
use Tie::IxHash;
use Data::Dumper;
use Getopt::Long;
use Digest::SHA qw(sha1_hex);

# Lookup tables shared by the subs below.
my %ucd         = ();    # {code2name}{CODE} = NAME, {name2code}{NAME} = CODE
my %utf8map     = ();    # character name (with spaces) -> hex digits of its bytes
my %utf8aliases = ();    # character name -> previous name with the same bytes

# Run the tool only when executed as a script, so the subs can also be
# loaded with require/do without triggering option parsing.
main() unless caller;

############################

# main()
#
# Parses the command line, loads both lookup tables and converts the input
# file.  Prints the usage text and exits with status 1 when option parsing
# fails or one of the options the script actually needs is missing.
sub main {
    my $cldrdir;
    my $unidatadir;
    my $etcdir;
    my $type;
    my $input;
    my $output;

    my $parsed = GetOptions(
        "cldr=s"    => \$cldrdir,
        "unidata=s" => \$unidatadir,
        "etc=s"     => \$etcdir,
        "type=s"    => \$type,
        "input=s"   => \$input,
        "output=s"  => \$output,
    );

    # Only these four are ever read below; --etc and --type stay optional.
    my @missing = grep { !defined } ($cldrdir, $unidatadir, $input, $output);
    if (!$parsed || @missing) {
        print "Usage: $0 --cldr=<cldrdir> --unidata=<unidatadir> --etc=<etcdir> --input=<inputfile> --output=<outputfile>\n";
        exit(1);
    }

    get_unidata($unidatadir);
    get_utf8map("$cldrdir/posix/UTF-8.cm");
    convert($input, $output);
    return;
}

# get_unidata($directory)
#
# Reads "$directory/UnicodeData.txt" (semicolon-separated fields) and fills
# %ucd with the first two fields of every line, indexed in both directions.
# No sub in this file reads %ucd back.
# Dies when the file cannot be opened.
sub get_unidata {
    my $directory = shift;
    my $path = "$directory/UnicodeData.txt";

    open(my $fh, '<', $path)
        or die("Cannot open $path: $!");

    while (my $line = <$fh>) {
        chomp($line);
        my ($code, $name) = split(/;/, $line);

        # Blank or truncated lines carry no usable code/name pair.
        next if (!defined $code || !defined $name);

        $ucd{code2name}{$code} = $name;    # Unicode name
        $ucd{name2code}{$name} = $code;    # Unicode code
    }
    close($fh);
    return;
}

# get_utf8map($file)
#
# Parses the section between "CHARMAP" and "END CHARMAP" of a POSIX charmap
# file.  For each "<NAME> \xHH\xHH..." entry it stores the name, with
# underscores turned into spaces, in %utf8map together with the hex digits
# of the byte sequence.  When an entry has the same bytes as the entry
# directly before it, %utf8aliases records the previous entry's name.
# Dies when the file cannot be opened.
sub get_utf8map {
    my $file = shift;

    open(my $fh, '<', $file)
        or die("Cannot open $file: $!");

    my $prev_k = undef;
    my $prev_v = "";
    my $incharmap = 0;
    while (my $line = <$fh>) {
        chomp($line);
        $line =~ s/\r//;
        next if ($line =~ /^\#/);
        next if ($line eq "");

        if ($line eq "CHARMAP") {
            $incharmap = 1;
            next;
        }

        next if (!$incharmap);
        last if ($line eq "END CHARMAP");

        # Skip lines that are not "<NAME> value"; otherwise $1/$2 would
        # still hold the captures of the previous successful match.
        next if ($line !~ /^<([^\s]+)>\s+(.*)/);
        my ($k, $v) = ($1, $2);
        $k =~ s/_/ /g;      # unicode char string
        $v =~ s/\\x//g;     # UTF-8 char code
        $utf8map{$k} = $v;

        $utf8aliases{$k} = $prev_k if ($prev_v eq $v);

        $prev_v = $v;
        $prev_k = $k;
    }
    close($fh);
    return;
}

# decode_cldr($name)
#
# Returns the raw byte string registered for $name in %utf8map.  If the name
# is only known through %utf8aliases, the aliased name is looked up in
# %utf8map instead.  Byte sequences of any length are supported.
# Dies when the name is unknown or its stored value is not a sequence of
# two-digit hex pairs.
sub decode_cldr {
    my $s = shift;

    my $v = $utf8map{$s};

    # %utf8aliases holds names, not byte values, so resolve through the map.
    if (!defined $v && defined $utf8aliases{$s}) {
        $v = $utf8map{ $utf8aliases{$s} };
    }
    die "Cannot convert $s" if (!defined $v);

    die "Cannot convert $s: malformed byte sequence \"$v\""
        if ($v !~ /\A(?:[0-9A-Fa-f]{2})+\z/);

    # "H*" packs every pair of hex digits into one byte, whatever the length.
    return pack("H*", $v);
}

# convert($IN, $OUT)
#
# Copies file $IN to file $OUT line by line.  Lines starting with "#" are
# copied unchanged; on all other lines every <NAME> token is replaced by
# decode_cldr(NAME).  Every output line is terminated with "\n".
# Dies when either file cannot be opened, when a name cannot be decoded, or
# when closing the output file reports an error.
sub convert {
    my $IN = shift;
    my $OUT = shift;

    # decode_cldr() returns raw bytes.  The ":raw" layer keeps a default
    # UTF-8 layer (which the -C switch can enable) from encoding them again.
    open(my $fin, '<:raw', $IN)
        or die("Cannot open $IN: $!");
    open(my $fout, '>:raw', $OUT)
        or die("Cannot open $OUT: $!");

    while (my $line = <$fin>) {
        chomp($line);

        if ($line !~ /^#/) {
            # A single left-to-right pass: already decoded text is never
            # rescanned, so a decoded "<" cannot start a bogus token.
            $line =~ s/<(.*?)>/decode_cldr($1)/ge;
        }
        print {$fout} $line, "\n";
    }

    close($fout) or die("Cannot write $OUT: $!");
    close($fin);
    return;
}

1;
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200909020951.n829pYTN088103>