#!/usr/bin/perl -w
# unicode-decomp.pl - script to generate database for java.text.Collator
# Copyright (C) 1998, 1999, 2002 Free Software Foundation, Inc.
#
# This file is part of libjava.
#
# This software is copyrighted work licensed under the terms of the
# Libjava License.  Please consult the file "LIBJAVA_LICENSE" for
# details.

# Code for reading UnicodeData.txt and generating the code for
# gnu.java.lang.CharData.  For now, the relevant Unicode definition files
# are found in libjava/gnu/gcj/convert/.
#
# Usage: ./unicode-decomp.pl [-n] <UnicodeData.txt> <decomp.h>
#   where <UnicodeData.txt> is obtained from www.unicode.org (named
#   UnicodeData-3.0.0.txt for Unicode version 3.0.0), and <decomp.h>
#   is the final location of include/java-chardecomp.h.
#   As of JDK 1.4, use Unicode version 3.0.0 for best results.
#
# If this exits with nonzero status, then you must investigate the
# cause of the problem.
# Diagnostics and other information to stderr.
# With -n, the files are not created, but all processing still occurs.

# These map characters (numeric code points) to their decompositions.
# Each value is a string built with pack "n*", i.e. one 16-bit
# big-endian unit per decomposition element.
my %canonical_decomposition = ();
my %full_decomposition = ();

# Handle `-n' and open output files.
if ($ARGV[0] && $ARGV[0] eq '-n')
  {
    # Drop the flag and redirect the output file to /dev/null so that
    # all processing still happens but nothing is written.
    shift @ARGV;
    $ARGV[1] = '/dev/null';
  }
die "Usage: $0 <UnicodeData.txt> <decomp.h>" unless @ARGV == 2;

# Three-argument open: the file name is never interpreted as a mode.
open (UNICODE, '<', $ARGV[0])
  || die "Can't open Unicode attribute file: $!\n";

# Process the Unicode file.
$| = 1;
my $count = 0;
print STDERR "Parsing attributes file";
while (<UNICODE>)
  {
    # One progress dot per 1000 input lines.
    print STDERR "." unless $count++ % 1000;
    chomp;
    s/\r//g;

    # Field 0 is the code point (hex); field 5 is the decomposition.
    my ($ch, undef, undef, undef, undef, $decomp) = split ';';

    # Skip records without a decomposition field (e.g. blank lines)
    # instead of warning about an undefined value under -w.
    next unless defined $decomp && $decomp ne '';

    $ch = hex ($ch);

    my $is_full = 0;
    my @decomp = ();
    foreach (split (' ', $decomp))
      {
	# A leading <tag> token is not a code point; its presence
	# selects the full (tagged) decomposition table.
	if (/^\<.*\>$/)
	  {
	    $is_full = 1;
	    next;
	  }
	push (@decomp, hex ($_));
      }

    my $s = pack "n*", @decomp;
    if ($is_full)
      {
	$full_decomposition{$ch} = $s;
      }
    else
      {
	$canonical_decomposition{$ch} = $s;
      }
  }

# Now generate decomposition tables.
open (DECOMP, '>', $ARGV[1]) or die "Can't open output file: $!\n";
print STDERR "\nGenerating tables\n";

# NOTE(review): SOURCE is truncated at this point, in the middle of the
# statement
#     print DECOMP <
# (presumably the start of a here-document that writes the output
# header -- TODO confirm).  The table-emission code that follows must be
# restored from the upstream script; it is deliberately not guessed here.