convert_ucd.pl - pub/scm/libs/libucd/libucd - Git at Google

 #!/usr/bin/perl
 #
 # Perl script to convert the Unicode Character Database source files
 # into data for libucd.
 #

 #
 # Internally this file uses a hash with the UCS value as key, and
 # as data another hash from property name to value.
 #

 %ucs_props = ();

 #
 # Convert a whitespace-separated list of hexadecimal code points
 # (e.g. "0041 030A") into the corresponding Perl character string.
 # An empty or undefined argument yields the empty string.
 #
 sub parse_unicode_string($) {
     my ($str) = @_;

     return '' unless ( defined($str) );

     # split ' ' skips leading whitespace, so a padded field does not
     # produce an empty token that hex() would turn into U+0000.
     return pack("U*", map { hex($_) } split(' ', $str));
 }

 #
 # Read a UCD file in which each line holds a code point (or a range
 # "XXXX..YYYY") followed by semicolon-separated property values, and
 # record the values in %ucs_props.
 #
 #   $filename - path of the file to read
 #   $proplist - array ref naming the property held in each field after
 #               the code point; undef entries are skipped.  The first
 #               character of each entry is a type code: 'b' boolean
 #               (Y/N), 'p' code point, 's' string of code points,
 #               '!' special handling (Name, Decomposition); any other
 #               code stores the field text unchanged.  A plain string
 #               is accepted as a one-element list.
 #   $default  - array ref of values substituted for empty fields; '='
 #               stands for the code point itself.
 #
 # Returns 1 on success, 0 if the file could not be opened.
 #
 sub read_separated_file($$$) {
     my ($filename, $proplist, $default) = @_;
     my ($fh, $line, @fields, $c0, $c1, $c);
     my $was_first = 0;

     # Tolerate a single property passed as a plain string
     $proplist = [$proplist] unless ( ref($proplist) );
     $default  = []          unless ( ref($default) );

     # A file made of bare code points (CompositionExclusions.txt) asserts
     # its single boolean property for every code point it lists.
     my $lone_boolean = ( scalar(@$proplist) == 1 &&
                          defined($$proplist[0]) &&
                          substr($$proplist[0], 0, 1) eq 'b' );

     print STDERR "Reading $filename\n";
     unless ( open($fh, '<', $filename) ) {
         print STDERR "$0: cannot read $filename: $!\n";
         return 0;
     }

     while ( defined($line = <$fh>) ) {
         chomp $line;
         $line =~ s/\s*(|\#.*)$//;        # Strip comment and trailing space
         @fields = split(/\s*\;\s*/, $line);

         next unless ( defined($fields[0]) &&
                       $fields[0] =~ /^([0-9a-f]+)(|\.\.([0-9a-f]+))$/i );
         my ($lo, $rest, $hi) = ($1, $2, $3);

         if ( $was_first ) {
             # Closing line of a <..., First>/<..., Last> pair: $c0 is
             # still the code point of the First line.
             $c1 = hex $lo;
             $was_first = 0;
         } else {
             $c0 = hex $lo;
             $c1 = ($rest eq '') ? $c0 : hex $hi;
         }

         push(@fields, 'Y') if ( $lone_boolean && scalar(@fields) == 1 );

         for ( $c = $c0 ; $c <= $c1 ; $c++ ) {
             $ucs_props{$c} = {} unless ( exists($ucs_props{$c}) );

             for ( my $f = 1 ; $f < scalar(@fields) ; $f++ ) {
                 my $prop = ${$proplist}[$f-1];
                 next unless ( defined($prop) );

                 my $type = substr($prop, 0, 1);
                 $prop = substr($prop, 1);

                 my $def = ${$default}[$f-1];
                 $def = sprintf("%04X", $c) if ( defined($def) && $def eq '=' );
                 my $val = $fields[$f];
                 $val = $def if ( $val eq '' );

                 if ( $type eq 'b' ) {
                     # Boolean (Y/N)
                     $val = (defined($val) && $val eq 'N') ? 0 : 1;
                 } elsif ( $type eq 'p' ) {
                     # Code point
                     $val = hex $val;
                 } elsif ( $type eq 's' ) {
                     # String of code points
                     $val = parse_unicode_string($val);
                 } elsif ( $type eq '!' ) {
                     # Special case
                     if ( $prop eq 'Name' ) {
                         # In UnicodeData.txt, ranges aren't encoded the same
                         # way as elsewhere, instead the first and last code
                         # point are entered, with <..., First> or
                         # <..., Last> as the name
                         if ( defined($val) ) {
                             $was_first = 1 if ( $val =~ /^\<.*, First\>$/ );
                             $val = undef if ( $val =~ /^\<.*\>$/ );
                         }
                     } elsif ( $prop eq 'Decomposition' ) {
                         $prop = 'Decomposition_Mapping';
                         if ( defined($val) &&
                              $val =~ /^(\<([a-z0-9]+)\>\s*|)([0-9a-f\s]+)$/i ) {
                             my ($dct, $seq) = ($2 || 'canonical', $3);
                             $val = parse_unicode_string($seq);

                             ${$ucs_props{$c}}{'Decomposition_Type'} = $dct;
                         }
                     } else {
                         die "$0: Unknown special: $prop\n";
                     }
                 }

                 ${$ucs_props{$c}}{$prop} = $val;
             }
         }
     }
     close($fh);

     return 1;
 }

 #
 # Read a UCD file of boolean properties, where each line has the form
 # "code point or range ; property name".  Every listed code point gets
 # the named property set to 1 in %ucs_props.
 #
 # Returns 1 on success, 0 if the file could not be opened.
 #
 sub read_boolean_file($) {
     my ($filename) = @_;
     my ($fh, $line, @fields, $c0, $c1, $c);

     print STDERR "Reading $filename\n";
     unless ( open($fh, '<', $filename) ) {
         print STDERR "$0: cannot read $filename: $!\n";
         return 0;
     }

     while ( defined($line = <$fh>) ) {
         chomp $line;
         $line =~ s/\s*(|\#.*)$//;        # Strip comment and trailing space
         @fields = split(/\s*\;\s*/, $line);

         # Need both a code point field and a property name
         next unless ( defined($fields[1]) && $fields[1] ne '' );
         next unless ( $fields[0] =~ /^([0-9A-F]+)(|\.\.([0-9A-F]+))$/ );

         $c0 = hex $1;
         $c1 = ($2 eq '') ? $c0 : hex $3;

         for ( $c = $c0 ; $c <= $c1 ; $c++ ) {
             $ucs_props{$c} = {} unless ( exists($ucs_props{$c}) );

             ${$ucs_props{$c}}{$fields[1]} = 1;
         }
     }
     close($fh);

     return 1;
 }

 #
 # Format a Jamo short name as a C initializer for a char[4]: each of the
 # first four characters becomes a quoted character constant, and unused
 # positions are filled with 0.
 #
 sub make_jamo_string($) {
     my ($s) = @_;

     $s = '' unless ( defined($s) );

     my @cells = map {
         ($_ < length($s)) ? "'" . substr($s, $_, 1) . "'" : '0'
     } 0 .. 3;

     return '{' . join(',', @cells) . '}';
 }

 # This produces tables used to generate the systematic Hangul syllables:
 # gen/jamo.c receives one char[4] table each for the L, V and T jamo,
 # filled from the Jamo_Short_Name property in %ucs_props.
 sub make_jamo_tables() {
     # [table name suffix, first code point, number of entries]
     my @tables = ( ['l', 0x1100, 19],
                    ['v', 0x1161, 21],
                    ['t', 0x11A7, 28] );
     my $fh;

     # None of the syllables exceed 4 bytes, so let's not waste
     # pointer space that might have to be relocated...

     print STDERR "Writing gen/jamo.c\n";
     open($fh, '>', 'gen/jamo.c')
         or die "$0: cannot create gen/jamo.c: $!\n";
     print $fh "#include \"libucd_int.h\"\n\n";

     foreach my $tab ( @tables ) {
         my ($suffix, $base, $count) = @$tab;

         printf $fh "const char _libucd_hangul_jamo_%s[%d][4] = {\n",
             $suffix, $count;
         for ( my $i = 0 ; $i < $count ; $i++ ) {
             printf $fh "\t%s,\n",
                 make_jamo_string(${$ucs_props{$base+$i}}{'Jamo_Short_Name'});
         }
         print $fh "};\n";
     }

     # Buffered write errors only show up when the handle is closed
     close($fh) or die "$0: error writing gen/jamo.c: $!\n";
 }

 # This produces a names list sorted by UCS, and produces a reverse map.
 %name_to_ucs   = ();

 #
 # Write gen/nameslist.tab ("code point, name", in ascending code point
 # order) for every code point in %ucs_props that has a Name, and fill
 # %name_to_ucs with the name -> code point mapping.  When a name occurs
 # more than once, the lowest code point wins and a warning is printed.
 #
 sub make_names_list() {
     my $fh;

     print STDERR "Writing gen/nameslist.tab\n";
     open($fh, '>', 'gen/nameslist.tab')
         or die "$0: cannot write gen/nameslist.tab: $!\n";

     foreach my $k ( sort {$a <=> $b} (keys(%ucs_props)) ) {
         print STDERR "Not a number: \"$k\"\n" if ( $k ne ($k+0) );

         my $n = ${$ucs_props{$k}}{'Name'};
         next unless ( defined($n) );

         if ( defined($name_to_ucs{$n}) ) {
             printf STDERR "WARNING: Name \"%s\" duplicated from U+%04X to U+%04X\n",
                 $n, $k, $name_to_ucs{$n};
         } else {
             $name_to_ucs{$n} = $k;
             printf $fh "%05x %s\n", $k, $n;
         }
     }

     # Buffered write errors only show up when the handle is closed
     close($fh) or die "$0: error writing gen/nameslist.tab: $!\n";
 }

 #
 # Produce a list of names for automatic hash table generation.
 # This includes the Hangul syllables, but not systematically
 # named CJK.
 #
 # Every syllable name is built as "HANGUL SYLLABLE " followed by the
 # Jamo_Short_Name of its L, V and T components.  The name alone is
 # written to $fh, and "code point, name" to $fht, starting at U+AC00.
 #
 sub write_hangul_names($$)
 {
     my ($fh, $fht) = @_;
     my $SBase = 0xAC00;
     my $LBase = 0x1100;
     my $VBase = 0x1161;
     my $TBase = 0x11A7;
     my $LCount = 19;
     my $VCount = 21;
     my $TCount = 28;
     my ($l, $v, $t);
     my $c = $SBase;

     for ( $l = 0 ; $l < $LCount ; $l++ ) {
         for ( $v = 0 ; $v < $VCount ; $v++ ) {
             for ( $t = 0 ; $t < $TCount ; $t++ ) {
                 my $name = sprintf("HANGUL SYLLABLE %s%s%s",
                                    ${$ucs_props{$LBase+$l}}{'Jamo_Short_Name'},
                                    ${$ucs_props{$VBase+$v}}{'Jamo_Short_Name'},
                                    ${$ucs_props{$TBase+$t}}{'Jamo_Short_Name'});
                 printf $fh "%s\n", $name;
                 printf $fht "%05x %s\n", $c, $name;

                 $c++;
             }
         }
     }
 }

 #
 # Write every name in %name_to_ucs to gen/nametoucs.keys and the matching
 # "code point, name" lines to gen/nametoucs.tab, then append the
 # generated Hangul syllable names to both files.
 #
 sub make_name_keyfile()
 {
     my ($fh, $fht);

     print STDERR "Writing gen/nametoucs.keys and gen/nametoucs.tab\n";
     open($fh, '>', 'gen/nametoucs.keys')
         or die "$0: cannot write gen/nametoucs.keys: $!\n";
     open($fht, '>', 'gen/nametoucs.tab')
         or die "$0: cannot write gen/nametoucs.tab: $!\n";

     # Sorted so that the generated files are identical from run to run
     foreach my $k ( sort(keys(%name_to_ucs)) ) {
         printf $fh "%s\n", $k;
         printf $fht "%05x %s\n", $name_to_ucs{$k}, $k;
     }

     write_hangul_names($fh, $fht);

     # Buffered write errors only show up when the handle is closed
     close($fh) or die "$0: error writing gen/nametoucs.keys: $!\n";
     close($fht) or die "$0: error writing gen/nametoucs.tab: $!\n";
 }

 #
 # Make a keyfile for all non-systematically named codepoints:
 # gen/ucstoname.keys gets one line per value of %name_to_ucs, written
 # as eight hexadecimal digits.
 #
 sub make_named_ucs_keyfile()
 {
     my $fh;

     print STDERR "Writing gen/ucstoname.keys\n";
     open($fh, '>', 'gen/ucstoname.keys')
         or die "$0: cannot write gen/ucstoname.keys: $!\n";

     # Sorted so that the generated file is identical from run to run
     foreach my $k ( sort {$a <=> $b} (values(%name_to_ucs)) ) {
         printf $fh "%08x\n", $k;
     }

     # Buffered write errors only show up when the handle is closed
     close($fh) or die "$0: error writing gen/ucstoname.keys: $!\n";
 }

 #
 # Produce a list of character properties, sans names; this is
 # a test in order to figure out how much we could save from a
 # range-oriented table for everything except names.
 #
 # gen/propdump.txt gets one line per code point from 0 to 0x10FFFF:
 # the code point followed by "property:value;" pairs sorted by
 # property name.
 #
 sub dump_prop_list()
 {
     my ($fh, $c);

     print STDERR "Writing gen/propdump.txt\n";
     open($fh, '>', 'gen/propdump.txt')
         or die "$0: cannot write gen/propdump.txt: $!\n";
     binmode $fh, ':utf8';

     for ( $c = 0 ; $c <= 0x10ffff ; $c++ ) {
         # Work on a copy; code points without any data get an empty set
         my %h = %{$ucs_props{$c} || {}};

         # Handle these separately
         delete @h{'Name', 'Unicode_1_Name', 'ISO_Comment',
                   'Decomposition_Mapping', 'Jamo_Short_Name'};
         # Not removed: Uppercase_Mapping, Lowercase_Mapping,
         # Titlecase_Mapping, Special_Case_Condition

         # Store these as offsets.
         foreach my $k ( 'Simple_Uppercase_Mapping',
                         'Simple_Lowercase_Mapping',
                         'Simple_Titlecase_Mapping' ) {
             if ( defined($h{$k}) ) {
                 $h{$k} -= $c;   # Convert to offset
             } else {
                 $h{$k} = 0;     # Default is zero offset
             }
         }

         printf $fh "%05X ", $c;
         foreach my $p ( sort(keys(%h)) ) {
             print $fh $p,':',$h{$p},';';
         }
         print $fh "\n";
     }

     # Buffered write errors only show up when the handle is closed
     close($fh) or die "$0: error writing gen/propdump.txt: $!\n";
 }

 #
 # Produce the properties array
 #
 %prop_array_position = ();

 # Render the low 24 bits of the argument as a C initializer of three
 # bytes, least significant byte first.
 sub emit_int24($) {
     my ($value) = @_;
     my @bytes = map { ($value >> (8 * $_)) & 0xff } 0 .. 2;

     return sprintf("{0x%02x, 0x%02x, 0x%02x}", @bytes);
 }

 #
 # Write gen/proparray.c, holding one C initializer for each run of
 # consecutive code points whose properties are identical, and
 # gen/proparrayindex, which lists the first code point of each run with
 # its entry number (counting from 1).  The entry number of every code
 # point from 0 to 0x10FFFF is also stored in %prop_array_position.
 #
 sub make_properties_array()
 {
     my ($fh, $fhi, $c, $prev, $head, $body, $cnt, $cp);

     # List of boolean properties that translate 1:1 into flags
     my @boolean_props = ('Composition_Exclusion', 'Alphabetic', 'Default_Ignorable_Code_Point',
                          'Lowercase', 'Grapheme_Base', 'Grapheme_Extend', 'ID_Start', 'ID_Continue',
                          'Math', 'Uppercase', 'XID_Start', 'XID_Continue', 'Hex_Digit',
                          'Bidi_Control', 'Dash', 'Deprecated', 'Diacritic', 'Extender',
                          'Grapheme_Link', 'Ideographic', 'IDS_Binary_Operator',
                          'IDS_Trinary_Operator', 'Join_Control', 'Logical_Order_Exception',
                          'Noncharacter_Code_Point', 'Pattern_Syntax', 'Pattern_White_Space',
                          'Quotation_Mark', 'Radical', 'Soft_Dotted', 'STerm',
                          'Terminal_Punctuation', 'Unified_Ideograph', 'Variation_Selector',
                          'White_Space', 'Bidi_Mirrored');

     print STDERR "Writing gen/proparray.c and gen/proparrayindex\n";
     open($fh, '>', 'gen/proparray.c')
         or die "$0: cannot write gen/proparray.c: $!\n";
     open($fhi, '>', 'gen/proparrayindex')
         or die "$0: cannot write gen/proparrayindex: $!\n";
     binmode $fh, ':utf8';

     undef $prev;
     $cnt = 0;

     for ( $c = 0 ; $c <= 0x10ffff ; $c++ ) {
         $cp = $ucs_props{$c} || {};

         # The entry is built in two parts: $head carries the code point,
         # $body the properties.  Only $body is compared between code
         # points, so the width of the code point cannot affect the test.

         # Code point UCS value
         $head = sprintf("\t{\n\t\t0x%05x,\n", $c);

         # General category
         my $gc = $$cp{'General_Category'} || 'Cn';
         $body = "\t\tUC_GC_$gc,\n";

         # Script
         my $sc = $$cp{'Script'} || 'Common';
         $body .= "\t\tUC_SC_$sc,\n";

         # Numeric value, emitted as "numerator, denominator", or as
         # "mantissa, 128+exponent" for values above 255
         my $nv = $$cp{'Numeric_Value'};
         $nv = 0 unless ( defined($nv) && $nv ne '' );
         if ( $nv =~ m{^(-?[0-9]+)/([1-9][0-9]*)$} ) {
             # Already written as a fraction, as in UnicodeData.txt
             $body .= "\t\t$1, $2,\n";
         } elsif ( $nv > 255 ) {
             my $exp = int(log($nv)/log(10))-1;
             my $num = int($nv/(10**$exp));
             $body .= "\t\t$num, 128+$exp,\n";
         } else {
             my $num = 0;
             my $den = 1;

             if ( $nv != 0 ) {
                 # Search for the smallest denominator whose rounded
                 # numerator reproduces the value to a relative error of
                 # 1e-7; the limit only guards against an endless loop.
                 my $max_den = 1000;
                 my $found = 0;
                 for ( $den = 1 ; $den <= $max_den ; $den++ ) {
                     my $scaled = $nv * $den;
                     $num = ($scaled < 0) ? -int(0.5 - $scaled)
                                          : int($scaled + 0.5);
                     if ( abs($nv - $num/$den) <= 1e-7 * abs($nv) ) {
                         $found = 1;
                         last;
                     }
                 }
                 unless ( $found ) {
                     $den = $max_den;    # $num already belongs to it
                     printf STDERR "WARNING: no exact fraction for numeric value %s of U+%04X\n",
                         $nv, $c;
                 }
             }
             $body .= "\t\t$num, $den,\n";
         }

         # Boolean properties and block index
         foreach my $bp ( @boolean_props ) {
             if ( $$cp{$bp} ) {
                 $body .= "\t\tUC_FL_\U$bp\E |\n";
             }
         }
         my $block = $$cp{'Block'} || 'No_Block';
         $block =~ tr/ .-/___/;
         $body .= "\t\t((uint64_t)UC_BLK_$block << 48),\n";

         # Simple case mappings, stored as offsets from the code point
         my $sum = ($$cp{'Simple_Uppercase_Mapping'} || $c) - $c;
         $body .= "\t\t".emit_int24($sum).",\n";
         my $slm = ($$cp{'Simple_Lowercase_Mapping'} || $c) - $c;
         $body .= "\t\t".emit_int24($slm).",\n";
         my $stm = ($$cp{'Simple_Titlecase_Mapping'} || $c) - $c;
         $body .= "\t\t".emit_int24($stm).",\n";

         # Age (assume 31.7 as maximum; Unicode has traditionally not had
         # many minor versions per major version.)
         my $age = $$cp{'Age'} || '0.0';
         my (@sage) = split(/\./, $age);
         $body .= sprintf("\t\t(%d << 3) + %d, /* %s */\n",
                          $sage[0], $sage[1] || 0, $age);

         # Canonical Combining Class
         my $ccc = $$cp{'Canonical_Combining_Class'} || 'NR';
         if ( $ccc =~ /^[0-9]+$/ ) {
             $body .= "\t\t$ccc,\n"; # Numeric CCC
         } else {
             $body .= "\t\tUC_CCC_$ccc,\n";
         }

         # Sentence Break
         my $sb = $$cp{'Sentence_Break'} || 'Other';
         $body .= "\t\tUC_SB_$sb,\n";

         # Grapheme Cluster Break
         my $gcb = $$cp{'Grapheme_Cluster_Break'} || 'Other';
         $body .= "\t\tUC_GCB_$gcb,\n";

         # Word Break
         my $wb = $$cp{'Word_Break'} || 'Other';
         $body .= "\t\tUC_WB_$wb,\n";

         # Arabic Joining Type.  The default must be parenthesized as a
         # whole: || binds tighter than ?:, and without the parentheses
         # every explicit Joining_Type would come out as 'T'.
         my $ajt = $$cp{'Joining_Type'} ||
             (($gc eq 'Mn' || $gc eq 'Me' || $gc eq 'Cf') ? 'T' : 'U');
         $body .= "\t\tUC_JT_$ajt,\n";

         # Arabic Joining Group
         my $ajg = $$cp{'Joining_Group'} || 'No_Joining_Group';
         $ajg =~ tr/ /_/;
         $ajg =~ s/([A-Z])([A-Z]+)/$1\L$2\E/g;
         $body .= "\t\tUC_JG_$ajg,\n";

         # East Asian Width
         my $ea = $$cp{'East_Asian_Width'} || 'N';
         $body .= "\t\tUC_EA_$ea,\n";

         # Hangul Syllable Type
         my $hst = $$cp{'Hangul_Syllable_Type'} || 'NA';
         $body .= "\t\tUC_HST_$hst,\n";

         # Line Break
         my $lb = $$cp{'Line_Break'} || 'XX';
         $body .= "\t\tUC_LB_$lb,\n";

         # Numeric Type
         my $nt = $$cp{'Numeric_Type'} || 'None';
         $body .= "\t\tUC_NT_$nt,\n";

         # Bidi Class
         my $bc = $$cp{'Bidi_Class'} || 'L';
         $body .= "\t\tUC_BC_$bc,\n";

         # Additional properties...
         $body .= "\t},\n";

         # Start a new entry whenever the properties change
         if ( !defined($prev) || $prev ne $body ) {
             print $fh $head, $body;
             $cnt++;
             $prev = $body;
             printf $fhi "0x%05x %d\n", $c, $cnt;
         }
         $prop_array_position{$c} = $cnt;
     }
     print $fh "\t/* Total: $cnt ranges */\n";

     # Buffered write errors only show up when the handle is closed
     close($fh) or die "$0: error writing gen/proparray.c: $!\n";
     close($fhi) or die "$0: error writing gen/proparrayindex: $!\n";
 }

 #
 # Import files
 #
 # Each property list entry is a type code followed by the property name;
 # the second list gives the value to use for an empty field.
 #
 read_separated_file('ucd/UnicodeData.txt',
 		    ['!Name', 'eGeneral_Category', 'nCanonical_Combining_Class',
 		     'eBidi_Class', '!Decomposition', undef, undef,
 		     'eNumeric_Value', 'bBidi_Mirrored',
 		     'mUnicode_1_Name', 'mISO_Comment', 'pSimple_Uppercase_Mapping',
 		     'pSimple_Lowercase_Mapping', 'pSimple_Titlecase_Mapping'],
 		    ['<reserved>', 'Cn', 0, undef, undef, undef, undef, undef,
 		     'N', undef, undef, '=', '=', '=']);

 read_separated_file('ucd/extracted/DerivedNumericType.txt', ['eNumeric_Type'], []);
 read_separated_file('ucd/extracted/DerivedNumericValues.txt', ['eNumeric_Value'], []);
 read_separated_file('ucd/extracted/DerivedBidiClass.txt', ['eBidi_Class'], ['L']);
 read_separated_file('ucd/ArabicShaping.txt', [undef, 'eJoining_Type', 'eJoining_Group'], []);
 read_separated_file('ucd/BidiMirroring.txt', ['pBidi_Mirroring_Glyph'], []);
 read_separated_file('ucd/Blocks.txt', ['cBlock'], []);
 # The property list must be an array reference like all the others
 read_separated_file('ucd/CompositionExclusions.txt', ['bComposition_Exclusion'], []);
 # read_separated_file('ucd/CaseFolding.txt', ['eCase_Folding_Type', 'sCase_Folding'], []);
 read_separated_file('ucd/DerivedAge.txt', ['cAge'], []);
 read_separated_file('ucd/EastAsianWidth.txt', ['eEast_Asian_Width'], []);
 read_separated_file('ucd/HangulSyllableType.txt', ['eHangul_Syllable_Type'], []);
 read_separated_file('ucd/LineBreak.txt', ['eLine_Break'], []);
 read_separated_file('ucd/Scripts.txt', ['cScript'], ['Common']);
 read_separated_file('ucd/SpecialCasing.txt', ['sUppercase_Mapping', 'sLowercase_Mapping',
 					  'sTitlecase_Mapping', 'mSpecial_Case_Condition'], []);
 read_separated_file('ucd/Jamo.txt', ['mJamo_Short_Name'], []);
 # NOTE(review): the Unicode distribution spells this directory
 # "auxiliary".  Confirm that the local ucd/ tree really uses the spelling
 # below; a file that cannot be opened does not stop the script.
 read_separated_file('ucd/auxilliary/GraphemeBreakProperty.txt', ['eGrapheme_Cluster_Break'], []);
 read_separated_file('ucd/auxilliary/SentenceBreakProperty.txt', ['eSentence_Break'], []);
 read_separated_file('ucd/auxilliary/WordBreakProperty.txt', ['eWord_Break'], []);
 read_boolean_file('ucd/DerivedCoreProperties.txt');
 read_boolean_file('ucd/PropList.txt');

 #
 # Produce output
 #
 make_jamo_tables();
 make_names_list();
 make_name_keyfile();
 make_named_ucs_keyfile();
 make_properties_array();
 # dump_prop_list();
	#!/usr/bin/perl
	#
	# Perl script to convert the Unicode Character Database source files
	# into data for libucd.
	#

	#
	# Internally this file uses a hash with the UCS value as key, and
	# as data another hash from property name to value.
	#

	%ucs_props = ();

	#
	# Convert a whitespace-separated list of hexadecimal code points
	# (e.g. "0041 030A") into the corresponding Perl character string.
	# An empty or undefined argument yields the empty string.
	#
	sub parse_unicode_string($) {
	    my ($str) = @_;

	    return '' unless ( defined($str) );

	    # split ' ' skips leading whitespace, so a padded field does not
	    # produce an empty token that hex() would turn into U+0000.
	    return pack("U*", map { hex($_) } split(' ', $str));
	}

	#
	# Read a UCD file in which each line holds a code point (or a range
	# "XXXX..YYYY") followed by semicolon-separated property values, and
	# record the values in %ucs_props.
	#
	#   $filename - path of the file to read
	#   $proplist - array ref naming the property held in each field after
	#               the code point; undef entries are skipped.  The first
	#               character of each entry is a type code: 'b' boolean
	#               (Y/N), 'p' code point, 's' string of code points,
	#               '!' special handling (Name, Decomposition); any other
	#               code stores the field text unchanged.  A plain string
	#               is accepted as a one-element list.
	#   $default  - array ref of values substituted for empty fields; '='
	#               stands for the code point itself.
	#
	# Returns 1 on success, 0 if the file could not be opened.
	#
	sub read_separated_file($$$) {
	    my ($filename, $proplist, $default) = @_;
	    my ($fh, $line, @fields, $c0, $c1, $c);
	    my $was_first = 0;

	    # Tolerate a single property passed as a plain string
	    $proplist = [$proplist] unless ( ref($proplist) );
	    $default  = []          unless ( ref($default) );

	    # A file made of bare code points (CompositionExclusions.txt) asserts
	    # its single boolean property for every code point it lists.
	    my $lone_boolean = ( scalar(@$proplist) == 1 &&
	                         defined($$proplist[0]) &&
	                         substr($$proplist[0], 0, 1) eq 'b' );

	    print STDERR "Reading $filename\n";
	    unless ( open($fh, '<', $filename) ) {
	        print STDERR "$0: cannot read $filename: $!\n";
	        return 0;
	    }

	    while ( defined($line = <$fh>) ) {
	        chomp $line;
	        $line =~ s/\s*(|\#.*)$//;        # Strip comment and trailing space
	        @fields = split(/\s*\;\s*/, $line);

	        next unless ( defined($fields[0]) &&
	                      $fields[0] =~ /^([0-9a-f]+)(|\.\.([0-9a-f]+))$/i );
	        my ($lo, $rest, $hi) = ($1, $2, $3);

	        if ( $was_first ) {
	            # Closing line of a <..., First>/<..., Last> pair: $c0 is
	            # still the code point of the First line.
	            $c1 = hex $lo;
	            $was_first = 0;
	        } else {
	            $c0 = hex $lo;
	            $c1 = ($rest eq '') ? $c0 : hex $hi;
	        }

	        push(@fields, 'Y') if ( $lone_boolean && scalar(@fields) == 1 );

	        for ( $c = $c0 ; $c <= $c1 ; $c++ ) {
	            $ucs_props{$c} = {} unless ( exists($ucs_props{$c}) );

	            for ( my $f = 1 ; $f < scalar(@fields) ; $f++ ) {
	                my $prop = ${$proplist}[$f-1];
	                next unless ( defined($prop) );

	                my $type = substr($prop, 0, 1);
	                $prop = substr($prop, 1);

	                my $def = ${$default}[$f-1];
	                $def = sprintf("%04X", $c) if ( defined($def) && $def eq '=' );
	                my $val = $fields[$f];
	                $val = $def if ( $val eq '' );

	                if ( $type eq 'b' ) {
	                    # Boolean (Y/N)
	                    $val = (defined($val) && $val eq 'N') ? 0 : 1;
	                } elsif ( $type eq 'p' ) {
	                    # Code point
	                    $val = hex $val;
	                } elsif ( $type eq 's' ) {
	                    # String of code points
	                    $val = parse_unicode_string($val);
	                } elsif ( $type eq '!' ) {
	                    # Special case
	                    if ( $prop eq 'Name' ) {
	                        # In UnicodeData.txt, ranges aren't encoded the same
	                        # way as elsewhere, instead the first and last code
	                        # point are entered, with <..., First> or
	                        # <..., Last> as the name
	                        if ( defined($val) ) {
	                            $was_first = 1 if ( $val =~ /^\<.*, First\>$/ );
	                            $val = undef if ( $val =~ /^\<.*\>$/ );
	                        }
	                    } elsif ( $prop eq 'Decomposition' ) {
	                        $prop = 'Decomposition_Mapping';
	                        if ( defined($val) &&
	                             $val =~ /^(\<([a-z0-9]+)\>\s*|)([0-9a-f\s]+)$/i ) {
	                            my ($dct, $seq) = ($2 || 'canonical', $3);
	                            $val = parse_unicode_string($seq);

	                            ${$ucs_props{$c}}{'Decomposition_Type'} = $dct;
	                        }
	                    } else {
	                        die "$0: Unknown special: $prop\n";
	                    }
	                }

	                ${$ucs_props{$c}}{$prop} = $val;
	            }
	        }
	    }
	    close($fh);

	    return 1;
	}

	#
	# Read a UCD file of boolean properties, where each line has the form
	# "code point or range ; property name".  Every listed code point gets
	# the named property set to 1 in %ucs_props.
	#
	# Returns 1 on success, 0 if the file could not be opened.
	#
	sub read_boolean_file($) {
	    my ($filename) = @_;
	    my ($fh, $line, @fields, $c0, $c1, $c);

	    print STDERR "Reading $filename\n";
	    unless ( open($fh, '<', $filename) ) {
	        print STDERR "$0: cannot read $filename: $!\n";
	        return 0;
	    }

	    while ( defined($line = <$fh>) ) {
	        chomp $line;
	        $line =~ s/\s*(|\#.*)$//;        # Strip comment and trailing space
	        @fields = split(/\s*\;\s*/, $line);

	        # Need both a code point field and a property name
	        next unless ( defined($fields[1]) && $fields[1] ne '' );
	        next unless ( $fields[0] =~ /^([0-9A-F]+)(|\.\.([0-9A-F]+))$/ );

	        $c0 = hex $1;
	        $c1 = ($2 eq '') ? $c0 : hex $3;

	        for ( $c = $c0 ; $c <= $c1 ; $c++ ) {
	            $ucs_props{$c} = {} unless ( exists($ucs_props{$c}) );

	            ${$ucs_props{$c}}{$fields[1]} = 1;
	        }
	    }
	    close($fh);

	    return 1;
	}

	#
	# Format a Jamo short name as a C initializer for a char[4]: each of the
	# first four characters becomes a quoted character constant, and unused
	# positions are filled with 0.
	#
	sub make_jamo_string($) {
	    my ($s) = @_;

	    $s = '' unless ( defined($s) );

	    my @cells = map {
	        ($_ < length($s)) ? "'" . substr($s, $_, 1) . "'" : '0'
	    } 0 .. 3;

	    return '{' . join(',', @cells) . '}';
	}

	# This produces tables used to generate the systematic Hangul syllables:
	# gen/jamo.c receives one char[4] table each for the L, V and T jamo,
	# filled from the Jamo_Short_Name property in %ucs_props.
	sub make_jamo_tables() {
	    # [table name suffix, first code point, number of entries]
	    my @tables = ( ['l', 0x1100, 19],
	                   ['v', 0x1161, 21],
	                   ['t', 0x11A7, 28] );
	    my $fh;

	    # None of the syllables exceed 4 bytes, so let's not waste
	    # pointer space that might have to be relocated...

	    print STDERR "Writing gen/jamo.c\n";
	    open($fh, '>', 'gen/jamo.c')
	        or die "$0: cannot create gen/jamo.c: $!\n";
	    print $fh "#include \"libucd_int.h\"\n\n";

	    foreach my $tab ( @tables ) {
	        my ($suffix, $base, $count) = @$tab;

	        printf $fh "const char _libucd_hangul_jamo_%s[%d][4] = {\n",
	            $suffix, $count;
	        for ( my $i = 0 ; $i < $count ; $i++ ) {
	            printf $fh "\t%s,\n",
	                make_jamo_string(${$ucs_props{$base+$i}}{'Jamo_Short_Name'});
	        }
	        print $fh "};\n";
	    }

	    # Buffered write errors only show up when the handle is closed
	    close($fh) or die "$0: error writing gen/jamo.c: $!\n";
	}

	# This produces a names list sorted by UCS, and produces a reverse map.
	%name_to_ucs = ();

	#
	# Write gen/nameslist.tab ("code point, name", in ascending code point
	# order) for every code point in %ucs_props that has a Name, and fill
	# %name_to_ucs with the name -> code point mapping.  When a name occurs
	# more than once, the lowest code point wins and a warning is printed.
	#
	sub make_names_list() {
	    my $fh;

	    print STDERR "Writing gen/nameslist.tab\n";
	    open($fh, '>', 'gen/nameslist.tab')
	        or die "$0: cannot write gen/nameslist.tab: $!\n";

	    foreach my $k ( sort {$a <=> $b} (keys(%ucs_props)) ) {
	        print STDERR "Not a number: \"$k\"\n" if ( $k ne ($k+0) );

	        my $n = ${$ucs_props{$k}}{'Name'};
	        next unless ( defined($n) );

	        if ( defined($name_to_ucs{$n}) ) {
	            printf STDERR "WARNING: Name \"%s\" duplicated from U+%04X to U+%04X\n",
	                $n, $k, $name_to_ucs{$n};
	        } else {
	            $name_to_ucs{$n} = $k;
	            printf $fh "%05x %s\n", $k, $n;
	        }
	    }

	    # Buffered write errors only show up when the handle is closed
	    close($fh) or die "$0: error writing gen/nameslist.tab: $!\n";
	}

	#
	# Produce a list of names for automatic hash table generation.
	# This includes the Hangul syllables, but not systematically
	# named CJK.
	#
	# Every syllable name is built as "HANGUL SYLLABLE " followed by the
	# Jamo_Short_Name of its L, V and T components.  The name alone is
	# written to $fh, and "code point, name" to $fht, starting at U+AC00.
	#
	sub write_hangul_names($$)
	{
	    my ($fh, $fht) = @_;
	    my $SBase = 0xAC00;
	    my $LBase = 0x1100;
	    my $VBase = 0x1161;
	    my $TBase = 0x11A7;
	    my $LCount = 19;
	    my $VCount = 21;
	    my $TCount = 28;
	    my ($l, $v, $t);
	    my $c = $SBase;

	    for ( $l = 0 ; $l < $LCount ; $l++ ) {
	        for ( $v = 0 ; $v < $VCount ; $v++ ) {
	            for ( $t = 0 ; $t < $TCount ; $t++ ) {
	                my $name = sprintf("HANGUL SYLLABLE %s%s%s",
	                                   ${$ucs_props{$LBase+$l}}{'Jamo_Short_Name'},
	                                   ${$ucs_props{$VBase+$v}}{'Jamo_Short_Name'},
	                                   ${$ucs_props{$TBase+$t}}{'Jamo_Short_Name'});
	                printf $fh "%s\n", $name;
	                printf $fht "%05x %s\n", $c, $name;

	                $c++;
	            }
	        }
	    }
	}

	#
	# Write every name in %name_to_ucs to gen/nametoucs.keys and the matching
	# "code point, name" lines to gen/nametoucs.tab, then append the
	# generated Hangul syllable names to both files.
	#
	sub make_name_keyfile()
	{
	    my ($fh, $fht);

	    print STDERR "Writing gen/nametoucs.keys and gen/nametoucs.tab\n";
	    open($fh, '>', 'gen/nametoucs.keys')
	        or die "$0: cannot write gen/nametoucs.keys: $!\n";
	    open($fht, '>', 'gen/nametoucs.tab')
	        or die "$0: cannot write gen/nametoucs.tab: $!\n";

	    # Sorted so that the generated files are identical from run to run
	    foreach my $k ( sort(keys(%name_to_ucs)) ) {
	        printf $fh "%s\n", $k;
	        printf $fht "%05x %s\n", $name_to_ucs{$k}, $k;
	    }

	    write_hangul_names($fh, $fht);

	    # Buffered write errors only show up when the handle is closed
	    close($fh) or die "$0: error writing gen/nametoucs.keys: $!\n";
	    close($fht) or die "$0: error writing gen/nametoucs.tab: $!\n";
	}

	#
	# Make a keyfile for all non-systematically named codepoints:
	# gen/ucstoname.keys gets one line per value of %name_to_ucs, written
	# as eight hexadecimal digits.
	#
	sub make_named_ucs_keyfile()
	{
	    my $fh;

	    print STDERR "Writing gen/ucstoname.keys\n";
	    open($fh, '>', 'gen/ucstoname.keys')
	        or die "$0: cannot write gen/ucstoname.keys: $!\n";

	    # Sorted so that the generated file is identical from run to run
	    foreach my $k ( sort {$a <=> $b} (values(%name_to_ucs)) ) {
	        printf $fh "%08x\n", $k;
	    }

	    # Buffered write errors only show up when the handle is closed
	    close($fh) or die "$0: error writing gen/ucstoname.keys: $!\n";
	}

	#
	# Produce a list of character properties, sans names; this is
	# a test in order to figure out how much we could save from a
	# range-oriented table for everything except names.
	#
	# gen/propdump.txt gets one line per code point from 0 to 0x10FFFF:
	# the code point followed by "property:value;" pairs sorted by
	# property name.
	#
	sub dump_prop_list()
	{
	    my ($fh, $c);

	    print STDERR "Writing gen/propdump.txt\n";
	    open($fh, '>', 'gen/propdump.txt')
	        or die "$0: cannot write gen/propdump.txt: $!\n";
	    binmode $fh, ':utf8';

	    for ( $c = 0 ; $c <= 0x10ffff ; $c++ ) {
	        # Work on a copy; code points without any data get an empty set
	        my %h = %{$ucs_props{$c} || {}};

	        # Handle these separately
	        delete @h{'Name', 'Unicode_1_Name', 'ISO_Comment',
	                  'Decomposition_Mapping', 'Jamo_Short_Name'};
	        # Not removed: Uppercase_Mapping, Lowercase_Mapping,
	        # Titlecase_Mapping, Special_Case_Condition

	        # Store these as offsets.
	        foreach my $k ( 'Simple_Uppercase_Mapping',
	                        'Simple_Lowercase_Mapping',
	                        'Simple_Titlecase_Mapping' ) {
	            if ( defined($h{$k}) ) {
	                $h{$k} -= $c;   # Convert to offset
	            } else {
	                $h{$k} = 0;     # Default is zero offset
	            }
	        }

	        printf $fh "%05X ", $c;
	        foreach my $p ( sort(keys(%h)) ) {
	            print $fh $p,':',$h{$p},';';
	        }
	        print $fh "\n";
	    }

	    # Buffered write errors only show up when the handle is closed
	    close($fh) or die "$0: error writing gen/propdump.txt: $!\n";
	}

	#
	# Produce the properties array
	#
	%prop_array_position = ();

	# Render the low 24 bits of the argument as a C initializer of three
	# bytes, least significant byte first.
	sub emit_int24($) {
	    my ($value) = @_;
	    my @bytes = map { ($value >> (8 * $_)) & 0xff } 0 .. 2;

	    return sprintf("{0x%02x, 0x%02x, 0x%02x}", @bytes);
	}

	#
	# Write gen/proparray.c, holding one C initializer for each run of
	# consecutive code points whose properties are identical, and
	# gen/proparrayindex, which lists the first code point of each run with
	# its entry number (counting from 1).  The entry number of every code
	# point from 0 to 0x10FFFF is also stored in %prop_array_position.
	#
	sub make_properties_array()
	{
	    my ($fh, $fhi, $c, $prev, $head, $body, $cnt, $cp);

	    # List of boolean properties that translate 1:1 into flags
	    my @boolean_props = ('Composition_Exclusion', 'Alphabetic', 'Default_Ignorable_Code_Point',
	                         'Lowercase', 'Grapheme_Base', 'Grapheme_Extend', 'ID_Start', 'ID_Continue',
	                         'Math', 'Uppercase', 'XID_Start', 'XID_Continue', 'Hex_Digit',
	                         'Bidi_Control', 'Dash', 'Deprecated', 'Diacritic', 'Extender',
	                         'Grapheme_Link', 'Ideographic', 'IDS_Binary_Operator',
	                         'IDS_Trinary_Operator', 'Join_Control', 'Logical_Order_Exception',
	                         'Noncharacter_Code_Point', 'Pattern_Syntax', 'Pattern_White_Space',
	                         'Quotation_Mark', 'Radical', 'Soft_Dotted', 'STerm',
	                         'Terminal_Punctuation', 'Unified_Ideograph', 'Variation_Selector',
	                         'White_Space', 'Bidi_Mirrored');

	    print STDERR "Writing gen/proparray.c and gen/proparrayindex\n";
	    open($fh, '>', 'gen/proparray.c')
	        or die "$0: cannot write gen/proparray.c: $!\n";
	    open($fhi, '>', 'gen/proparrayindex')
	        or die "$0: cannot write gen/proparrayindex: $!\n";
	    binmode $fh, ':utf8';

	    undef $prev;
	    $cnt = 0;

	    for ( $c = 0 ; $c <= 0x10ffff ; $c++ ) {
	        $cp = $ucs_props{$c} || {};

	        # The entry is built in two parts: $head carries the code point,
	        # $body the properties.  Only $body is compared between code
	        # points, so the width of the code point cannot affect the test.

	        # Code point UCS value
	        $head = sprintf("\t{\n\t\t0x%05x,\n", $c);

	        # General category
	        my $gc = $$cp{'General_Category'} || 'Cn';
	        $body = "\t\tUC_GC_$gc,\n";

	        # Script
	        my $sc = $$cp{'Script'} || 'Common';
	        $body .= "\t\tUC_SC_$sc,\n";

	        # Numeric value, emitted as "numerator, denominator", or as
	        # "mantissa, 128+exponent" for values above 255
	        my $nv = $$cp{'Numeric_Value'};
	        $nv = 0 unless ( defined($nv) && $nv ne '' );
	        if ( $nv =~ m{^(-?[0-9]+)/([1-9][0-9]*)$} ) {
	            # Already written as a fraction, as in UnicodeData.txt
	            $body .= "\t\t$1, $2,\n";
	        } elsif ( $nv > 255 ) {
	            my $exp = int(log($nv)/log(10))-1;
	            my $num = int($nv/(10**$exp));
	            $body .= "\t\t$num, 128+$exp,\n";
	        } else {
	            my $num = 0;
	            my $den = 1;

	            if ( $nv != 0 ) {
	                # Search for the smallest denominator whose rounded
	                # numerator reproduces the value to a relative error of
	                # 1e-7; the limit only guards against an endless loop.
	                my $max_den = 1000;
	                my $found = 0;
	                for ( $den = 1 ; $den <= $max_den ; $den++ ) {
	                    my $scaled = $nv * $den;
	                    $num = ($scaled < 0) ? -int(0.5 - $scaled)
	                                         : int($scaled + 0.5);
	                    if ( abs($nv - $num/$den) <= 1e-7 * abs($nv) ) {
	                        $found = 1;
	                        last;
	                    }
	                }
	                unless ( $found ) {
	                    $den = $max_den;    # $num already belongs to it
	                    printf STDERR "WARNING: no exact fraction for numeric value %s of U+%04X\n",
	                        $nv, $c;
	                }
	            }
	            $body .= "\t\t$num, $den,\n";
	        }

	        # Boolean properties and block index
	        foreach my $bp ( @boolean_props ) {
	            if ( $$cp{$bp} ) {
	                $body .= "\t\tUC_FL_\U$bp\E |\n";
	            }
	        }
	        my $block = $$cp{'Block'} || 'No_Block';
	        $block =~ tr/ .-/___/;
	        $body .= "\t\t((uint64_t)UC_BLK_$block << 48),\n";

	        # Simple case mappings, stored as offsets from the code point
	        my $sum = ($$cp{'Simple_Uppercase_Mapping'} || $c) - $c;
	        $body .= "\t\t".emit_int24($sum).",\n";
	        my $slm = ($$cp{'Simple_Lowercase_Mapping'} || $c) - $c;
	        $body .= "\t\t".emit_int24($slm).",\n";
	        my $stm = ($$cp{'Simple_Titlecase_Mapping'} || $c) - $c;
	        $body .= "\t\t".emit_int24($stm).",\n";

	        # Age (assume 31.7 as maximum; Unicode has traditionally not had
	        # many minor versions per major version.)
	        my $age = $$cp{'Age'} || '0.0';
	        my (@sage) = split(/\./, $age);
	        $body .= sprintf("\t\t(%d << 3) + %d, /* %s */\n",
	                         $sage[0], $sage[1] || 0, $age);

	        # Canonical Combining Class
	        my $ccc = $$cp{'Canonical_Combining_Class'} || 'NR';
	        if ( $ccc =~ /^[0-9]+$/ ) {
	            $body .= "\t\t$ccc,\n"; # Numeric CCC
	        } else {
	            $body .= "\t\tUC_CCC_$ccc,\n";
	        }

	        # Sentence Break
	        my $sb = $$cp{'Sentence_Break'} || 'Other';
	        $body .= "\t\tUC_SB_$sb,\n";

	        # Grapheme Cluster Break
	        my $gcb = $$cp{'Grapheme_Cluster_Break'} || 'Other';
	        $body .= "\t\tUC_GCB_$gcb,\n";

	        # Word Break
	        my $wb = $$cp{'Word_Break'} || 'Other';
	        $body .= "\t\tUC_WB_$wb,\n";

	        # Arabic Joining Type.  The default must be parenthesized as a
	        # whole: || binds tighter than ?:, and without the parentheses
	        # every explicit Joining_Type would come out as 'T'.
	        my $ajt = $$cp{'Joining_Type'} ||
	            (($gc eq 'Mn' || $gc eq 'Me' || $gc eq 'Cf') ? 'T' : 'U');
	        $body .= "\t\tUC_JT_$ajt,\n";

	        # Arabic Joining Group
	        my $ajg = $$cp{'Joining_Group'} || 'No_Joining_Group';
	        $ajg =~ tr/ /_/;
	        $ajg =~ s/([A-Z])([A-Z]+)/$1\L$2\E/g;
	        $body .= "\t\tUC_JG_$ajg,\n";

	        # East Asian Width
	        my $ea = $$cp{'East_Asian_Width'} || 'N';
	        $body .= "\t\tUC_EA_$ea,\n";

	        # Hangul Syllable Type
	        my $hst = $$cp{'Hangul_Syllable_Type'} || 'NA';
	        $body .= "\t\tUC_HST_$hst,\n";

	        # Line Break
	        my $lb = $$cp{'Line_Break'} || 'XX';
	        $body .= "\t\tUC_LB_$lb,\n";

	        # Numeric Type
	        my $nt = $$cp{'Numeric_Type'} || 'None';
	        $body .= "\t\tUC_NT_$nt,\n";

	        # Bidi Class
	        my $bc = $$cp{'Bidi_Class'} || 'L';
	        $body .= "\t\tUC_BC_$bc,\n";

	        # Additional properties...
	        $body .= "\t},\n";

	        # Start a new entry whenever the properties change
	        if ( !defined($prev) || $prev ne $body ) {
	            print $fh $head, $body;
	            $cnt++;
	            $prev = $body;
	            printf $fhi "0x%05x %d\n", $c, $cnt;
	        }
	        $prop_array_position{$c} = $cnt;
	    }
	    print $fh "\t/* Total: $cnt ranges */\n";

	    # Buffered write errors only show up when the handle is closed
	    close($fh) or die "$0: error writing gen/proparray.c: $!\n";
	    close($fhi) or die "$0: error writing gen/proparrayindex: $!\n";
	}

	#
	# Import files
	#
	# Each property list entry is a type code followed by the property name;
	# the second list gives the value to use for an empty field.
	#
	read_separated_file('ucd/UnicodeData.txt',
	                    ['!Name', 'eGeneral_Category', 'nCanonical_Combining_Class',
	                     'eBidi_Class', '!Decomposition', undef, undef,
	                     'eNumeric_Value', 'bBidi_Mirrored',
	                     'mUnicode_1_Name', 'mISO_Comment', 'pSimple_Uppercase_Mapping',
	                     'pSimple_Lowercase_Mapping', 'pSimple_Titlecase_Mapping'],
	                    ['<reserved>', 'Cn', 0, undef, undef, undef, undef, undef,
	                     'N', undef, undef, '=', '=', '=']);

	read_separated_file('ucd/extracted/DerivedNumericType.txt', ['eNumeric_Type'], []);
	read_separated_file('ucd/extracted/DerivedNumericValues.txt', ['eNumeric_Value'], []);
	read_separated_file('ucd/extracted/DerivedBidiClass.txt', ['eBidi_Class'], ['L']);
	read_separated_file('ucd/ArabicShaping.txt', [undef, 'eJoining_Type', 'eJoining_Group'], []);
	read_separated_file('ucd/BidiMirroring.txt', ['pBidi_Mirroring_Glyph'], []);
	read_separated_file('ucd/Blocks.txt', ['cBlock'], []);
	# The property list must be an array reference like all the others
	read_separated_file('ucd/CompositionExclusions.txt', ['bComposition_Exclusion'], []);
	# read_separated_file('ucd/CaseFolding.txt', ['eCase_Folding_Type', 'sCase_Folding'], []);
	read_separated_file('ucd/DerivedAge.txt', ['cAge'], []);
	read_separated_file('ucd/EastAsianWidth.txt', ['eEast_Asian_Width'], []);
	read_separated_file('ucd/HangulSyllableType.txt', ['eHangul_Syllable_Type'], []);
	read_separated_file('ucd/LineBreak.txt', ['eLine_Break'], []);
	read_separated_file('ucd/Scripts.txt', ['cScript'], ['Common']);
	read_separated_file('ucd/SpecialCasing.txt', ['sUppercase_Mapping', 'sLowercase_Mapping',
	                                              'sTitlecase_Mapping', 'mSpecial_Case_Condition'], []);
	read_separated_file('ucd/Jamo.txt', ['mJamo_Short_Name'], []);
	# NOTE(review): the Unicode distribution spells this directory
	# "auxiliary".  Confirm that the local ucd/ tree really uses the spelling
	# below; a file that cannot be opened does not stop the script.
	read_separated_file('ucd/auxilliary/GraphemeBreakProperty.txt', ['eGrapheme_Cluster_Break'], []);
	read_separated_file('ucd/auxilliary/SentenceBreakProperty.txt', ['eSentence_Break'], []);
	read_separated_file('ucd/auxilliary/WordBreakProperty.txt', ['eWord_Break'], []);
	read_boolean_file('ucd/DerivedCoreProperties.txt');
	read_boolean_file('ucd/PropList.txt');

	#
	# Produce output
	#
	make_jamo_tables();
	make_names_list();
	make_name_keyfile();
	make_named_ucs_keyfile();
	make_properties_array();
	# dump_prop_list();