| #!/usr/bin/perl |
| # |
| # Perl script to convert the Unicode Character Database source files |
| # into data for libucd. |
| # |
| |
| # |
| # Internally this file uses a hash with the UCS value as key, and |
| # as data another hash from property name to value. |
| # |
| |
| %ucs_props = (); |
| |
| sub parse_unicode_string($) { |
| my ($str) = @_; |
| my @str = split(/\s+/, $str, 0); |
| my @xv = (); |
| my $x; |
| |
| foreach $x ( @str ) { |
| push(@xv, hex $x); |
| } |
| |
| return pack("U*", @xv); |
| } |
| |
| # |
| # File to read a UCD file with a list of properties (no names) |
| # |
| sub read_separated_file($$$) { |
| my ($filename, $proplist, $default) = @_; |
| my $fh; |
| my $line, @fields, $c0, $c1, $c; |
| my $was_first = 0; |
| |
| print STDERR "Reading $filename\n"; |
| open($fh, '<', $filename) or return 0; |
| while ( defined($line = <$fh>) ) { |
| chomp $line; |
| $line =~ s/\s*(|\#.*)$//; |
| @fields = split(/\s*\;\s*/, $line); |
| |
| if ( $fields[0] =~ /^([0-9a-f]+)(|..([0-9a-f]+))$/i ) { |
| if ( $was_first ) { |
| $c1 = hex $1; |
| $was_first = 0; |
| } else { |
| $c0 = hex $1; |
| $c1 = ($2 eq '') ? $c0 : hex $3; |
| } |
| |
| for ( $c = $c0 ; $c <= $c1 ; $c++ ) { |
| my $p, $f; |
| |
| $ucs_props{$c} = {} unless ( exists($ucs_props{$c}) ); |
| |
| for ( $f = 1 ; $f < scalar(@fields) ; $f++ ) { |
| my $prop = ${$proplist}[$f-1]; |
| if ( defined($prop) ) { |
| my $type = substr($prop, 0, 1); |
| my $prop = substr($prop, 1); |
| my $def = ${$default}[$f-1]; |
| $def = sprintf("%04X", $c) if ( $def eq '=' ); |
| my $val = $fields[$f]; |
| $val = $def if ( $val eq '' ); |
| |
| if ( $type eq 'b' ) { |
| # Boolean (Y/N) |
| $val = ($val eq 'N') ? 0 : 1; |
| } elsif ( $type eq 'p' ) { |
| # Code point |
| $val = hex $val; |
| } elsif ( $type eq 's' ) { |
| # String of code points |
| $val = parse_unicode_string($val); |
| } elsif ( $type eq '!' ) { |
| # Special case |
| if ( $prop eq 'Name' ) { |
| # In UnicodeData.txt, ranges aren't encoded the same way |
| # as elsewhere, instead the first and last code point are |
| # entered, with <..., first> or <..., last> as the name |
| if ( $val =~ /^\<.*, First\>$/ ) { |
| $was_first = 1; |
| } |
| $val = undef if ( $val =~ /^\<.*\>$/ ); |
| } elsif ( $prop eq 'Decomposition' ) { |
| $prop = 'Decomposition_Mapping'; |
| if ( $val =~ /^(\<([a-z0-9]+)\>\s*|)([0-9a-f\s]+)$/i ) { |
| my $dct = $2 || 'canonical'; |
| $val = parse_unicode_string($3); |
| |
| ${$ucs_props{$c}}{'Decomposition_Type'} = $dct; |
| } |
| } else { |
| die "$0: Unknown special: $prop\n"; |
| } |
| } |
| |
| ${$ucs_props{$c}}{$prop} = $val; |
| } |
| } |
| } |
| } |
| } |
| close($fh); |
| |
| return 1; |
| } |
| |
| # |
| # File to read a UCD file with boolean properties |
| # |
| sub read_boolean_file($) { |
| my ($filename) = @_; |
| my $fh; |
| my $line, @fields, $c0, $c1, $c; |
| |
| print STDERR "Reading $filename\n"; |
| open($fh, '<', $filename) or return 0; |
| while ( defined($line = <$fh>) ) { |
| chomp $line; |
| $line =~ s/\s*(|\#.*)$//; |
| @fields = split(/\s*\;\s*/, $line); |
| |
| if ( $fields[0] =~ /^([0-9A-F]+)(|..([0-9A-F]+))$/ && |
| $fields[1] ne '' ) { |
| $c0 = hex $1; |
| $c1 = ($2 eq '') ? $c0 : hex $3; |
| |
| for ( $c = $c0 ; $c <= $c1 ; $c++ ) { |
| my $p, $f; |
| |
| $ucs_props{$c} = {} unless ( exists($ucs_props{$c}) ); |
| |
| ${$ucs_props{$c}}{$fields[1]} = 1; |
| } |
| } |
| } |
| close($fh); |
| |
| return 1; |
| } |
| |
| sub make_jamo_string($) { |
| my ($s) = @_; |
| my $i, $c; |
| my $o = ''; |
| |
| $o .= "{"; |
| for ( $i = 0 ; $i < 4 ; $i++ ) { |
| $c = substr($s,$i,1); |
| $o .= length($c) ? "\'$c\'" : '0'; |
| $o .= ($i == 3) ? '}' : ','; |
| } |
| |
| return $o; |
| } |
| |
| # This produces tables used to generate the systematic Hangul syllables |
| sub make_jamo_tables() { |
| my $LBase = 0x1100; |
| my $VBase = 0x1161; |
| my $TBase = 0x11A7; |
| my $LCount = 19; |
| my $VCount = 21; |
| my $TCount = 28; |
| my $i; |
| my $fh; |
| |
| # None of the syllables exceed 4 bytes, so let's not waste |
| # pointer space that might have to be relocated... |
| |
| print STDERR "Writing gen/jamo.c\n"; |
| open($fh, '>', 'gen/jamo.c') or die "$0 cannot create gen/jamo.c"; |
| print $fh "#include \"libucd_int.h\"\n\n"; |
| |
| print $fh "const char _libucd_hangul_jamo_l[$LCount][4] = {\n"; |
| for ( $i = 0 ; $i < $LCount ; $i++ ) { |
| printf $fh "\t%s,\n", make_jamo_string(${$ucs_props{$LBase+$i}}{'Jamo_Short_Name'}); |
| } |
| print $fh "};\n"; |
| print $fh "const char _libucd_hangul_jamo_v[$VCount][4] = {\n"; |
| for ( $i = 0 ; $i < $VCount ; $i++ ) { |
| printf $fh "\t%s,\n", make_jamo_string(${$ucs_props{$VBase+$i}}{'Jamo_Short_Name'}); |
| } |
| print $fh "};\n"; |
| print $fh "const char _libucd_hangul_jamo_t[$TCount][4] = {\n"; |
| for ( $i = 0 ; $i < $TCount ; $i++ ) { |
| printf $fh "\t%s,\n", make_jamo_string(${$ucs_props{$TBase+$i}}{'Jamo_Short_Name'}); |
| } |
| print $fh "};\n"; |
| |
| close($fh); |
| } |
| |
| # This produces a names list sorted by UCS, and produces a reverse map. |
| %name_to_ucs = (); |
| |
| sub make_names_list() { |
| my $k; |
| my $pos = 0; |
| my $fh; |
| my $col; |
| |
| print STDERR "Writing gen/nameslist.tab\n"; |
| open($fh, '>', 'gen/nameslist.tab') or die; |
| |
| foreach $k ( sort {$a <=> $b} (keys(%ucs_props)) ) { |
| print STDERR "Not a number: \"$k\"\n" if ( $k ne ($k+0) ); |
| |
| my $n = ${$ucs_props{$k}}{'Name'}; |
| if ( defined($n) ) { |
| if ( defined($name_to_ucs{$n}) ) { |
| printf STDERR "WARNING: Name \"%s\" duplicated from U+%04X to U+%04X\n", |
| $n, $k, $name_to_ucs{$n}; |
| } else { |
| $name_to_ucs{$n} = $k; |
| printf $fh "%05x %s\n", $k, $n; |
| } |
| } |
| } |
| close($fh); |
| } |
| |
| # |
| # Produce a list of names for automatic hash table generation. |
| # This includes the Hangul syllables, but not systematically |
| # named CJK. |
| # |
| sub write_hangul_names($$) |
| { |
| my ($fh, $fht) = @_; |
| my $SBase = 0xAC00; |
| my $LBase = 0x1100; |
| my $VBase = 0x1161; |
| my $TBase = 0x11A7; |
| my $LCount = 19; |
| my $VCount = 21; |
| my $TCount = 28; |
| my $SCount = $LCount*$VCount*$TCount; |
| my $l, $v, $t, $c; |
| |
| $c = $SBase; |
| for ( $l = 0 ; $l < $LCount ; $l++ ) { |
| for ( $v = 0 ; $v < $VCount ; $v++ ) { |
| for ( $t = 0 ; $t < $TCount ; $t++) { |
| my $name = sprintf("HANGUL SYLLABLE %s%s%s", |
| ${$ucs_props{$LBase+$l}}{'Jamo_Short_Name'}, |
| ${$ucs_props{$VBase+$v}}{'Jamo_Short_Name'}, |
| ${$ucs_props{$TBase+$t}}{'Jamo_Short_Name'}); |
| printf $fh "%s\n", $name; |
| printf $fht "%05x %s\n", $c, $name; |
| |
| $c++; |
| } |
| } |
| } |
| } |
| |
| sub make_name_keyfile() |
| { |
| my $fh, $fht; |
| my $k; |
| |
| print STDERR "Writing gen/nametoucs.keys and gen/nametoucs.tab\n"; |
| open($fh, '>', 'gen/nametoucs.keys') or die; |
| open($fht, '>', 'gen/nametoucs.tab') or die; |
| |
| foreach $k ( keys(%name_to_ucs) ) { |
| printf $fh "%s\n", $k; |
| printf $fht "%05x %s\n", $name_to_ucs{$k}, $k; |
| } |
| |
| write_hangul_names($fh, $fht); |
| |
| close($fh); |
| close($fht); |
| } |
| |
| # |
| # Make a keyfile for all non-systematically named codepoints |
| # |
| sub make_named_ucs_keyfile() |
| { |
| my $fh; |
| my $k; |
| |
| print STDERR "Writing gen/ucstoname.keys\n"; |
| open($fh, '>', 'gen/ucstoname.keys') |
| or die "$0: cannot write gen/ucstoname.keys\n"; |
| |
| foreach $k ( values(%name_to_ucs) ) { |
| printf $fh "%08x\n", $k; |
| } |
| |
| close($fh); |
| } |
| |
| # |
| # Produce a list of character properties, sans names; this is |
| # a test in order to figure out how much we could save from a |
| # range-oriented table for everything except names. |
| # |
| sub dump_prop_list() |
| { |
| my $fh, $c; |
| |
| print STDERR "Writing gen/propdump.txt\n"; |
| open($fh, '>', 'gen/propdump.txt') |
| or die "$0: cannot write gen/propdump.txt\n"; |
| binmode $fh, ':utf8'; |
| |
| for ( $c = 0 ; $c <= 0x10ffff ; $c++ ) { |
| my %h = %{$ucs_props{$c}}; |
| |
| # Handle these separately |
| delete $h{'Name'}; |
| delete $h{'Unicode_1_Name'}; |
| delete $h{'ISO_Comment'}; |
| delete $h{'Decomposition_Mapping'}; |
| # delete $h{'Uppercase_Mapping'}; |
| # delete $h{'Lowercase_Mapping'}; |
| # delete $h{'Titlecase_Mapping'}; |
| # delete $h{'Special_Case_Condition'}; |
| delete $h{'Jamo_Short_Name'}; |
| |
| # Store these as offsets. |
| my $k; |
| foreach $k ( 'Simple_Uppercase_Mapping', |
| 'Simple_Lowercase_Mapping', |
| 'Simple_Titlecase_Mapping' ) { |
| if ( defined($h{$k}) ) { |
| $h{$k} -= $c; # Convert to offset |
| } else { |
| $h{$k} = 0; # Default is zero offset |
| } |
| } |
| |
| my @l = sort(keys(%h)); |
| my $p; |
| printf $fh "%05X ", $c; |
| foreach $p ( @l ) { |
| print $fh $p,':',$h{$p},';'; |
| } |
| print $fh "\n"; |
| } |
| } |
| |
| # |
| # Produce the properties array |
| # |
| %prop_array_position = (); |
| |
| sub emit_int24($) { |
| my($v) = @_; |
| return sprintf("{0x%02x, 0x%02x, 0x%02x}", |
| $v & 0xff, |
| ($v >> 8) & 0xff, |
| ($v >> 16) & 0xff); |
| } |
| |
| sub make_properties_array() |
| { |
| my $fh, $fhi, $c, $prev, $mine, $cnt, $cp; |
| |
| # List of boolean properties that translate 1:1 into flags |
| my @boolean_props = ('Composition_Exclusion', 'Alphabetic', 'Default_Ignorable_Code_Point', |
| 'Lowercase', 'Grapheme_Base', 'Grapheme_Extend', 'ID_Start', 'ID_Continue', |
| 'Math', 'Uppercase', 'XID_Start', 'XID_Continue', 'Hex_Digit', |
| 'Bidi_Control', 'Dash', 'Deprecated', 'Diacritic', 'Extender', |
| 'Grapheme_Link', 'Ideographic', 'IDS_Binary_Operator', |
| 'IDS_Trinary_Operator', 'Join_Control', 'Logical_Order_Exception', |
| 'Noncharacter_Code_Point', 'Pattern_Syntax', 'Pattern_White_Space', |
| 'Quotation_Mark', 'Radical', 'Soft_Dotted', 'STerm', |
| 'Terminal_Punctuation', 'Unified_Ideograph', 'Variation_Selector', |
| 'White_Space', 'Bidi_Mirrored'); |
| |
| print STDERR "Writing gen/proparray.c and gen/proparrayindex\n"; |
| open($fh, '>', 'gen/proparray.c') or die; |
| open($fhi, '>', 'gen/proparrayindex') or die; |
| binmode $fh, ':utf8'; |
| |
| undef $prev; |
| $cnt = 0; |
| |
| for ( $c = 0 ; $c <= 0x10ffff ; $c++ ) { |
| $cp = $ucs_props{$c}; |
| # Careful with the formatting: we rely on the fact that |
| # the first 14 characters contain the UCS value and the rest |
| # the properties. |
| |
| # Code point UCS value |
| $mine = sprintf("\t{\n\t\t0x%05x,\n", $c); |
| |
| # General category |
| my $gc = $$cp{'General_Category'} || 'Cn'; |
| $mine .= "\t\tUC_GC_$gc,\n"; |
| |
| # Script |
| my $sc = $$cp{'Script'} || 'Common'; |
| $mine .= "\t\tUC_SC_$sc,\n"; |
| |
| # Numeric value |
| my $nv = $$cp{'Numeric_Value'}; |
| if ( $nv > 255 ) { |
| my $exp = int(log($nv)/log(10))-1; |
| my $num = int($nv/(10**$exp)); |
| $mine .= "\t\t$num, 128+$exp,\n"; |
| } else { |
| my $num = $nv + 0; |
| my $den = 1; |
| |
| if ( $nv != 0 ) { |
| while ( ($nv-($num/$den))/$nv > 1e-7 ) { |
| $den++; |
| $num = int($nv*$den+0.5); |
| } |
| } |
| $mine .= "\t\t$num, $den,\n"; |
| } |
| |
| # Boolean properties and block index |
| my $bp; |
| foreach $bp ( @boolean_props ) { |
| if ( $$cp{$bp} ) { |
| $mine .= "\t\tUC_FL_\U$bp\E |\n"; |
| } |
| } |
| my $block = $$cp{'Block'} || 'No_Block'; |
| $block =~ tr/ .-/___/; |
| $mine .= "\t\t((uint64_t)UC_BLK_$block << 48),\n"; |
| |
| # Simple case mappings |
| my $sum = ($$cp{'Simple_Uppercase_Mapping'} || $c) - $c; |
| $mine .= "\t\t".emit_int24($sum).",\n"; |
| my $slm = ($$cp{'Simple_Lowercase_Mapping'} || $c) - $c; |
| $mine .= "\t\t".emit_int24($slm).",\n"; |
| my $stm = ($$cp{'Simple_Titlecase_Mapping'} || $c) - $c; |
| $mine .= "\t\t".emit_int24($stm).",\n"; |
| |
| # Age (assume 31.7 as maximum; Unicode has traditionally not had |
| # many minor versions per major version.) |
| my $age = $$cp{'Age'} || '0.0'; |
| my (@sage) = split(/\./, $age); |
| $mine .= sprintf("\t\t(%d << 3) + %d, /* $age */\n", $sage[0], $sage[1]); |
| |
| # Canonical Combining Class |
| my $ccc = $$cp{'Canonical_Combining_Class'} || 'NR'; |
| if ( $ccc =~ /^[0-9]+$/ ) { |
| $mine .= "\t\t$ccc,\n"; # Numeric CCC |
| } else { |
| $mine .= "\t\tUC_CCC_$ccc,\n"; |
| } |
| |
| # Sentence Break |
| my $sb = $$cp{'Sentence_Break'} || 'Other'; |
| $mine .= "\t\tUC_SB_$sb,\n"; |
| |
| # Grapheme Cluster Break |
| my $gcb = $$cp{'Grapheme_Cluster_Break'} || 'Other'; |
| $mine .= "\t\tUC_GCB_$gcb,\n"; |
| |
| # Word Break |
| my $wb = $$cp{'Word_Break'} || 'Other'; |
| $mine .= "\t\tUC_WB_$wb,\n"; |
| |
| # Arabic Joining Type |
| my $ajt = $$cp{'Joining_Type'} || |
| ($gc eq 'Mn' || $gc eq 'Me' || $gc eq 'Cf') ? 'T' : 'U'; |
| $mine .= "\t\tUC_JT_$ajt,\n"; |
| |
| # Arabic Joining Group |
| my $ajg = $$cp{'Joining_Group'} || 'No_Joining_Group'; |
| $ajg =~ tr/ /_/; |
| $ajg =~ s/([A-Z])([A-Z]+)/$1\L$2\E/g; |
| $mine .= "\t\tUC_JG_$ajg,\n"; |
| |
| # East Asian Width |
| my $ea = $$cp{'East_Asian_Width'} || 'N'; |
| $mine .= "\t\tUC_EA_$ea,\n"; |
| |
| # Hangul Syllable Type |
| my $hst = $$cp{'Hangul_Syllable_Type'} || 'NA'; |
| $mine .= "\t\tUC_HST_$hst,\n"; |
| |
| # Line Break |
| my $lb = $$cp{'Line_Break'} || 'XX'; |
| $mine .= "\t\tUC_LB_$lb,\n"; |
| |
| # Numeric Type |
| my $nt = $$cp{'Numeric_Type'} || 'None'; |
| $mine .= "\t\tUC_NT_$nt,\n"; |
| |
| # Bidi Class |
| my $bc = $$cp{'Bidi_Class'} || 'L'; |
| $mine .= "\t\tUC_BC_$bc,\n"; |
| |
| # Additional properties... |
| $mine .= "\t},\n"; |
| |
| if ( substr($prev,14) ne substr($mine,14) ) { |
| print $fh $mine; |
| $cnt++; |
| $prev = $mine; |
| printf $fhi "0x%05x $cnt\n", $c, $cnt; |
| } |
| $prop_array_position{$c} = $cnt; |
| } |
| print $fh "\t/* Total: $cnt ranges */\n"; |
| |
| close($fh); |
| close($fhi); |
| } |
| |
| # |
| # Import files |
| # |
| read_separated_file('ucd/UnicodeData.txt', |
| ['!Name', 'eGeneral_Category', 'nCanonical_Combining_Class', |
| 'eBidi_Class', '!Decomposition', undef, undef, |
| 'eNumeric_Value', 'bBidi_Mirrored', |
| 'mUnicode_1_Name', 'mISO_Comment', 'pSimple_Uppercase_Mapping', |
| 'pSimple_Lowercase_Mapping', 'pSimple_Titlecase_Mapping'], |
| ['<reserved>', 'Cn', 0, undef, undef, undef, undef, undef, |
| 'N', undef, undef, '=', '=', '=']); |
| |
| read_separated_file('ucd/extracted/DerivedNumericType.txt', ['eNumeric_Type'], []); |
| read_separated_file('ucd/extracted/DerivedNumericValues.txt', ['eNumeric_Value'], []); |
| read_separated_file('ucd/extracted/DerivedBidiClass.txt', ['eBidi_Class'], ['L']); |
| read_separated_file('ucd/ArabicShaping.txt', [undef, 'eJoining_Type', 'eJoining_Group'], []); |
| read_separated_file('ucd/BidiMirroring.txt', ['pBidi_Mirroring_Glyph'], []); |
| read_separated_file('ucd/Blocks.txt', ['cBlock'], []); |
| read_separated_file('ucd/CompositionExclusions.txt', 'bComposition_Exclusion', []); |
| # read_separated_file('ucd/CaseFolding.txt', ['eCase_Folding_Type', 'sCase_Folding'], []); |
| read_separated_file('ucd/DerivedAge.txt', ['cAge'], []); |
| read_separated_file('ucd/EastAsianWidth.txt', ['eEast_Asian_Width'], []); |
| read_separated_file('ucd/HangulSyllableType.txt', ['eHangul_Syllable_Type'], []); |
| read_separated_file('ucd/LineBreak.txt', ['eLine_Break'], []); |
| read_separated_file('ucd/Scripts.txt', ['cScript'], ['Common']); |
| read_separated_file('ucd/SpecialCasing.txt', ['sUppercase_Mapping', 'sLowercase_Mapping', |
| 'sTitlecase_Mapping', 'mSpecial_Case_Condition'], []); |
| read_separated_file('ucd/Jamo.txt', ['mJamo_Short_Name'], []); |
| read_separated_file('ucd/auxilliary/GraphemeBreakProperty.txt', ['eGrapheme_Cluster_Break'], []); |
| read_separated_file('ucd/auxilliary/SentenceBreakProperty.txt', ['eSentence_Break'], []); |
| read_separated_file('ucd/auxilliary/WordBreakProperty.txt', ['eWord_Break'], []); |
| read_boolean_file('ucd/DerivedCoreProperties.txt'); |
| read_boolean_file('ucd/PropList.txt'); |
| |
| # |
| # Produce output |
| # |
| make_jamo_tables(); |
| make_names_list(); |
| make_name_keyfile(); |
| make_named_ucs_keyfile(); |
| make_properties_array(); |
| # dump_prop_list(); |