simplecomp.pl - pub/scm/libs/libucd/libucd - Git at Google

 #!/usr/bin/perl
 #
 # Simple-minded compression algorithm for the Unicode names database
 #
 # We create a fixed dictionary of 255 symbols (with 0 = end of string);
 # the first 38 are the symbols space, dash, 0-9, A-Z which are the
 # characters used in Unicode names, and the remaining 217 are common
 # phrases.
 #

 use bytes;

 sub split_by_word($) {
     my ($str) = @_;
     my @l = ();
     my @s = split(/([-\s]+)/, $str);

     # Append separated whitespace to each string
     while ( scalar(@s) ) {
 	my $x = shift(@s);
 	$x .= shift(@s);
 	push(@l, $x);
     }

     return @l;
 }

 @names = ();

 # Treat these combinations as single tokens
 # This list should really be generated automatically
 @unitokens = ('LATIN SMALL LETTER', 'LATIN CAPITAL LETTER',
 	      'CAPITAL LETTER', 'SMALL LETTER', 'BRAILLE PATTERN',
 	      'BYZANTINE MUSICAL SYMBOL', 'CANADIAN SYLLABICS',
 	      'CHEROKEE LETTER', 'VARIATION SELECTOR',
 	      'APL FUNCTIONAL SYMBOL', 'BOX DRAWINGS',
 	      'CJK COMPATIBILITY IDEOGRAPH', 'KANGXI RADICAL',
 	      'LINEAR B', 'MUSICAL SYMBOL', 'ROMAN NUMERAL',
 	      'SANS-SERIF', 'LESS-THAN', 'GREATER-THAN', 'SYLOTI NAGRI',
 	      'TAI LE LETTER', 'TETRAGRAM FOR', 'THAI CHARACTER',
 	      'TIBETAN SUBJOINED LETTER', 'VULGAR FRACTION',
 	      'YI SYLLABLE', 'CJK RADICAL', 'YI RADICAL',
 	      'ETHIOPIC SYLLABLE', 'IDEOGRAPHIC TELEGRAPH SYMBOL FOR',
 	      'DOUBLE-STRUCK', 'NEW TAI LUE', 'PRESENTATION FORM FOR',
 	      'UGARITIC LETTER', 'CYPRIOT SYLLABLE'
 	      );

 while ( defined($line = <STDIN>) ) {
     chomp $line;

     $ix = hex substr($line,0,5);
     $name = substr($line,6);

     # Add a redundant space to each name; we remove this one
     # automatically during decoding
     # XXX: Try with and without this
     $name .= ' ';

     my $ut, $utx;
     foreach $ut ( @unitokens ) {
 	($utx = $ut) =~ tr/ -/_+/;
 	$name =~ s/\b$ut\b/$utx/g;
     }
     push(@names, $name);
     $name_to_ucs{$name} = $ix;
 }

 #
 # Split sets into words and count
 #
 %word_weight = ();

 foreach $n ( @names ) {
     foreach $w ( split_by_word($n) ) {
 	if ( defined($word_weight{$w}) ) {
 	    $word_weight{$w} += length($w)-1;
 	} else {
 	    $word_weight{$w} = -1; # First encounter saves nothing
 	}
     }
 }

 @commons = sort { $word_weight{$b} <=> $word_weight{$a} } keys(%word_weight);

 @dictionary = split(//, " -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ");

 $base_dict = scalar(@dictionary);
 $dict_len = 256;

 %symbol_index = ();
 @symbols = (undef) x ($dict_len);

 # Identity-map single characters
 foreach $scs ( @dictionary ) {
     $symbols[ord($scs)] = $scs;
     $symbol_index{$scs} = ord($scs);
 }
 $next_index = 0;

 while ( scalar(@dictionary) < $dict_len ) {
     push(@dictionary, shift(@commons));
 }


 $s = 0;
 for ( $i = 0 ; $i < $dict_len ; $i++ ) {
     $w = $dictionary[$i];
     printf("%3d %8d \"%s\"\n", $i, $word_weight{$w}, $w);
     if ( length($w) > 1 ) {
 	$s += $word_weight{$w};
 	while ( defined($symbols[$next_index]) ) {
 	    $next_index++;
 	}
 	$symbols[$next_index] = $w;
 	$symbol_index{$w} = $next_index;
 	$next_index++;
     }
 }

 # Sort dictionary in order by decreasing length
 @dictionary = sort { length($b) <=> length($a) } @dictionary;

 sub compress_string($) {
     my ($na) = @_;
     my $di, $c;

     foreach $di ( @dictionary ) {
 	die "No index for symbol: $di\n" unless (defined($symbol_index{$di}));
 	$c = chr($symbol_index{$di});
 	($rd = $di) =~ tr/_+/ -/;
 	$na =~ s/$rd/$c/g;
     }

     return $na;
 }

 $offset = 0;
 $uc_bytes = 0;

 open(NLC, '>', 'gen/nameslist.compr') or die;
 open(NLO, '>', 'gen/nameslist.offset') or die;
 foreach $n ( @names ) {
     ($na1 = $n) =~ tr/_+/ -/;
     ($na2 = $na1) =~ s/ $//;
     $true_name = $na2;		# Actually desired output

     $na1 = compress_string($na1);
     $na2 = compress_string($na2);

     $na = length($na1) < length($na2) ? $na1 : $na2;

     # Prefix byte for *uncompressed* length, then compressed data
     print  NLC chr(length($true_name)), $na;
     printf NLO "%05x %d\n", $name_to_ucs{$n}, $offset;
     $offset += length($na)+1;
     $uc_bytes += length($true_name)+1;
 }
 close(NLC);
 close(NLO);

 print "uncompressed $uc_bytes bytes, compressed $offset bytes\n";
 printf "savings %d (%.1f%%)\n", $uc_bytes-$offset, 100*(1-$offset/$uc_bytes);

 open(NLD, '>', 'gen/nameslist_dict.c') or die;
 printf NLD "const char * const _libucd_nameslist_dict[%d] = {\n", $dict_len;
 for ( $i = 0 ; $i < $dict_len ; $i++ ) {
     $sym = $symbols[$i];
     $sym =~ tr/_+/ -/;
     printf NLD "\t\"%s\",\n", $sym;
 }
 print NLD "};\n";
 close(NLD);
	#!/usr/bin/perl
	#
	# Simple-minded compression algorithm for the Unicode names database
	#
	# We create a fixed dictionary of 255 symbols (with 0 = end of string);
	# the first 38 are the symbols space, dash, 0-9, A-Z which are the
	# characters used in Unicode names, and the remaining 217 are common
	# phrases.
	#

	use bytes;

	sub split_by_word($) {
	my ($str) = @_;
	my @l = ();
	my @s = split(/([-\s]+)/, $str);

	# Append separated whitespace to each string
	while ( scalar(@s) ) {
	my $x = shift(@s);
	$x .= shift(@s);
	push(@l, $x);
	}

	return @l;
	}

	@names = ();

	# Treat these combinations as single tokens
	# This list should really be generated automatically
	@unitokens = ('LATIN SMALL LETTER', 'LATIN CAPITAL LETTER',
	'CAPITAL LETTER', 'SMALL LETTER', 'BRAILLE PATTERN',
	'BYZANTINE MUSICAL SYMBOL', 'CANADIAN SYLLABICS',
	'CHEROKEE LETTER', 'VARIATION SELECTOR',
	'APL FUNCTIONAL SYMBOL', 'BOX DRAWINGS',
	'CJK COMPATIBILITY IDEOGRAPH', 'KANGXI RADICAL',
	'LINEAR B', 'MUSICAL SYMBOL', 'ROMAN NUMERAL',
	'SANS-SERIF', 'LESS-THAN', 'GREATER-THAN', 'SYLOTI NAGRI',
	'TAI LE LETTER', 'TETRAGRAM FOR', 'THAI CHARACTER',
	'TIBETAN SUBJOINED LETTER', 'VULGAR FRACTION',
	'YI SYLLABLE', 'CJK RADICAL', 'YI RADICAL',
	'ETHIOPIC SYLLABLE', 'IDEOGRAPHIC TELEGRAPH SYMBOL FOR',
	'DOUBLE-STRUCK', 'NEW TAI LUE', 'PRESENTATION FORM FOR',
	'UGARITIC LETTER', 'CYPRIOT SYLLABLE'
	);

	while ( defined($line = <STDIN>) ) {
	chomp $line;

	$ix = hex substr($line,0,5);
	$name = substr($line,6);

	# Add a redundant space to each name; we remove this one
	# automatically during decoding
	# XXX: Try with and without this
	$name .= ' ';

	my $ut, $utx;
	foreach $ut ( @unitokens ) {
	($utx = $ut) =~ tr/ -/_+/;
	$name =~ s/\b$ut\b/$utx/g;
	}
	push(@names, $name);
	$name_to_ucs{$name} = $ix;
	}

	#
	# Split sets into words and count
	#
	%word_weight = ();

	foreach $n ( @names ) {
	foreach $w ( split_by_word($n) ) {
	if ( defined($word_weight{$w}) ) {
	$word_weight{$w} += length($w)-1;
	} else {
	$word_weight{$w} = -1; # First encounter saves nothing
	}
	}
	}

	@commons = sort { $word_weight{$b} <=> $word_weight{$a} } keys(%word_weight);

	@dictionary = split(//, " -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ");

	$base_dict = scalar(@dictionary);
	$dict_len = 256;

	%symbol_index = ();
	@symbols = (undef) x ($dict_len);

	# Identity-map single characters
	foreach $scs ( @dictionary ) {
	$symbols[ord($scs)] = $scs;
	$symbol_index{$scs} = ord($scs);
	}
	$next_index = 0;

	while ( scalar(@dictionary) < $dict_len ) {
	push(@dictionary, shift(@commons));
	}


	$s = 0;
	for ( $i = 0 ; $i < $dict_len ; $i++ ) {
	$w = $dictionary[$i];
	printf("%3d %8d \"%s\"\n", $i, $word_weight{$w}, $w);
	if ( length($w) > 1 ) {
	$s += $word_weight{$w};
	while ( defined($symbols[$next_index]) ) {
	$next_index++;
	}
	$symbols[$next_index] = $w;
	$symbol_index{$w} = $next_index;
	$next_index++;
	}
	}

	# Sort dictionary in order by decreasing length
	@dictionary = sort { length($b) <=> length($a) } @dictionary;

	sub compress_string($) {
	my ($na) = @_;
	my $di, $c;

	foreach $di ( @dictionary ) {
	die "No index for symbol: $di\n" unless (defined($symbol_index{$di}));
	$c = chr($symbol_index{$di});
	($rd = $di) =~ tr/_+/ -/;
	$na =~ s/$rd/$c/g;
	}

	return $na;
	}

	$offset = 0;
	$uc_bytes = 0;

	open(NLC, '>', 'gen/nameslist.compr') or die;
	open(NLO, '>', 'gen/nameslist.offset') or die;
	foreach $n ( @names ) {
	($na1 = $n) =~ tr/_+/ -/;
	($na2 = $na1) =~ s/ $//;
	$true_name = $na2; # Actually desired output

	$na1 = compress_string($na1);
	$na2 = compress_string($na2);

	$na = length($na1) < length($na2) ? $na1 : $na2;

	# Prefix byte for uncompressed length, then compressed data
	print NLC chr(length($true_name)), $na;
	printf NLO "%05x %d\n", $name_to_ucs{$n}, $offset;
	$offset += length($na)+1;
	$uc_bytes += length($true_name)+1;
	}
	close(NLC);
	close(NLO);

	print "uncompressed $uc_bytes bytes, compressed $offset bytes\n";
	printf "savings %d (%.1f%%)\n", $uc_bytes-$offset, 100*(1-$offset/$uc_bytes);

	open(NLD, '>', 'gen/nameslist_dict.c') or die;
	printf NLD "const char * const _libucd_nameslist_dict[%d] = {\n", $dict_len;
	for ( $i = 0 ; $i < $dict_len ; $i++ ) {
	$sym = $symbols[$i];
	$sym =~ tr/_+/ -/;
	printf NLD "\t\"%s\",\n", $sym;
	}
	print NLD "};\n";
	close(NLD);