#!/usr/bin/perl
#
# Generate a subset of the UnicodeData.txt file, available from
# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
#
# Usage:
# gensubset.pl [subset files] < UnicodeData.txt > MiniUCD.txt
#
use strict;
use warnings;

# Zero-based UnicodeData.txt field numbers that pass 2 treats as case
# variants of a character (the three fields following field 11).
my @CASE_MAPPING_FIELDS = (12, 13, 14);

# Return the code point named on one line of a subset file, or undef if the
# line is not an entry.  Once trailing blanks and any '#' comment have been
# removed, an entry is exactly two whitespace-separated fields; the second
# field is the code point, decoded with hex() (so a leading "0x" is fine).
sub subset_line_code_point {
    my ($line) = @_;

    $line =~ s/\s*(?:\#.*|)$//;    # Remove comments and final blanks
    my @fields = split(/\s+/, $line);
    return if scalar @fields != 2;
    return hex $fields[1];
}

# Mark as needed any case variants of the characters already marked.
#
#   $need_these - hash ref keyed by numeric code point; updated in place
#   $ucd_lines  - array ref of raw UnicodeData.txt lines, in file order
#
# The lines are scanned once, front to back, and marks added along the way
# are honoured by later lines.  (Note: this doesn't necessarily provide the
# full transitive closure, but we shouldn't need it.)
sub mark_case_variants {
    my ($need_these, $ucd_lines) = @_;

    foreach my $line (@$ucd_lines) {
        # Strip the line terminator before splitting.  Otherwise an empty
        # final field is seen as "\n", which is not the empty string and
        # which hex() turns into 0 -- wrongly marking U+0000 as needed.
        (my $record = $line) =~ s/\r?\n\z//;

        # A limit of -1 keeps trailing empty fields, so the field numbers
        # stay stable even when the last mappings are empty.
        my @fields = split(/;/, $record, -1);
        next unless @fields && $fields[0] =~ /^[0-9a-f]+$/i;
        next unless $need_these->{hex $fields[0]};

        foreach my $index (@CASE_MAPPING_FIELDS) {
            my $mapping = $fields[$index];
            next unless defined $mapping && $mapping ne '';
            $need_these->{hex $mapping}++;
        }
    }
    return;
}

# Return the output lines of the subset: one "XXXX;rest-of-line" string for
# every needed code point covered by each input line, in input order.  The
# text after the first ';' (including the line terminator) is copied
# verbatim; the code point is rewritten as at least four upper-case hex
# digits.
#
#   $need_these - hash ref keyed by numeric code point (true = keep)
#   $ucd_lines  - array ref of raw UnicodeData.txt lines
sub subset_lines {
    my ($need_these, $ucd_lines) = @_;
    my @selected;

    foreach my $line (@$ucd_lines) {
        my ($code, $rest) = split(/;/, $line, 2);
        next unless defined $code;
        $rest = '' unless defined $rest;    # line without any ';'

        my ($first, $last);
        if ($code =~ /^([0-9a-f]+)\-([0-9a-f]+)$/i) {
            # This isn't actually the format... fix that if it ever matters
            ($first, $last) = (hex $1, hex $2);
        }
        elsif ($code =~ /^([0-9a-f]+)$/i) {
            $first = $last = hex $1;
        }
        else {
            next;
        }

        foreach my $code_point ($first .. $last) {
            next unless $need_these->{$code_point};
            push @selected, sprintf("%04X;%s", $code_point, $rest);
        }
    }
    return @selected;
}

# Script entry point: the subset files are named in @ARGV, UnicodeData.txt
# is read from STDIN, and the subset is written to STDOUT.
sub main {
    my %need_these;

    # Mark as needed all the characters mentioned in the relevant files
    foreach my $file (@ARGV) {
        open(my $fh, '<', $file) or die "$0: cannot open '$file': $!\n";
        while (defined(my $line = <$fh>)) {
            my $code_point = subset_line_code_point($line);
            $need_these{$code_point}++ if defined $code_point;
        }
        close($fh);
    }

    # The data is needed twice.  Read it into memory once instead of
    # rewinding STDIN: seek() fails on a pipe, and that failure used to go
    # unnoticed and produce an empty subset.
    my @ucd_lines = <STDIN>;

    # Also mark as needed any case variants of those
    mark_case_variants(\%need_these, \@ucd_lines);

    # Finally, write out the subset
    print subset_lines(\%need_these, \@ucd_lines);
    return;
}

# Run only when executed as a script, so the subs above can be loaded and
# exercised on their own.
main() unless caller;