#!/usr/bin/perl # Uniset -- Unicode subset manager -- Markus Kuhn # http://www.cl.cam.ac.uk/~mgk25/download/uniset.tar.gz # slightly modified for R to produce single-column output in # the style previously used (l/c a-f, no leading zeroes). require 5.014; use open ':utf8'; use FindBin qw($RealBin); # to find directory where this file is located binmode(STDOUT, ":utf8"); binmode(STDIN, ":utf8"); my (%name, %invname, %category, %comment); print <. yyyy yyyy (optionally prefixed with 0x) is a Unicode character belonging to the specified subset. yyyy-yyyy a range of Unicode characters belonging to yyyy..yyyy the specified subset. xx yy yy yy-yy yy xx denotes a row (high-byte) and the yy specify corresponding low bytes or with a hyphen also ranges of low bytes in the Unicode values that belong to this subset. This is also the format that is generated by the compact command. End exit 1 if $#ARGV < 0; # Subroutine to identify whether the ISO 10646/Unicode character code # ucs belongs into the East Asian Wide (W) or East Asian FullWidth # (F) category as defined in Unicode Technical Report #11. sub iswide ($) { my $ucs = shift(@_); return ($ucs >= 0x1100 && ($ucs <= 0x115f || # Hangul Jamo $ucs == 0x2329 || $ucs == 0x232a || ($ucs >= 0x2e80 && $ucs <= 0xa4cf && $ucs != 0x303f) || # CJK .. Yi ($ucs >= 0xac00 && $ucs <= 0xd7a3) || # Hangul Syllables ($ucs >= 0xf900 && $ucs <= 0xfaff) || # CJK Comp. Ideographs ($ucs >= 0xfe30 && $ucs <= 0xfe6f) || # CJK Comp. Forms ($ucs >= 0xff00 && $ucs <= 0xff60) || # Fullwidth Forms ($ucs >= 0xffe0 && $ucs <= 0xffe6) || ($ucs >= 0x20000 && $ucs <= 0x2fffd) || ($ucs >= 0x30000 && $ucs <= 0x3fffd))); } # Return the Unicode name that belongs to a given character code # Jamo short names, see Unicode 3.0, table 4-4, page 86 my @lname = ('G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ', 'C', 'K', 'T', 'P', 'H'); # 1100..1112 my @vname = ('A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE', 'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'); # 1161..1175 my @tname = ('G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS', 'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T', 'P', 'H'); # 11a8..11c2 sub name { my $ucs = shift(@_); # The intervals used here reflect Unicode Version 3.2 if (($ucs >= 0x3400 && $ucs <= 0x4db5) || ($ucs >= 0x4e00 && $ucs <= 0x9fa5) || ($ucs >= 0x20000 && $ucs <= 0x2a6d6)) { return "CJK UNIFIED IDEOGRAPH-" . sprintf("%04X", $ucs); } if ($ucs >= 0xac00 && $ucs <= 0xd7a3) { my $s = $ucs - 0xac00; my $l = 0x1100 + int($s / (21 * 28)); my $v = 0x1161 + int(($s % (21 * 28)) / 28); my $t = 0x11a7 + $s % 28; return "HANGUL SYLLABLE " . ($lname[int($s / (21 * 28))] . $vname[int(($s % (21 * 28)) / 28)] . $tname[$s % 28 - 1]); } return $name{$ucs}; } sub is_unicode { my $ucs = shift(@_); # The intervals used here reflect Unicode Version 3.2 if (($ucs >= 0x3400 && $ucs <= 0x4db5) || ($ucs >= 0x4e00 && $ucs <= 0x9fa5) || ($ucs >= 0xac00 && $ucs <= 0xd7a3) || ($ucs >= 0x20000 && $ucs <= 0x2a6d6)) { return 1; } return exists $name{$ucs}; } my @search_path = (); if ($RealBin =~ m|^(.*)/bin\z| && -d "$1/share/uniset") { push @search_path, "$1/share/uniset"; } else { push @search_path, $RealBin; } sub search_open { my ($mode, $fn) = @_; my $file; return $file if open($file, $mode, $fn); return undef if $fn =~ m|/|; for my $path (@search_path) { return $file if open($file, $mode, "$path/$fn"); } return undef; } my $html = 0; my $image = 0; my $adducs = 0; my $unicodedata = "UnicodeData.txt"; my $blockdata = "Blocks.txt"; # read list of all Unicode names my $data = search_open('<', $unicodedata); unless ($data) { die ("Can't open Unicode database '$unicodedata':\n$!\n\n" . "Please make sure that you have downloaded the file\n" . "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n"); } while (<$data>) { if (/^([0-9,A-F]{4,8});([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*)$/) { next if $2 ne '' && substr($2, 0, 1) eq '<'; $ucs = hex($1); $name{$ucs} = $2; $invname{$2} = $ucs; $category{$ucs} = $3; $comment{$ucs} = $12; } else { die("Syntax error in line '$_' in file '$unicodedata'\n"); } } close($data); # read list of all Unicode blocks $data = search_open('<', $blockdata); unless ($data) { die ("Can't open Unicode blockname list '$blockdata':\n$!\n\n" . "Please make sure that you have downloaded the file\n" . "http://www.unicode.org/Public/UNIDATA/Blocks.txt\n"); } my $blocks = 0; my (@blockstart, @blockend, @blockname); while (<$data>) { if (/^\s*([0-9,A-F]{4,8})\s*\.\.\s*([0-9,A-F]{4,8})\s*;\s*(.*)$/) { $blockstart[$blocks] = hex($1); $blockend [$blocks] = hex($2); $blockname [$blocks] = $3; $blocks++; } elsif (/^\s*\#/ || /^\s*$/) { # ignore comments and empty lines } else { die("Syntax error in line '$_' in file '$blockdata'\n"); } } close($data); if ($blockend[$blocks-1] < 0x110000) { $blockstart[$blocks] = 0x110000; $blockend [$blocks] = 0x7FFFFFFF; $blockname [$blocks] = "Beyond Plane 16"; $blocks++; } # process command line arguments while ($_ = shift(@ARGV)) { if (/^html$/) { $html = 1; } elsif (/^ucs$/) { $adducs = 1; } elsif (/^img$/) { $html = 1; $image = 1; } elsif (/^template$/) { $template = shift(@ARGV); open(TEMPLATE, $template) || die("Can't open template file '$template': $!\n"); while (