dslinux/user/perl/ext/Encode/bin enc2xs piconv ucm2table ucmlint ucmsort unidump

Tue Dec 5 05:26:37 CET 2006

Update of /cvsroot/dslinux/dslinux/user/perl/ext/Encode/bin
In directory antilope:/tmp/cvs-serv7729/ext/Encode/bin

Added Files:
	enc2xs piconv ucm2table ucmlint ucmsort unidump 
Log Message:
Adding fresh perl source to HEAD to branch from

--- NEW FILE: ucmsort ---
#!/usr/local/bin/perl
#
# $Id: ucmsort,v 1.1 2006-12-05 04:26:35 dslinux_cayenne Exp $
#
use strict;
my @lines;
my ($head, $tail);
while (<>){
    unless (m/^<U/o){
        unless(@lines){
	    $head .= $_;
	}else{ 
	    $tail .= $_;
	}
	next;
    }
    chomp;
    my @words = split;
    my $u = shift @words;
    $u =~ s/^<U//o; $u =~ s/>.*//o;
    push @lines,[ $u, @words ];
}

print $head;
for (sort {
    hex($a->[0]) <=> hex($b->[0]) # Unicode descending order
	or $a->[2] cmp $b->[2] # fallback descending order
	or $a->[1] cmp $b->[1] # Encoding descending order
    }
     @lines) {
    my $u = shift @$_;
    print join(" " => "<U$u>", @$_), "\n";
}
print $tail;
__END__

--- NEW FILE: ucm2table ---
#!/usr/bin/perl
# $Id: ucm2table,v 1.1 2006-12-05 04:26:35 dslinux_cayenne Exp $
#

use 5.006;
use strict;
use Getopt::Std;
my %Opt;
getopts("aeu", \%Opt);
my %Chartab;

my $Hex = '[0-9A-Fa-f]';
while(<>){
	chomp;
	my ($uni, $enc, $fb) = 
	    /^<U($Hex+)>\s+(\S+)\s+\|(\d)/o or next;
	$fb eq '0' or next;
	my @byte = ();
	my $ord = 0;
	while($enc =~ /\G\\x($Hex+)/iog){
	    my $byte = hex($1);
	    push @byte, $byte;
	    $ord <<= 8; $ord += $byte;
	};
	# print join('', @byte), " => $ord \n";
	if ($Opt{u}){
	    $Chartab{$ord} = pack("U", hex($uni));
	}else{
	    $Chartab{$ord} = pack("C*", @byte);
	}
}

my $start = $Opt{a} ? 0x20 : 0xa0;

for (my $x = $start; $x <= 0xffff; $x += 32) {
    my $line =  '';
    for my $i (0..31){
	my $num = $x+$i; $num eq 0x7f and next; # skip delete
	my $char = $Chartab{$num};
	$line .= !$char ? " " : 
	    ($num < 0x7f ) ? " $char" : $char ;
    }
    $line =~ /^\s+$/o and next;
    printf "0x%04x: $line\n", $x;
}

--- NEW FILE: piconv ---
#!./perl
# $Id: piconv,v 1.1 2006-12-05 04:26:35 dslinux_cayenne Exp $
#
use 5.8.0;
use strict;
use Encode ;
use Encode::Alias;
my %Scheme =  map {$_ => 1} qw(from_to decode_encode perlio);

use File::Basename;
my $name = basename($0);

use Getopt::Long qw(:config no_ignore_case);

my %Opt;

help()
    unless
      GetOptions(\%Opt,
		 'from|f=s',
		 'to|t=s',
		 'list|l',
		 'string|s=s',
		 'check|C=i',
		 'c',
		 'perlqq|p',
		 'debug|D',
		 'scheme|S=s',
		 'resolve|r=s',
		 'help',
		 );

$Opt{help} and help();
$Opt{list} and list_encodings();
my $locale = $ENV{LC_CTYPE} || $ENV{LC_ALL} || $ENV{LANG};
defined $Opt{resolve} and resolve_encoding($Opt{resolve});
$Opt{from} || $Opt{to} || help();
my $from = $Opt{from} || $locale or help("from_encoding unspecified");
my $to   = $Opt{to}   || $locale or help("to_encoding unspecified");
$Opt{string} and Encode::from_to($Opt{string}, $from, $to) and print $Opt{string} and exit;
my $scheme = exists $Scheme{$Opt{Scheme}} ? $Opt{Scheme} :  'from_to';
$Opt{check} ||= $Opt{c};
$Opt{perlqq} and $Opt{check} = Encode::FB_PERLQQ;

if ($Opt{debug}){
    my $cfrom = Encode->getEncoding($from)->name;
    my $cto   = Encode->getEncoding($to)->name;
    print <<"EOT";
Scheme: $scheme
From:   $from => $cfrom
To:     $to => $cto
EOT
}

# we do not use <> (or ARGV) for the sake of binmode()
@ARGV or push @ARGV, \*STDIN; 

unless ($scheme eq 'perlio'){
    binmode STDOUT;
    for my $argv (@ARGV){
	my $ifh = ref $argv ? $argv : undef;
	$ifh or open $ifh, "<", $argv or next;
	binmode $ifh;
	if ($scheme eq 'from_to'){ 	    # default
	    while(<$ifh>){
		Encode::from_to($_, $from, $to, $Opt{check}); 
		print;
	    }
	}elsif ($scheme eq 'decode_encode'){ # step-by-step
	    while(<$ifh>){
		my $decoded = decode($from, $_, $Opt{check});
		my $encoded = encode($to, $decoded);
		print $encoded;
	    }
	} else { # won't reach
	    die "$name: unknown scheme: $scheme";
	}
    }
}else{
    # NI-S favorite
    binmode STDOUT => "raw:encoding($to)";
    for my $argv (@ARGV){
	my $ifh = ref $argv ? $argv : undef;
	$ifh or open $ifh, "<", $argv or next;
	binmode $ifh => "raw:encoding($from)";
	print while(<$ifh>);
    }
}

sub list_encodings{
    print join("\n", Encode->encodings(":all")), "\n";
    exit 0;
}

sub resolve_encoding {
    if (my $alias = Encode::resolve_alias($_[0])) {
	print $alias, "\n";
	exit 0;
    } else {
	warn "$name: $_[0] is not known to Encode\n";
	exit 1;
    }
}

sub help{
    my $message = shift;
    $message and print STDERR "$name error: $message\n";
    print STDERR <<"EOT";
$name [-f from_encoding] [-t to_encoding] [-s string] [files...]
$name -l
$name -r encoding_alias
  -l,--list
     lists all available encodings
  -r,--resolve encoding_alias
    resolve encoding to its (Encode) canonical name
  -f,--from from_encoding  
     when omitted, the current locale will be used
  -t,--to to_encoding    
     when omitted, the current locale will be used
  -s,--string string         
     "string" will be the input instead of STDIN or files
The following are mainly of interest to Encode hackers:
  -D,--debug          show debug information
  -C N | -c | -p      check the validity of the input
  -S,--scheme scheme  use the scheme for conversion
EOT
  exit;
}

__END__

=head1 NAME

piconv -- iconv(1), reinvented in perl

=head1 SYNOPSIS

  piconv [-f from_encoding] [-t to_encoding] [-s string] [files...]
  piconv -l
  piconv [-C N|-c|-p]
  piconv -S scheme ...
  piconv -r encoding
  piconv -D ...
  piconv -h

=head1 DESCRIPTION

B<piconv> is perl version of B<iconv>, a character encoding converter
widely available for various Unixen today.  This script was primarily
a technology demonstrator for Perl 5.8.0, but you can use piconv in the
place of iconv for virtually any case.

piconv converts the character encoding of either STDIN or files
specified in the argument and prints out to STDOUT.

Here is the list of options.  Each option can be in short format (-f)
or long (--from).

=over 4

=item -f,--from from_encoding

Specifies the encoding you are converting from.  Unlike B<iconv>,
this option can be omitted.  In such cases, the current locale is used.

=item -t,--to to_encoding

Specifies the encoding you are converting to.  Unlike B<iconv>,
this option can be omitted.  In such cases, the current locale is used.

Therefore, when both -f and -t are omitted, B<piconv> just acts
like B<cat>.

=item -s,--string I<string>

uses I<string> instead of file for the source of text.

=item -l,--list

Lists all available encodings, one per line, in case-insensitive
order.  Note that only the canonical names are listed; many aliases
exist.  For example, the names are case-insensitive, and many standard
and common aliases work, such as "latin1" for "ISO-8859-1", or "ibm850"
instead of "cp850", or "winlatin1" for "cp1252".  See L<Encode::Supported>
for a full discussion.

=item -C,--check I<N>

Check the validity of the stream if I<N> = 1.  When I<N> = -1, something
interesting happens when it encounters an invalid character.

=item -c

Same as C<-C 1>.

=item -p,--perlqq

Same as C<-C -1>.

=item -h,--help

Show usage.

=item -D,--debug

Invokes debugging mode.  Primarily for Encode hackers.

=item -S,--scheme scheme

Selects which scheme is to be used for conversion.  Available schemes
are as follows:

=over 4

=item from_to

Uses Encode::from_to for conversion.  This is the default.

=item decode_encode

Input strings are decode()d then encode()d.  A straight two-step
implementation.

=item perlio

The new perlIO layer is used.  NI-S' favorite.

=back

Like the I<-D> option, this is also for Encode hackers.

=back

=head1 SEE ALSO

L<iconv/1>
L<locale/3>
L<Encode>
L<Encode::Supported>
L<Encode::Alias>
L<PerlIO>

=cut

--- NEW FILE: unidump ---
#!./perl

use strict;
use Encode;
use Getopt::Std;
my %Opt; getopts("ChH:e:f:t:s:pPv", \%Opt);
$Opt{p} ||= $Opt{P};
$Opt{e} ||= 'utf8';
$Opt{f} ||= $Opt{e};
$Opt{t} ||= $Opt{e};
$Opt{h} and help();

my ($linebuf, $outbuf);
my $CPL = $Opt{p} ? 64 : 8;
my $linenum;
my $linesperheading = $Opt{H};
my $nchars;
our $PrevChunk;

$Opt{h} and help();
$Opt{p} and do_perl($Opt{s});
do_dump($Opt{s});
exit;

#

sub do_perl{
    my $string = shift;
    $Opt{P} and print "#!$^X -w\nprint\n";
    unless ($string){
	while(<>){
	    use utf8;
	    $linebuf .=  Encode::decode($Opt{f}, $_);
	    while($linebuf){
		my $chr =  render_p(substr($linebuf, 0, 1, ''));
		length($outbuf) + length($chr) > $CPL and print_P();
		$outbuf .= $chr;
	    }
	}
	$outbuf and print print_P(";");
    }else{
	while($string){
	    my $chr =  render_p(substr($string, 0, 1, ''));
	    length($outbuf) + length($chr) > $CPL and print_P();
	    $outbuf .= $chr;
	}
    }
    $outbuf and print print_P(";");
    exit;
}

sub render_p{
    my ($chr, $format) = @_;
    our %S2pstr;
    $S2pstr{$chr} and return $S2pstr{$chr}; # \t\n...
    $chr =~ /[\x20-\x7e]/ and return $chr;  # ascii, printable;
    my $fmt = ($chr =~ /[\x00-\x1f\x7F]/)  ?
	q(\x%x) : q(\x{%x});
    return sprintf $fmt, ord($chr);
}

sub print_P{
    my $end = shift;
    $outbuf or return;
    print '"', encode($Opt{t}, $outbuf), '"';
    my $tail = $Opt{P} ? $end ? "$end" :  "," : '';
    print $tail, "\n";
    $outbuf = '';
}

sub do_dump{
    my $string = shift;
    !$Opt{p} and exists $Opt{H} and print_H();
    unless ($string){
	while(<>){
	    use utf8;
	    $linebuf .=  Encode::decode($Opt{f}, $_);
	    while (length($linebuf) > $CPL){
		my $chunk = substr($linebuf, 0, $CPL, '');
		print_C($chunk, $linenum++);
		$Opt{H} and $linenum % $Opt{H} == $CPL-1 and print_S();
	    }
	}
	$linebuf and print_C($linebuf);
    }else{
	while ($string){
	    my $chunk = substr($string, 0, $CPL, '');
	    print_C($chunk, $linenum++);
	    $Opt{H} and $linenum % $Opt{H} == $CPL-1 and print_S();
	}
    }
    exit;
}

sub print_S{
    print "--------+------------------------------------------------";
    if ($Opt{C}){
	print "-+-----------------";
    }
    print "\n";
}
sub print_H{
    print "  Offset      0     1     2     3     4     5     6     7";
    if ($Opt{C}){
	print " |  0 1 2 3 4 5 6 7";
    }
    print "\n";
    print_S;
}

sub print_C{
    my ($chunk, $linenum) = @_;
    if (!$Opt{v} and $chunk eq $PrevChunk){
	printf "%08x *\n", $linenum*8; return;
    }
    $PrevChunk = $chunk;
    my $end = length($chunk) - 1;
    my (@ord, @chr);
    for my $i (0..$end){
	use utf8;
	my $chr = substr($chunk,$i,1);
	my $ord = ord($chr);
	my $fmt = $ord <= 0xffff ? "  %04x" : " %05x";
	push @ord, (sprintf $fmt, $ord);
	$Opt{C} and push @chr, render_c($chr);
    }
    if (++$end < 7){
	for my $i ($end..7){
	    push @ord, (" " x 6);
	}
    }
    my $line = sprintf "%08x %s", $linenum*8, join('', @ord);
    $Opt{C} and $line .= sprintf " | %s",  join('', @chr);
    print encode($Opt{t}, $line), "\n";
}

sub render_c{
    my ($chr, $format) = @_;
    our (%S2str, $IsFullWidth);
    $chr =~ /[\p{IsControl}\s]/o and return $S2str{$chr} || "  ";
    $chr =~ $IsFullWidth and return $chr; # as is
    return " " . $chr;
}

sub help{
    my $message = shift;
    use File::Basename;
    my $name = basename($0);
    $message and print STDERR "$name error: $message\n";
    print STDERR <<"EOT";
Usage:
  $name -[options...] [files...]
  $name -[options...] -s "string"
  $name -h
  -h prints this message.
Inherited from hexdump;
  -C Canonical unidump mode
  -v prints the duplicate line as is.  Without this option,
     single "*" will be printed instead.
For unidump only
  -p prints in perl literals that you can copy and paste directly
     to your perl script.
  -P prints in perl executable format!
  -u prints a bunch of "Uxxxx,".  Handy when you want to pass your
     characters in mailing lists. 
IO Options:
  -e io_encoding    same as "-f io_encoding -t io_encoding"
  -f from_encoding  convert the source stream from this encoding
  -t to_encoding    print to STDOUT in this encoding
  -s string         "string" will be converted instead of STDIN.
  -H nline          prints separater for each nlines of output.
                    0 means only the table headding be printed.
EOT
  exit;
}

BEGIN{
    our %S2pstr= (
		  "\\" => '\\\\',
		  "\0" => '\0',
		  "\t" => '\t',
		  "\n" => '\n',
		  "\r" => '\r',
		  "\v" => '\v',
		  "\a" => '\a',
		  "\e" => '\e',
		  "\"" => qq(\\\"),
		  "\'" => qq(\\\'),
		  '$'  => '\$',
		  "@"  => '\@',
		  "%"  => '\%',
		 );

    our %S2str = (
		  qq(\x00) => q(\0),  # NULL
		  qq(\x01) => q(^A),  # START OF HEADING
		  qq(\x02) => q(^B),  # START OF TEXT
		  qq(\x03) => q(^C),  # END OF TEXT
		  qq(\x04) => q(^D),  # END OF TRANSMISSION
		  qq(\x05) => q(^E),  # ENQUIRY
		  qq(\x06) => q(^F),  # ACKNOWLEDGE
		  qq(\x07) => q(\a),  # BELL
		  qq(\x08) => q(^H),  # BACKSPACE
		  qq(\x09) => q(\t),  # HORIZONTAL TABULATION
		  qq(\x0A) => q(\n),  # LINE FEED
		  qq(\x0B) => q(\v),  # VERTICAL TABULATION
		  qq(\x0C) => q(^L),  # FORM FEED
		  qq(\x0D) => q(\r),  # CARRIAGE RETURN
		  qq(\x0E) => q(^N),  # SHIFT OUT
		  qq(\x0F) => q(^O),  # SHIFT IN
		  qq(\x10) => q(^P),  # DATA LINK ESCAPE
		  qq(\x11) => q(^Q),  # DEVICE CONTROL ONE
		  qq(\x12) => q(^R),  # DEVICE CONTROL TWO
		  qq(\x13) => q(^S),  # DEVICE CONTROL THREE
		  qq(\x14) => q(^T),  # DEVICE CONTROL FOUR
		  qq(\x15) => q(^U),  # NEGATIVE ACKNOWLEDGE
		  qq(\x16) => q(^V),  # SYNCHRONOUS IDLE
		  qq(\x17) => q(^W),  # END OF TRANSMISSION BLOCK
		  qq(\x18) => q(^X),  # CANCEL
		  qq(\x19) => q(^Y),  # END OF MEDIUM
		  qq(\x1A) => q(^Z),  # SUBSTITUTE
		  qq(\x1B) => q(\e),  # ESCAPE (\c[)
		  qq(\x1C) => "^\\",  # FILE SEPARATOR
		  qq(\x1D) => "^\]",  # GROUP SEPARATOR
		  qq(\x1E) => q(^^),  # RECORD SEPARATOR
		  qq(\x1F) => q(^_),  # UNIT SEPARATOR
		  );
    #
    # Generated out of lib/unicore/EastAsianWidth.txt 
    # will it work ?
    #		  
    our $IsFullWidth = 
	qr/^[
	     \x{1100}-\x{1159}
	     \x{115F}-\x{115F}
	     \x{2329}-\x{232A}
	     \x{2E80}-\x{2E99}
	     \x{2E9B}-\x{2EF3}
	     \x{2F00}-\x{2FD5}
	     \x{2FF0}-\x{2FFB}
	     \x{3000}-\x{303E}
	     \x{3041}-\x{3096}
	     \x{3099}-\x{30FF}
	     \x{3105}-\x{312C}
	     \x{3131}-\x{318E}
	     \x{3190}-\x{31B7}
	     \x{31F0}-\x{321C}
	     \x{3220}-\x{3243}
	     \x{3251}-\x{327B}
	     \x{327F}-\x{32CB}
	     \x{32D0}-\x{32FE}
	     \x{3300}-\x{3376}
	     \x{337B}-\x{33DD}
	     \x{3400}-\x{4DB5}
	     \x{4E00}-\x{9FA5}
	     \x{33E0}-\x{33FE}
	     \x{A000}-\x{A48C}
	     \x{AC00}-\x{D7A3}
	     \x{A490}-\x{A4C6}
	     \x{F900}-\x{FA2D}
	     \x{FA30}-\x{FA6A}
	     \x{FE30}-\x{FE46}
	     \x{FE49}-\x{FE52}
	     \x{FE54}-\x{FE66}
	     \x{FE68}-\x{FE6B}
	     \x{FF01}-\x{FF60}
	     \x{FFE0}-\x{FFE6}
	     \x{20000}-\x{2A6D6}
	 ]$/xo;
}

__END__

--- NEW FILE: ucmlint ---
#!/usr/local/bin/perl
#
# $Id: ucmlint,v 1.1 2006-12-05 04:26:35 dslinux_cayenne Exp $
#

use strict;
our  $VERSION = do { my @r = (q$Revision: 1.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };

use Getopt::Std;
our %Opt;
getopts("Dehfv", \%Opt);

if ($Opt{e}){
   eval{ require Encode; };
   $@ and die "can't load Encode : $@";
}

$Opt{h} and help();
@ARGV or help();

sub help{
    print <<"";
$0 -[Dehfv] [ucm files ...]
  -D debug mode on
  -e test with Encode module also (requires perl 5.7.3 or higher)
  -h shows this message
  -f forces roundtrip check even for |[123]
  -v verbose mode

}

$| = 1;
my (%Hdr, %U2E, %E2U);
my $in_charmap = 0;
my $nerror = 0;
my $nwarning = 0;

sub nit($;$){
    my ($msg, $level) = @_;
    my $lstr;
    if ($level == 2){
	$lstr = 'notice';
    }elsif ($level == 1){
	$lstr = 'warning'; $nwarning++;
    }else{
	$lstr = 'error'; $nerror++;
    }
    print "$ARGV:$lstr in line $.: $msg\n";
}

for $ARGV (@ARGV){
    open UCM, $ARGV or die "$ARGV:$!";
    %Hdr = %U2E = %E2U = ();
    $in_charmap = $nerror = $nwarning = 0;
    $. = 0;
    while(<UCM>){
	chomp;
	s/\s*#.*$//o; /^$/ and next;
	if ($_ eq "CHARMAP"){ 
	    $in_charmap = 1;
	    for my $must (qw/code_set_name mb_cur_min mb_cur_max/){
		exists $Hdr{$must} or nit "<$must> nonexistent";
	    }
	    $Hdr{mb_cur_min} > $Hdr{mb_cur_max}
	    and nit sprintf("mb_cur_min(%d) > mb_cur_max(%d)",
			    $Hdr{mb_cur_min},$Hdr{mb_cur_max});
	    $in_charmap = 1;
	    next;
	}
	unless ($in_charmap){
	    my($hkey, $hvalue) = /^<(\S+)>\s+[\"\']?([^\"\']+)/o or next;
	    $Opt{D} and warn "$hkey => $hvalue";
	    if ($hkey eq "code_set_name"){ # name check
		exists $Hdr{code_set_name} 
		and nit "Duplicate <code_set_name>: $hkey";
	    }
	    if ($hkey eq "code_set_alias"){ # alias check
		$hvalue eq $Hdr{code_set_name}
		and nit qq(alias "$hvalue" is already in <code_set_name>);
	    }
	    $Hdr{$hkey} = $hvalue;
	}else{
	    my $name = $Hdr{code_set_name};
	    my($unistr, $encstr, $fb) = /^(\S+)\s+(\S+)\s(\S+)/o or next;
	    $Opt{v} and nit $_, 2;
	    my $uni = uniparse($unistr);
	    my $enc = encparse($encstr);
	    $fb =~ /^\|([0123])$/ or nit "malformed fallback: $fb";
	    $fb = $1; 
	    $Opt{f} and $fb = 0;
	    unless ($fb == 1){ # check uni -> enc
		if (exists $U2E{$uni}){
		    nit "dupe encode map: U$uni => $U2E{$uni} and $enc", 1;
		}else{
		    $U2E{$uni} = $enc;
		    if ($Opt{e} and $fb != 3) {
			my $e = hex2enc($enc);
			my $u = hex2uni($uni);
			my $eu = Encode::encode($name, $u);
			$e eq $eu
			    or nit qq(encode('$name', $uni) != $enc);
		    }
		}
	    }
	    unless ($fb == 3){  # check enc -> uni
		if (exists $E2U{$enc}){
		    nit "dupe decode map: $enc => U$E2U{$enc} and U$uni", 1;
		}else{
		    $E2U{$enc} = $uni;
		    if ($Opt{e} and $fb != 1) {
			my $e = hex2enc($enc);
			my $u = hex2uni($uni);
			$Opt{D} and warn "$uni, $enc";
			my $de = Encode::decode($name, $e);
			$de eq $u
			    or nit qq(decode('$name', $enc) != $uni);
		    }
		}
	    }
	    # warn "$uni, $enc, $fb";
	}
    }
    $in_charmap or nit "Where is CHARMAP?";
    checkRT();
    printf ("$ARGV: %s error%s found\n", 
	    ($nerror == 0 ? 'no' : $nerror),
	    ($nerror > 1 ? 's' : ''));
}

exit;

sub hex2enc{
    pack("C*", map {hex($_)} split(",", shift));
}
sub hex2uni{
    join("", map { chr(hex($_)) } split(",", shift));
}

sub checkRT{
    for my $uni (keys %E2U){
	my $enc = $U2E{$uni} or next; # okay
	$E2U{$U2E{$uni}} eq $uni or
	    nit "RT failure: U$uni => $enc =>U$E2U{$U2E{$uni}}";
    }
    for my $enc (keys %E2U){
	my $uni =  $E2U{$enc} or next; # okay
	$U2E{$E2U{$enc}} eq $enc or
	    nit "RT failure: $enc => U$uni => $U2E{$E2U{$enc}}";
    }
}

sub uniparse{
    my $str = shift;
    my @u;
    push @u, $1 while($str =~ /\G<U(.*?)>/ig);
    for my $u (@u){
	$u =~ /^([0-9A-Za-z]+)$/o
	    or nit "malformed Unicode character: $u";
    }
    return join(',', @u);
}

sub encparse{
    my $str = shift;
    my @e;
    for my $e (split /\\x/io, $str){
	$e or next; # first \x
	$e =~ /^([0-9A-Za-z]{1,2})$/io
	    or nit "Hex $e in $str is bogus";
	push @e, $1;
    }
    return join(',', @e);
}

__END__

A UCM file looks like this.

  #
  # Comments
  #
  <code_set_name> "US-ascii" # Required
  <code_set_alias> "ascii"   # Optional
  <mb_cur_min> 1             # Required; usually 1
  <mb_cur_max> 1             # Max. # of bytes/char
  <subchar> \x3F             # Substitution char
  #
  CHARMAP
  <U0000> \x00 |0 # <control>
  <U0001> \x01 |0 # <control>
  <U0002> \x02 |0 # <control>
  ....
  <U007C> \x7C |0 # VERTICAL LINE
  <U007D> \x7D |0 # RIGHT CURLY BRACKET
  <U007E> \x7E |0 # TILDE
  <U007F> \x7F |0 # <control>
  END CHARMAP

--- NEW FILE: enc2xs ---
#!./perl
BEGIN {
    # @INC poking  no longer needed w/ new MakeMaker and Makefile.PL's
    # with $ENV{PERL_CORE} set
    # In case we need it in future...
    require Config; import Config;
}
use strict;
use warnings;
use Getopt::Std;
my @orig_ARGV = @ARGV;
our $VERSION  = do { my @r = (q$Revision: 1.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };

# These may get re-ordered.
# RAW is a do_now as inserted by &enter
# AGG is an aggreagated do_now, as built up by &process

use constant {
  RAW_NEXT => 0,
[...1346 lines suppressed...]
      1356  average shared memory size
     18566  average unshared data size
       229  average unshared stack size
     46080  page reclaims
     33373  page faults

With %seen holding simple scalars:

      342.16 real        27.11 user         3.54 sys
      8388  maximum resident set size
      1394  average shared memory size
     14969  average unshared data size
       236  average unshared stack size
     28159  page reclaims
      9839  page faults

Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
how %seen is storing things its seen. So it is pathalogically bad on a 16M
RAM machine, but it's going to help even on modern machines.
Swapping is bad, m'kay :-)