dslinux/user/perl/ext/Encode/t Aliases.t CJKT.t Encode.t Encoder.t Mod_EUCJP.pm Unicode.t at-cn.t at-tw.t big5-eten.enc big5-eten.utf big5-hkscs.enc big5-hkscs.utf enc_data.t enc_eucjp.t enc_module.enc enc_module.t enc_utf8.t encoding.t fallback.t gb2312.enc gb2312.utf grow.t gsm0338.t guess.t jisx0201.enc jisx0201.utf jisx0208.enc jisx0208.utf jisx0212.enc jisx0212.utf jperl.t ksc5601.enc ksc5601.utf mime-header.t mime_header_iso2022jp.t perlio.t rt.pl unibench.pl utf8strict.t
cayenne
dslinux_cayenne at user.in-berlin.de
Tue Dec 5 05:26:40 CET 2006
Update of /cvsroot/dslinux/dslinux/user/perl/ext/Encode/t
In directory antilope:/tmp/cvs-serv7729/ext/Encode/t
Added Files:
Aliases.t CJKT.t Encode.t Encoder.t Mod_EUCJP.pm Unicode.t
at-cn.t at-tw.t big5-eten.enc big5-eten.utf big5-hkscs.enc
big5-hkscs.utf enc_data.t enc_eucjp.t enc_module.enc
enc_module.t enc_utf8.t encoding.t fallback.t gb2312.enc
gb2312.utf grow.t gsm0338.t guess.t jisx0201.enc jisx0201.utf
jisx0208.enc jisx0208.utf jisx0212.enc jisx0212.utf jperl.t
ksc5601.enc ksc5601.utf mime-header.t mime_header_iso2022jp.t
perlio.t rt.pl unibench.pl utf8strict.t
Log Message:
Adding fresh perl source to HEAD to branch from
--- NEW FILE: big5-hkscs.utf ---
(This appears to be a binary file; contents omitted.)
--- NEW FILE: jperl.t ---
#
# $Id: jperl.t,v 1.1 2006-12-05 04:26:38 dslinux_cayenne Exp $
#
# This script is written in euc-jp
BEGIN {
require Config; import Config;
if ($Config{'extensions'} !~ /\bEncode\b/) {
print "1..0 # Skip: Encode was not built\n";
exit 0;
}
unless (find PerlIO::Layer 'perlio') {
print "1..0 # Skip: PerlIO was not built\n";
exit 0;
}
if (ord("A") == 193) {
print "1..0 # Skip: EBCDIC\n";
exit 0;
}
$| = 1;
}
no utf8; # we have raw Japanese encodings here
use strict;
#use Test::More tests => 18;
use Test::More tests => 15; # black magic tests commented out
my $Debug = shift;
no encoding; # ensure
my $Enamae = "\xbe\xae\xbb\xf4\x20\xc3\xc6"; # euc-jp, with \x escapes
use encoding "euc-jp";
my $Namae = "¾®»ô ÃÆ"; # in Japanese, in euc-jp
my $Name = "Dan Kogai"; # in English
# euc-jp in \x format but after the pragma. But this one will be converted!
my $Ynamae = "\xbe\xae\xbb\xf4\x20\xc3\xc6";
my $str = $Namae; $str =~ s/¾®»ô ÃÆ/Dan Kogai/o;
is($str, $Name, q{regex});
$str = $Namae; $str =~ s/$Namae/Dan Kogai/o;
is($str, $Name, q{regex - with variable});
is(length($Namae), 4, q{utf8:length});
{
use bytes;
# converted to UTF-8 so 3*3+1
is(length($Namae), 10, q{bytes:length});
#
is(length($Enamae), 7, q{euc:length}); # 2*3+1
is ($Namae, $Ynamae, q{literal conversions});
isnt($Enamae, $Ynamae, q{before and after});
is($Enamae, Encode::encode('euc-jp', $Namae));
}
# let's test the scope as well. Must be in utf8 realm
is(length($Namae), 4, q{utf8:length});
{
no encoding;
ok(! defined(${^ENCODING}), q{no encoding;});
}
# should've been isnt() but no scoping is suported -- yet
ok(! defined(${^ENCODING}), q{not scoped yet});
#
# The following tests are commented out to accomodate
# Inaba-San's patch to make tr/// work w/o eval qq{}
#{
# # now let's try some real black magic!
# local(${^ENCODING}) = Encode::find_encoding("euc-jp");
# my $str = "\xbe\xae\xbb\xf4\x20\xc3\xc6";
# is (length($str), 4, q{black magic:length});
# is ($str, $Enamae, q{black magic:eq});
#}
#ok(! defined(${^ENCODING}), q{out of black magic});
use bytes;
is (length($Namae), 10);
#
# now something completely different!
#
{
use encoding "euc-jp", Filter=>1;
ok(1, "Filter on");
use utf8;
no strict 'vars'; # fools
# doesn't work w/ "my" as of this writing.
# because of buggy strict.pm and utf8.pm
our $¿Í = 2;
# ^^U+4eba, "human" in CJK ideograph
$¿Í++; # a child is born
*people = \$¿Í;
is ($people, 3, "Filter:utf8 identifier");
no encoding;
ok(1, "Filter off");
}
1;
__END__
--- NEW FILE: enc_module.t ---
# $Id: enc_module.t,v 1.1 2006-12-05 04:26:37 dslinux_cayenne Exp $
# This file is in euc-jp
BEGIN {
require Config; import Config;
if ($Config{'extensions'} !~ /\bEncode\b/) {
print "1..0 # Skip: Encode was not built\n";
exit 0;
}
unless (find PerlIO::Layer 'perlio') {
print "1..0 # Skip: PerlIO was not built\n";
exit 0;
}
if (defined ${^UNICODE} and ${^UNICODE} != 0){
print "1..0 # Skip: \${^UNICODE} == ${^UNICODE}\n";
exit 0;
}
if (ord("A") == 193) {
print "1..0 # encoding pragma does not support EBCDIC platforms\n";
exit(0);
}
}
use lib qw(t ext/Encode/t ../ext/Encode/t); # latter 2 for perl core
use Mod_EUCJP;
use encoding "euc-jp";
use Test::More tests => 3;
use File::Basename;
use File::Spec;
use File::Compare qw(compare_text);
my $DEBUG = shift || 0;
my $dir = dirname(__FILE__);
my $file0 = File::Spec->catfile($dir,"enc_module.enc");
my $file1 = File::Spec->catfile($dir,"$$.enc");
my $obj = Mod_EUCJP->new;
local $SIG{__WARN__} = sub{ $DEBUG and print STDERR @_ };
# to silence reopening STD(IN|OUT) w/o closing unless $DEBUG
open STDOUT, ">", $file1 or die "$file1:$!";
print $obj->str, "\n";
$obj->set("¥Æ¥¹¥Èʸ»úÎó");
print $obj->str, "\n";
# Please do not move this to a point after the comparison -- Craig Berry
# and "unless $^O eq 'freebsd'" is needed for FreeBSD (toy-)?thread
# -- dankogai
close STDOUT unless $^O eq 'freebsd';
my $cmp = compare_text($file0, $file1);
is($cmp, 0, "encoding vs. STDOUT");
my @cmp = qw/½é´üʸ»úÎó ¥Æ¥¹¥Èʸ»úÎó/;
open STDIN, "<", $file0 or die "$file0:$!";
$obj = Mod_EUCJP->new;
my $i = 0;
while(<STDIN>){
s/\r?\n\z//;
is ($cmp[$i++], $_, "encoding vs. STDIN - $i");
}
unlink $file1 unless $cmp;
__END__
--- NEW FILE: at-cn.t ---
BEGIN {
if ($ENV{'PERL_CORE'}){
chdir 't';
unshift @INC, '../lib';
}
require Config; import Config;
if ($Config{'extensions'} !~ /\bEncode\b/) {
print "1..0 # Skip: Encode was not built\n";
exit 0;
}
if (ord("A") == 193) {
print "1..0 # Skip: EBCDIC\n";
exit 0;
}
$| = 1;
}
use strict;
use Test::More tests => 29;
use Encode;
no utf8; # we have raw Chinese encodings here
use_ok('Encode::CN');
# Since JP.t already tests basic file IO, we will just focus on
# internal encode / decode test here. Unfortunately, to test
# against all the UniHan characters will take a huge disk space,
# not to mention the time it will take, and the fact that Perl
# did not bundle UniHan.txt anyway.
# So, here we just test a typical snippet spanning multiple Unicode
# blocks, and hope it can point out obvious errors.
run_tests('Simplified Chinese only', {
'utf' => (
12298.26131.32463.12299.31532.19968.21350.
24406.26352.65306.
22823.21705.20094.20803.65292.19975.29289.36164.22987.65292.
20035.32479.22825.12290.
20113.34892.38632.26045.65292.21697.29289.27969.24418.12290.
22823.26126.22987.32456.65292.20845.20301.26102.25104.65292.
26102.20056.20845.40857.20197.24481.22825.12290.
20094.36947.21464.21270.65292.21508.27491.24615.21629.65292.
20445.21512.22823.21644.65292.20035.21033.36126.12290.
39318.20986.24246.29289.65292.19975.22269.21688.23425.12290
),
'euc-cn' => join('',
'¡¶Ò×¾¡·µÚÒ»ØÔ',
'åèÔ»£º',
'´óÔÕǬԪ£¬ÍòÎï×Êʼ£¬',
'ÄËͳÌì¡£',
'ÔÆÐÐÓêÊ©£¬Æ·ÎïÁ÷ÐΡ£',
'´óÃ÷ʼÖÕ£¬Áùλʱ³É£¬',
'ʱ³ËÁùÁúÒÔÓùÌì¡£',
'ǬµÀ±ä»¯£¬¸÷ÕýÐÔÃü£¬',
'±£ºÏ´óºÍ£¬ÄËÀûÕê¡£',
'Ê׳öÊüÎÍò¹úÏÌÄþ¡£',
),
'gb2312-raw' => join('',
'!6RW>-!75ZR;XT',
'ehT;#:',
'4sTUG,T*#,MrNoWJJ<#,',
'DKM3Ll!#',
'TFPPSjJ)#,F7NoAwPN!#',
'4sCwJ<VU#,AyN;J13I#,',
'J13KAyAzRTSyLl!#',
'G,5 at 1d;/#,8wU}PTC|#,',
'1#:O4s:M#,DK@{Uj!#',
'JW3vJ|No#,Mr9zOLD~!#'
),
'iso-ir-165'=> join('',
'!6RW>-!75ZR;XT',
'ehT;#:',
'4sTUG,T*#,MrNoWJJ<#,',
'DKM3Ll!#',
'TFPPSjJ)#,F7NoAwPN!#',
'4sCwJ<VU#,AyN;J13I#,',
'J13KAyAzRTSyLl!#',
'G,5 at 1d;/#,8wU}PTC|#,',
'1#:O4s:M#,DK@{Uj!#',
'JW3vJ|No#,Mr9zOLD~!#'
),
});
run_tests('Simplified Chinese + ASCII', {
'utf' => (
35937.26352.65306.10.
22825.34892.20581.65292.21531.23376.20197.33258.24378.19981.24687.12290.10.
28508.40857.21247.29992.65292.38451.22312.19979.20063.12290.32.
35265.40857.22312.30000.65292.24503.26045.26222.20063.12290.32.
32456.26085.20094.20094.65292.21453.22797.36947.20063.12290.10.
25110.36291.22312.28170.65292.36827.26080.21646.20063.12290.39134.
40857.22312.22825.65292.22823.20154.36896.20063.12290.32.
20130.40857.26377.24724.65292.30408.19981.21487.20037.20063.12290.10.
29992.20061.65292.22825.24503.19981.21487.20026.39318.20063.12290
),
'cp936' => join(chr(10),
'ÏóÔ»£º',
'ÌìÐн¡£¬¾ý×ÓÒÔ×ÔÇ¿²»Ï¢¡£',
'DZÁúÎðÓã¬ÑôÔÚÏÂÒ²¡£ ¼ûÁúÔÚÌµÂÊ©ÆÕÒ²¡£ ÖÕÈÕǬǬ£¬·´¸´µÀÒ²¡£',
'»òÔ¾ÔÚÔ¨£¬½øÎÞ¾ÌÒ²¡£·ÉÁúÔÚÌ죬´óÈËÔìÒ²¡£ ¿ºÁúÓлڣ¬Ó¯²»¿É¾ÃÒ²¡£',
'Óþţ¬ÌìµÂ²»¿ÉΪÊ×Ò²¡£',
),
'hz' => join(chr(10),
'~{OsT;#:~}',
'~{LlPP=!#,>}WSRTWTG?2;O"!#~}',
'~{G1AzNpSC#,QtTZOBR2!#~} ~{<{AzTZLo#,5BJ)FUR2!#~} ~{VUHUG,G,#,74845 at R2!#~}',
'~{;rT>TZT(#,=xN^>LR2!#7IAzTZLl#,4sHKTlR2!#~} ~{?:AzSP;Z#,S/2;?I>CR2!#~}',
'~{SC>E#,Ll5B2;?IN*JWR2!#~}',
),
});
run_tests('Traditional Chinese', {
'utf', => 20094.65306.20803.12289.20136.12289.21033.12289.35998,
'gb12345-raw' => 'G,#:T*!":`!"@{!"Uj',
'gbk' => 'Ǭ£ºÔª¡¢ºà¡¢Àû¡¢Ø',
});
sub run_tests {
my ($title, $tests) = @_;
my $utf = delete $tests->{'utf'};
# $enc = encoding, $str = content
foreach my $enc (sort keys %{$tests}) {
my $str = $tests->{$enc};
is(Encode::decode($enc, $str), $utf, "[$enc] decode - $title");
is(Encode::encode($enc, $utf), $str, "[$enc] encode - $title");
my $str2 = $str;
my $utf8 = Encode::encode('utf-8', $utf);
Encode::from_to($str2, $enc, 'utf-8');
is($str2, $utf8, "[$enc] from_to => utf8 - $title");
Encode::from_to($utf8, 'utf-8', $enc); # convert $utf8 as $enc
is($utf8, $str, "[$enc] utf8 => from_to - $title");
}
}
--- NEW FILE: Mod_EUCJP.pm ---
# $Id: Mod_EUCJP.pm,v 1.1 2006-12-05 04:26:37 dslinux_cayenne Exp $
# This file is in euc-jp
package Mod_EUCJP;
use encoding "euc-jp";
sub new {
my $class = shift;
my $str = shift || qw/½é´üʸ»úÎó/;
my $self = bless {
str => '',
}, $class;
$self->set($str);
$self;
}
sub set {
my ($self,$str) = @_;
$self->{str} = $str;
$self;
}
sub str { shift->{str}; }
sub put { print shift->{str}; }
1;
__END__
--- NEW FILE: rt.pl ---
#!/usr/local/bin/perl
#
# $Id: rt.pl,v 1.1 2006-12-05 04:26:38 dslinux_cayenne Exp $
#
BEGIN {
my $ucmdir = "ucm";
if ($ENV{'PERL_CORE'}){
chdir 't';
unshift @INC, '../lib';
$ucmdir = "../ext/Encode/ucm";
}
require Config; import Config;
if ($Config{'extensions'} !~ /\bEncode\b/) {
print "1..0 # Skip: Encode was not built\n";
exit 0;
}
if (ord("A") == 193) {
print "1..0 # Skip: EBCDIC\n";
exit 0;
}
use strict;
require Test::More;
our $DEBUG;
our @ucm;
unless(@ARGV){
use File::Spec;
Test::More->import(tests => 103);
opendir my $dh, $ucmdir or die "$ucmdir:$!";
@ucm =
map {File::Spec->catfile($ucmdir, $_) }
sort grep {/\.ucm$/o} readdir($dh);
closedir $dh;
}else{
Test::More->import("no_plan");
$DEBUG = 1;
@ucm = @ARGV;
}
}
use strict;
use Encode qw/encode decode/;
our $DEBUG;
our @ucm;
for my $ucm (@ucm){
my ($name, $nchar, $nrt, $nok) = rttest($ucm);
$nok += 0;
ok($nok == 0, "$ucm => $name ($nchar, $nrt, $nok)");
}
sub rttest{
my $ucm = shift;
my ($name, $nchar, $nrt, $nok);
open my $rfh, "<$ucm" or die "$ucm:$!";
# <U0000> \x00 |0 # <control>
while(<$rfh>){
s/#.*//o; /^$/ and next;
unless ($name){
/^<code_set_name>\s+"([^\"]+)"/io or next;
$name = $1 and next;
}else{
/^<U([0-9a-f]+)>\s+(\S+)\s+\|(\d)/io or next;
$nchar++;
$3 == 0 or next;
$nrt++;
my $uni = chr(hex($1));
my $enc = eval qq{ "$2" };
decode($name, $enc) eq $uni or $nok++;
encode($name, $uni) eq $enc or $nok++;
}
}
return($name, $nchar, $nrt, $nok);
}
__END__
--- NEW FILE: fallback.t ---
BEGIN {
if ($ENV{'PERL_CORE'}){
chdir 't';
unshift @INC, '../lib';
}
require Config; import Config;
if ($Config{'extensions'} !~ /\bEncode\b/) {
print "1..0 # Skip: Encode was not built\n";
exit 0;
}
if (ord("A") == 193) {
print "1..0 # Skip: EBCDIC\n";
exit 0;
}
$| = 1;
}
use strict;
#use Test::More qw(no_plan);
use Test::More tests => 44;
use Encode q(:all);
my $uo = '';
my $nf = '';
my ($af, $aq, $ap, $ah, $ax, $uf, $uq, $up, $uh, $ux, $ac, $uc);
for my $i (0x20..0x7e){
$uo .= chr($i);
}
$af = $aq = $ap = $ah = $ax = $ac =
$uf = $uq = $up = $uh = $ux = $uc =
$nf = $uo;
my $residue = '';
for my $i (0x80..0xff){
$uo .= chr($i);
$residue .= chr($i);
$af .= '?';
$uf .= "\x{FFFD}";
$ap .= sprintf("\\x{%04x}", $i);
$up .= sprintf("\\x%02X", $i);
$ah .= sprintf("&#%d;", $i);
$uh .= sprintf("\\x%02X", $i);
$ax .= sprintf("&#x%x;", $i);
$ux .= sprintf("\\x%02X", $i);
$ac .= sprintf("<U+%04X>", $i);
$uc .= sprintf("[%02X]", $i);
}
my $ao = $uo;
utf8::upgrade($uo);
my $ascii = find_encoding('ascii');
my $utf8 = find_encoding('utf8');
my $src = $uo;
my $dst = $ascii->encode($src, FB_DEFAULT);
is($dst, $af, "FB_DEFAULT ascii");
is($src, $uo, "FB_DEFAULT residue ascii");
$src = $ao;
$dst = $utf8->decode($src, FB_DEFAULT);
is($dst, $uf, "FB_DEFAULT utf8");
is($src, $ao, "FB_DEFAULT residue utf8");
$src = $uo;
eval{ $dst = $ascii->encode($src, FB_CROAK) };
like($@, qr/does not map to ascii/o, "FB_CROAK ascii");
is($src, $uo, "FB_CROAK residue ascii");
$src = $ao;
eval{ $dst = $utf8->decode($src, FB_CROAK) };
like($@, qr/does not map to Unicode/o, "FB_CROAK utf8");
is($src, $ao, "FB_CROAK residue utf8");
$src = $nf;
eval{ $dst = $ascii->encode($src, FB_CROAK) };
is($@, '', "FB_CROAK on success ascii");
is($src, '', "FB_CROAK on success residue ascii");
$src = $nf;
eval{ $dst = $utf8->decode($src, FB_CROAK) };
is($@, '', "FB_CROAK on success utf8");
is($src, '', "FB_CROAK on success residue utf8");
$src = $uo;
$dst = $ascii->encode($src, FB_QUIET);
is($dst, $aq, "FB_QUIET ascii");
is($src, $residue, "FB_QUIET residue ascii");
$src = $ao;
$dst = $utf8->decode($src, FB_QUIET);
is($dst, $uq, "FB_QUIET utf8");
is($src, $residue, "FB_QUIET residue utf8");
{
my $message = '';
local $SIG{__WARN__} = sub { $message = $_[0] };
$src = $uo;
$dst = $ascii->encode($src, FB_WARN);
is($dst, $aq, "FB_WARN ascii");
is($src, $residue, "FB_WARN residue ascii");
like($message, qr/does not map to ascii/o, "FB_WARN message ascii");
$message = '';
$src = $ao;
$dst = $utf8->decode($src, FB_WARN);
is($dst, $uq, "FB_WARN utf8");
is($src, $residue, "FB_WARN residue utf8");
like($message, qr/does not map to Unicode/o, "FB_WARN message utf8");
$message = '';
$src = $uo;
$dst = $ascii->encode($src, WARN_ON_ERR);
is($dst, $af, "WARN_ON_ERR ascii");
is($src, '', "WARN_ON_ERR residue ascii");
like($message, qr/does not map to ascii/o, "WARN_ON_ERR message ascii");
$message = '';
$src = $ao;
$dst = $utf8->decode($src, WARN_ON_ERR);
is($dst, $uf, "WARN_ON_ERR utf8");
is($src, '', "WARN_ON_ERR residue utf8");
like($message, qr/does not map to Unicode/o, "WARN_ON_ERR message ascii");
}
$src = $uo;
$dst = $ascii->encode($src, FB_PERLQQ);
is($dst, $ap, "FB_PERLQQ encode");
is($src, $uo, "FB_PERLQQ residue encode");
$src = $ao;
$dst = $ascii->decode($src, FB_PERLQQ);
is($dst, $up, "FB_PERLQQ decode");
is($src, $ao, "FB_PERLQQ residue decode");
$src = $uo;
$dst = $ascii->encode($src, FB_HTMLCREF);
is($dst, $ah, "FB_HTMLCREF encode");
is($src, $uo, "FB_HTMLCREF residue encode");
$src = $ao;
$dst = $ascii->decode($src, FB_HTMLCREF);
is($dst, $uh, "FB_HTMLCREF decode");
is($src, $ao, "FB_HTMLCREF residue decode");
$src = $uo;
$dst = $ascii->encode($src, FB_XMLCREF);
is($dst, $ax, "FB_XMLCREF encode");
is($src, $uo, "FB_XMLCREF residue encode");
$src = $ao;
$dst = $ascii->decode($src, FB_XMLCREF);
is($dst, $ux, "FB_XMLCREF decode");
is($src, $ao, "FB_XMLCREF residue decode");
$src = $uo;
$dst = $ascii->encode($src, sub{ sprintf "<U+%04X>", shift });
is($dst, $ac, "coderef encode");
is($src, $uo, "coderef residue encode");
$src = $ao;
$dst = $ascii->decode($src, sub{ sprintf "[%02X]", shift });
is($dst, $uc, "coderef decode");
is($src, $ao, "coderef residue decode");
--- NEW FILE: enc_eucjp.t ---
# $Id: enc_eucjp.t,v 1.1 2006-12-05 04:26:37 dslinux_cayenne Exp $
# This is the twin of enc_utf8.t .
BEGIN {
require Config; import Config;
if ($Config{'extensions'} !~ /\bEncode\b/) {
print "1..0 # Skip: Encode was not built\n";
exit 0;
}
unless (find PerlIO::Layer 'perlio') {
print "1..0 # Skip: PerlIO was not built\n";
exit 0;
}
if (ord("A") == 193) {
print "1..0 # encoding pragma does not support EBCDIC platforms\n";
exit(0);
}
if ($] <= 5.008 and !$Config{perl_patchlevel}){
print "1..0 # Skip: Perl 5.8.1 or later required\n";
exit 0;
}
}
use encoding 'euc-jp';
my @c = (127, 128, 255, 256);
print "1.." . (scalar @c + 1) . "\n";
my @f;
for my $i (0..$#c) {
no warnings 'pack';
my $file = filename("f$i");
push @f, $file;
open(F, ">$file") or die "$0: failed to open '$file' for writing: $!";
binmode(F, ":utf8");
print F chr($c[$i]);
print F pack("C" => $c[$i]);
close F;
}
my $t = 1;
for my $i (0..$#c) {
my $file = filename("f$i");
open(F, "<$file") or die "$0: failed to open '$file' for reading: $!";
binmode(F, ":utf8");
my $c = <F>;
my $o = ord($c);
print $o == $c[$i] ? "ok $t - utf8 I/O $c[$i]\n" : "not ok $t - utf8 I/O $c[$i]: $o != $c[$i]\n";
$t++;
}
my $f = filename("f" . @f);
push @f, $f;
open(F, ">$f") or die "$0: failed to open '$f' for writing: $!";
binmode(F, ":raw"); # Output raw bytes.
print F chr(128); # Output illegal UTF-8.
close F;
open(F, $f) or die "$0: failed to open '$f' for reading: $!";
binmode(F, ":encoding(utf-8)");
{
local $^W = 1;
local $SIG{__WARN__} = sub { $a = shift };
eval { <F> }; # This should get caught.
}
close F;
print $a =~ qr{^utf8 "\\x80" does not map to Unicode} ?
"ok $t - illegal utf8 input\n" : "not ok $t - illegal utf8 input: a = " . unpack("H*", $a) . "\n";
# On VMS temporary file names like "f0." may be more readable than "f0" since
# "f0" could be a logical name pointing elsewhere.
sub filename {
my $name = shift;
$name .= '.' if $^O eq 'VMS';
return $name;
}
END {
1 while unlink @f;
}
--- NEW FILE: mime-header.t ---
#
# $Id: mime-header.t,v 1.1 2006-12-05 04:26:38 dslinux_cayenne Exp $
# This script is written in utf8
#
BEGIN {
if ($ENV{'PERL_CORE'}){
chdir 't';
unshift @INC, '../lib';
}
require Config; import Config;
if ($Config{'extensions'} !~ /\bEncode\b/) {
print "1..0 # Skip: Encode was not built\n";
exit 0;
}
if (ord("A") == 193) {
print "1..0 # Skip: EBCDIC\n";
exit 0;
}
$| = 1;
}
no utf8;
use strict;
#use Test::More qw(no_plan);
use Test::More tests => 10;
use_ok("Encode::MIME::Header");
my $eheader =<<'EOS';
From: =?US-ASCII?Q?Keith_Moore?= <moore at cs.utk.edu>
To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld at dkuug.dk>
CC: =?ISO-8859-1?Q?Andr=E9?= Pirard <PIRARD at vm1.ulg.ac.be>
Subject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=
=?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=
EOS
my $dheader=<<"EOS";
From: Keith Moore <moore\@cs.utk.edu>
To: Keld J\xF8rn Simonsen <keld\@dkuug.dk>
CC: Andr\xE9 Pirard <PIRARD\@vm1.ulg.ac.be>
Subject: If you can read this you understand the example.
EOS
is(Encode::decode('MIME-Header', $eheader), $dheader, "decode ASCII (RFC2047)");
use utf8;
my $uheader =<<'EOS';
From: =?US-ASCII?Q?Keith_Moore?= <moore at cs.utk.edu>
To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld at dkuug.dk>
CC: =?ISO-8859-1?Q?Andr=E9?= Pirard <PIRARD at vm1.ulg.ac.be>
Subject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=
=?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=
EOS
is(Encode::decode('MIME-Header', $uheader), $dheader, "decode UTF-8 (RFC2047)");
$dheader=<<'EOS';
From: å°é£¼ å¼¾ <dankogai at dan.co.jp>
To: dankogai at dan.co.jp (å°é£¼=Kogai, å¼¾=Dan)
Subject: æ¼¢åãã«ã¿ã«ããã²ãããªãå«ããé常ã«é·ãã¿ã¤ãã«è¡ãä¸ä½å
¨ä½ã©ã®ããã«ãã¦Encodeãããã®ãï¼
EOS
my $bheader =<<'EOS';
From:=?UTF-8?B?IOWwj+mjvCDlvL4g?=<dankogai at dan.co.jp>
To: dankogai at dan.co.jp (=?UTF-8?B?5bCP6aO8?==Kogai,=?UTF-8?B?IOW8vg==?==Dan
)
Subject:
=?UTF-8?B?IOa8ouWtl+OAgeOCq+OCv+OCq+ODiuOAgeOBsuOCieOBjOOBquOCkuWQq+OCgA==?=
=?UTF-8?B?44CB6Z2e5bi444Gr6ZW344GE44K/44Kk44OI44Or6KGM44GM5LiA5L2T5YWo?=
=?UTF-8?B?5L2T44Gp44Gu44KI44GG44Gr44GX44GmRW5jb2Rl44GV44KM44KL44Gu44GL?=
=?UTF-8?B?77yf?=
EOS
my $qheader=<<'EOS';
From:=?UTF-8?Q?=20=E5=B0=8F=E9=A3=BC=20=E5=BC=BE=20?=<dankogai at dan.co.jp>
To: dankogai at dan.co.jp (=?UTF-8?Q?=E5=B0=8F=E9=A3=BC?==Kogai,
=?UTF-8?Q?=20=E5=BC=BE?==Dan)
Subject:
=?UTF-8?Q?=20=E6=BC=A2=E5=AD=97=E3=80=81=E3=82=AB=E3=82=BF=E3=82=AB?=
=?UTF-8?Q?=E3=83=8A=E3=80=81=E3=81=B2=E3=82=89=E3=81=8C=E3=81=AA=E3=82=92?=
=?UTF-8?Q?=E5=90=AB=E3=82=80=E3=80=81=E9=9D=9E=E5=B8=B8=E3=81=AB=E9=95=B7?=
=?UTF-8?Q?=E3=81=84=E3=82=BF=E3=82=A4=E3=83=88=E3=83=AB=E8=A1=8C=E3=81=8C?=
=?UTF-8?Q?=E4=B8=80=E4=BD=93=E5=85=A8=E4=BD=93=E3=81=A9=E3=81=AE=E3=82=88?=
=?UTF-8?Q?=E3=81=86=E3=81=AB=E3=81=97=E3=81=A6Encode=E3=81=95?=
=?UTF-8?Q?=E3=82=8C=E3=82=8B=E3=81=AE=E3=81=8B=EF=BC=9F?=
EOS
is(Encode::decode('MIME-Header', $bheader), $dheader, "decode B");
is(Encode::decode('MIME-Header', $qheader), $dheader, "decode Q");
is(Encode::encode('MIME-B', $dheader)."\n", $bheader, "encode B");
is(Encode::encode('MIME-Q', $dheader)."\n", $qheader, "encode Q");
$dheader = "What is =?UTF-8?B?w4RwZmVs?= ?";
$bheader = "What is =?UTF-8?B?PT9VVEYtOD9CP3c0UndabVZzPz0=?= ?";
$qheader = "What is =?UTF-8?Q?=3D=3FUTF=2D8=3FB=3Fw4RwZmVs=3F=3D?= ?";
is(Encode::encode('MIME-B', $dheader), $bheader, "Double decode B");
is(Encode::encode('MIME-Q', $dheader), $qheader, "Double decode Q");
{
# From: Dave Evans <dave at rudolf.org.uk>
# Subject: Bug in Encode::MIME::Header
# Message-Id: <3F43440B.7060606 at rudolf.org.uk>
use charnames ":full";
my $pound_1024 = "\N{POUND SIGN}1024";
is(Encode::encode('MIME-Q' => $pound_1024), '=?UTF-8?Q?=C2=A31024?=',
'pound 1024');
}
__END__;
--- NEW FILE: enc_module.enc ---
(This appears to be a binary file; contents omitted.)
--- NEW FILE: grow.t ---
#!../perl
our $POWER;
BEGIN {
if ($ENV{'PERL_CORE'}){
chdir 't';
unshift @INC, '../lib';
}
require Config; import Config;
if ($Config{'extensions'} !~ /\bEncode\b/) {
print "1..0 # Skip: Encode was not built\n";
exit 0;
}
$POWER = 12; # up to 1 MB. You may adjust the figure here
}
use strict;
use Encode;
my $seed = "";
for my $i (0x00..0xff){
my $c = chr($i);
$seed .= ($c =~ /^\p{IsPrint}/o) ? $c : " ";
}
use Test::More tests => $POWER*2;
my $octs = $seed;
use bytes ();
for my $i (1..$POWER){
$octs .= $octs;
my $len = bytes::length($octs);
my $utf8 = Encode::decode('latin1', $octs);
ok(1, "decode $len bytes");
is($octs,
Encode::encode('latin1', $utf8),
"encode $len bytes");
}
__END__
--- NEW FILE: Encode.t ---
BEGIN {
if ($ENV{'PERL_CORE'}){
chdir 't';
unshift @INC, '../lib';
}
if (ord("A") == 193) {
print "1..0 # Skip: EBCDIC\n";
exit 0;
}
require Config; import Config;
if ($Config{'extensions'} !~ /\bEncode\b/) {
print "1..0 # Skip: Encode was not built\n";
exit 0;
}
}
use strict;
use Test;
use Encode qw(from_to encode decode
encode_utf8 decode_utf8
find_encoding is_utf8);
use charnames qw(greek);
my @encodings = grep(/iso-?8859/,Encode::encodings());
my $n = 2;
my @character_set = ('0'..'9', 'A'..'Z', 'a'..'z');
my @source = qw(ascii iso8859-1 cp1250);
my @destiny = qw(cp1047 cp37 posix-bc);
my @ebcdic_sets = qw(cp1047 cp37 posix-bc);
plan test => 38+$n*@encodings + 2*@source*@destiny*@character_set + 2*@ebcdic_sets*256 + 6 + 2;
my $str = join('',map(chr($_),0x20..0x7E));
my $cpy = $str;
ok(length($str),from_to($cpy,'iso8859-1','Unicode'),"Length Wrong");
ok($cpy,$str,"ASCII mangled by translating from iso8859-1 to Unicode");
$cpy = $str;
ok(from_to($cpy,'Unicode','iso8859-1'),length($str),"Length wrong");
ok($cpy,$str,"ASCII mangled by translating from Unicode to iso8859-1");
$str = join('',map(chr($_),0xa0..0xff));
$cpy = $str;
ok(length($str),from_to($cpy,'iso8859-1','Unicode'),"Length Wrong");
my $sym = Encode->getEncoding('symbol');
my $uni = $sym->decode(encode(ascii => 'a'));
ok("\N{alpha}",substr($uni,0,1),"alpha does not map to symbol 'a'");
$str = $sym->encode("\N{Beta}");
ok("B",decode(ascii => substr($str,0,1)),"Symbol 'B' does not map to Beta");
foreach my $enc (qw(symbol dingbats ascii), at encodings)
{
my $tab = Encode->getEncoding($enc);
ok(1,defined($tab),"Could not load $enc");
$str = join('',map(chr($_),0x20..0x7E));
$uni = $tab->decode($str);
$cpy = $tab->encode($uni);
ok($cpy,$str,"$enc mangled translating to Unicode and back");
}
# On ASCII based machines see if we can map several codepoints from
# three distinct ASCII sets to three distinct EBCDIC coded character sets.
# On EBCDIC machines see if we can map from three EBCDIC sets to three
# distinct ASCII sets.
my @expectation = (240..249, 193..201,209..217,226..233, 129..137,145..153,162..169);
if (ord('A') != 65) {
my @temp = @destiny;
@destiny = @source;
@source = @temp;
undef(@temp);
@expectation = (48..57, 65..90, 97..122);
}
foreach my $to (@destiny)
{
foreach my $from (@source)
{
my @expected = @expectation;
foreach my $chr (@character_set)
{
my $native_chr = $chr;
my $cpy = $chr;
my $rc = from_to($cpy,$from,$to);
ok(1,$rc,"Could not translate from $from to $to");
ok(ord($cpy),shift(@expected),"mangled translating $native_chr from $from to $to");
}
}
}
# On either ASCII or EBCDIC machines ensure we can take the full one
# byte repetoire to EBCDIC sets and back.
my $enc_as = 'iso8859-1';
foreach my $enc_eb (@ebcdic_sets)
{
foreach my $ord (0..255)
{
$str = chr($ord);
my $rc = from_to($str,$enc_as,$enc_eb);
$rc += from_to($str,$enc_eb,$enc_as);
ok($rc,2,"return code for $ord $enc_eb -> $enc_as -> $enc_eb was not obtained");
ok($ord,ord($str),"$enc_as mangled translating $ord to $enc_eb and back");
}
}
my $mime = find_encoding('iso-8859-2');
ok(defined($mime),1,"Cannot find MIME-ish'iso-8859-2'");
my $x11 = find_encoding('iso8859-2');
ok(defined($x11),1,"Cannot find X11-ish 'iso8859-2'");
ok($mime,$x11,"iso8598-2 and iso-8859-2 not same");
my $spc = find_encoding('iso 8859-2');
ok(defined($spc),1,"Cannot find 'iso 8859-2'");
ok($spc,$mime,"iso 8859-2 and iso-8859-2 not same");
for my $i (256,128,129,256)
{
my $c = chr($i);
my $s = "$c\n".sprintf("%02X",$i);
ok(utf8::valid($s),1,"concat of $i botched");
utf8::upgrade($s);
ok(utf8::valid($s),1,"concat of $i botched");
}
# Spot check a few points in/out of utf8
for my $i (ord('A'),128,256,0x20AC)
{
my $c = chr($i);
my $o = encode_utf8($c);
ok(decode_utf8($o),$c,"decode_utf8 not inverse of encode_utf8 for $i");
ok(encode('utf8',$c),$o,"utf8 encode by name broken for $i");
ok(decode('utf8',$o),$c,"utf8 decode by name broken for $i");
}
# is_utf8
ok( is_utf8("\x{100}"));
ok(! is_utf8("a"));
ok(! is_utf8(""));
"\x{100}" =~ /(.)/;
ok( is_utf8($1)); # ID 20011127.151
$a = $1;
ok( is_utf8($a));
$a = "\x{100}";
chop $a;
ok( is_utf8($a)); # weird but true: an empty UTF-8 string
# non-string arguments
package Encode::Dummy;
use overload q("") => sub { $_[0]->[0] };
sub new { my $class = shift; bless [ @_ ] => $class }
package main;
ok(decode(latin1 => Encode::Dummy->new("foobar")), "foobar");
ok(encode(utf8 => Encode::Dummy->new("foobar")), "foobar");
--- NEW FILE: mime_header_iso2022jp.t ---
use Test::More tests => 14;
use strict;
use Encode;
BEGIN{
use_ok('Encode::MIME::Header::ISO_2022_JP');
}
require_ok('Encode::MIME::Header::ISO_2022_JP');
# below codes are from mime.t in Jcode
my %mime = (
"´Á»ú¡¢¥«¥¿¥«¥Ê¡¢¤Ò¤é¤¬¤Ê"
=> "=?ISO-2022-JP?B?GyRCNEE7eiEiJSslPyUrJUohIiRSJGkkLCRKGyhC?=",
"foo bar"
=> "foo bar",
"´Á»ú¡¢¥«¥¿¥«¥Ê¡¢¤Ò¤é¤¬¤Ê¤Îº®¤¸¤Ã¤¿Subject Header."
=> "=?ISO-2022-JP?B?GyRCNEE7eiEiJSslPyUrJUohIiRSJGkkLCRKJE46LiQ4JEMkPxsoQlN1?=\n =?ISO-2022-JP?B?YmplY3Q=?= Header.",
);
for my $k (keys %mime){
$mime{"$k\n"} = $mime{$k} . "\n";
}
for my $decoded (sort keys %mime){
my $encoded = $mime{$decoded};
my $header = Encode::encode('MIME-Header-ISO_2022_JP', decode('euc-jp', $decoded));
my $utf8 = Encode::decode('MIME-Header', $header);
is(encode('euc-jp', $utf8), $decoded);
is($header, $encoded);
}
__END__
--- NEW FILE: ksc5601.enc ---
(This appears to be a binary file; contents omitted.)
--- NEW FILE: gb2312.enc ---
(This appears to be a binary file; contents omitted.)
--- NEW FILE: jisx0201.enc ---
(This appears to be a binary file; contents omitted.)
--- NEW FILE: jisx0208.utf ---
(This appears to be a binary file; contents omitted.)
--- NEW FILE: jisx0212.utf ---
(This appears to be a binary file; contents omitted.)
--- NEW FILE: CJKT.t ---
BEGIN {
if ($ENV{'PERL_CORE'}){
chdir 't';
unshift @INC, '../lib';
}
require Config; import Config;
if ($Config{'extensions'} !~ /\bEncode\b/) {
print "1..0 # Skip: Encode was not built\n";
exit 0;
}
if (ord("A") == 193) {
print "1..0 # Skip: EBCDIC\n";
exit 0;
}
# should work w/o PerlIO now!
# unless (PerlIO::Layer->find('perlio')){
# print "1..0 # Skip: PerlIO required\n";
# exit 0;
# }
$| = 1;
}
use strict;
use Test::More tests => 60;
use Encode;
use File::Basename;
use File::Spec;
use File::Compare qw(compare_text);
our $DEBUG = shift || 0;
my %Charset =
(
'big5-eten' => [qw(big5-eten)],
'big5-hkscs' => [qw(big5-hkscs)],
gb2312 => [qw(euc-cn hz)],
jisx0201 => [qw(euc-jp shiftjis 7bit-jis)],
jisx0208 => [qw(euc-jp shiftjis 7bit-jis iso-2022-jp iso-2022-jp-1)],
jisx0212 => [qw(euc-jp 7bit-jis iso-2022-jp-1)],
ksc5601 => [qw(euc-kr iso-2022-kr johab)],
);
my $dir = dirname(__FILE__);
my $seq = 1;
for my $charset (sort keys %Charset){
my ($src, $uni, $dst, $txt);
my $transcoder = find_encoding($Charset{$charset}[0]) or die;
my $src_enc = File::Spec->catfile($dir,"$charset.enc");
my $src_utf = File::Spec->catfile($dir,"$charset.utf");
my $dst_enc = File::Spec->catfile($dir,"$$.enc");
my $dst_utf = File::Spec->catfile($dir,"$$.utf");
open $src, "<$src_enc" or die "$src_enc : $!";
if (PerlIO::Layer->find('perlio')){
binmode($src, ":bytes"); # needed when :utf8 in default open layer
}
$txt = join('',<$src>);
close($src);
eval{ $uni = $transcoder->decode($txt, 1) };
$@ and print $@;
ok(defined($uni), "decode $charset"); $seq++;
is(length($txt),0, "decode $charset completely"); $seq++;
open $dst, ">$dst_utf" or die "$dst_utf : $!";
if (PerlIO::Layer->find('perlio')){
binmode($dst, ":utf8");
print $dst $uni;
}else{ # ugh!
binmode($dst);
my $raw = $uni; Encode::_utf8_off($raw);
print $dst $raw;
}
close($dst);
is(compare_text($dst_utf, $src_utf), 0, "$dst_utf eq $src_utf")
or ($DEBUG and rename $dst_utf, "$dst_utf.$seq");
$seq++;
open $src, "<$src_utf" or die "$src_utf : $!";
if (PerlIO::Layer->find('perlio')){
binmode($src, ":utf8");
$uni = join('', <$src>);
}else{ # ugh!
binmode($src);
$uni = join('', <$src>);
Encode::_utf8_on($uni);
}
close $src;
my $unisave = $uni;
eval{ $txt = $transcoder->encode($uni,1) };
$@ and print $@;
ok(defined($txt), "encode $charset"); $seq++;
is(length($uni), 0, "encode $charset completely"); $seq++;
$uni = $unisave;
open $dst,">$dst_enc" or die "$dst_utf : $!";
binmode($dst);
print $dst $txt;
close($dst);
is(compare_text($src_enc, $dst_enc), 0 => "$dst_enc eq $src_enc")
or ($DEBUG and rename $dst_enc, "$dst_enc.$seq");
$seq++;
unlink($dst_utf, $dst_enc);
for my $encoding (@{$Charset{$charset}}){
my $rt = decode($encoding, encode($encoding, $uni));
is ($rt, $uni, "RT $encoding");
}
}
--- NEW FILE: perlio.t ---
BEGIN {
if ($ENV{'PERL_CORE'}){
chdir 't';
unshift @INC, '../lib';
}
require Config; import Config;
if ($Config{'extensions'} !~ /\bEncode\b/) {
print "1..0 # Skip: Encode was not built\n";
exit 0;
}
if (ord("A") == 193) {
print "1..0 # Skip: EBCDIC\n";
exit 0;
}
unless (PerlIO::Layer->find('perlio')){
print "1..0 # Skip: PerlIO required\n";
exit 0;
}
$| = 1;
}
use strict;
use File::Basename;
use File::Spec;
use File::Compare qw(compare_text);
use File::Copy;
use FileHandle;
#use Test::More qw(no_plan);
use Test::More tests => 38;
our $DEBUG = 0;
use Encode (":all");
{
no warnings;
@ARGV and $DEBUG = shift;
#require Encode::JP::JIS7;
#require Encode::KR::2022_KR;
#$Encode::JP::JIS7::DEBUG = $DEBUG;
}
my $seq = 0;
my $dir = dirname(__FILE__);
my %e =
(
jisx0208 => [ qw/euc-jp shiftjis 7bit-jis iso-2022-jp iso-2022-jp-1/],
ksc5601 => [ qw/euc-kr/],
gb2312 => [ qw/euc-cn hz/],
);
$/ = "\x0a"; # may fix VMS problem for test #28 and #29
for my $src (sort keys %e) {
my $ufile = File::Spec->catfile($dir,"$src.utf");
open my $fh, "<:utf8", $ufile or die "$ufile : $!";
my @uline = <$fh>;
my $utext = join('' => @uline);
close $fh;
for my $e (@{$e{$src}}){
my $sfile = File::Spec->catfile($dir,"$$.sio");
my $pfile = File::Spec->catfile($dir,"$$.pio");
# first create a file without perlio
dump2file($sfile, &encode($e, $utext, 0));
# then create a file via perlio without autoflush
SKIP:{
skip "$e: !perlio_ok", 4 unless (perlio_ok($e) or $DEBUG);
no warnings 'uninitialized';
open $fh, ">:encoding($e)", $pfile or die "$sfile : $!";
$fh->autoflush(0);
print $fh $utext;
close $fh;
$seq++;
is(compare_text($sfile, $pfile), 0 => ">:encoding($e)");
if ($DEBUG){
copy $sfile, "$sfile.$seq";
copy $pfile, "$pfile.$seq";
}
# this time print line by line.
# works even for ISO-2022 but not ISO-2022-KR
open $fh, ">:encoding($e)", $pfile or die "$sfile : $!";
$fh->autoflush(1);
for my $l (@uline) {
print $fh $l;
}
close $fh;
$seq++;
is(compare_text($sfile, $pfile), 0 => ">:encoding($e) by lines");
if ($DEBUG){
copy $sfile, "$sfile.$seq";
copy $pfile, "$pfile.$seq";
}
my $dtext;
open $fh, "<:encoding($e)", $pfile or die "$pfile : $!";
$fh->autoflush(0);
$dtext = join('' => <$fh>);
close $fh;
$seq++;
ok($utext eq $dtext, "<:encoding($e)");
if ($DEBUG){
dump2file("$sfile.$seq", $utext);
dump2file("$pfile.$seq", $dtext);
}
if (perlio_ok($e) or $DEBUG){
$dtext = '';
open $fh, "<:encoding($e)", $pfile or die "$pfile : $!";
while(defined(my $l = <$fh>)) {
$dtext .= $l;
}
close $fh;
}
$seq++;
ok($utext eq $dtext, "<:encoding($e) by lines");
if ($DEBUG){
dump2file("$sfile.$seq", $utext);
dump2file("$pfile.$seq", $dtext);
}
}
if ( ! $DEBUG ) {
1 while unlink ($sfile);
1 while unlink ($pfile);
}
}
}
# BOM Test
SKIP:{
my $pev = PerlIO::encoding->VERSION;
skip "PerlIO::encoding->VERSION = $pev <= 0.07 ", 6
unless ($pev >= 0.07 or $DEBUG);
my $file = File::Spec->catfile($dir,"jisx0208.utf");
open my $fh, "<:utf8", $file or die "$file : $!";
my $str = join('' => <$fh>);
close $fh;
my %bom = (
'UTF-16BE' => pack('n', 0xFeFF),
'UTF-16LE' => pack('v', 0xFeFF),
'UTF-32BE' => pack('N', 0xFeFF),
'UTF-32LE' => pack('V', 0xFeFF),
);
# reading
for my $utf (sort keys %bom){
my $bomed = $bom{$utf} . encode($utf, $str);
my $sfile = File::Spec->catfile($dir,".${utf}_${seq}_$$");
dump2file($sfile, $bomed);
my $utf_nobom = $utf; $utf_nobom =~ s/(LE|BE)$//o;
# reading
open $fh, "<:encoding($utf_nobom)", $sfile or die "$sfile : $!";
my $cmp = join '' => <$fh>;
close $fh;
is($str, $cmp, "<:encoding($utf_nobom) eq $utf");
unlink $sfile; $seq++;
}
# writing
for my $utf_nobom (qw/UTF-16 UTF-32/){
my $utf = $utf_nobom . 'BE';
my $sfile = File::Spec->catfile($dir,".${utf_nobom}_${seq}_$$");
my $bomed = $bom{$utf} . encode($utf, $str);
open $fh, ">:encoding($utf_nobom)", $sfile or die "$sfile : $!";
print $fh $str;
close $fh;
open my $fh, "<:bytes", $sfile or die "$sfile : $!";
read $fh, my $cmp, -s $sfile;
close $fh;
use bytes ();
ok($bomed eq $cmp, ">:encoding($utf_nobom) eq $utf");
unlink $sfile; $seq++;
}
}
sub dump2file{
no warnings;
open my $fh, ">", $_[0] or die "$_[0]: $!";
binmode $fh;
print $fh $_[1];
close $fh;
}
--- NEW FILE: at-tw.t ---
BEGIN {
if (! -d 'blib' and -d 't'){ chdir 't' };
unshift @INC, '../lib';
require Config; import Config;
if ($Config{'extensions'} !~ /\bEncode\b/) {
print "1..0 # Skip: Encode was not built\n";
exit 0;
}
unless (find PerlIO::Layer 'perlio') {
print "1..0 # Skip: PerlIO was not built\n";
exit 0;
}
if (ord("A") == 193) {
print "1..0 # Skip: EBCDIC\n";
exit 0;
}
$| = 1;
}
use strict;
use Test::More tests => 17;
use Encode;
no utf8; # we have raw Chinese encodings here
use_ok('Encode::TW');
# Since JP.t already tests basic file IO, we will just focus on
# internal encode / decode test here. Unfortunately, to test
# against all the UniHan characters will take a huge disk space,
# not to mention the time it will take, and the fact that Perl
# did not bundle UniHan.txt anyway.
# So, here we just test a typical snippet spanning multiple Unicode
# blocks, and hope it can point out obvious errors.
run_tests('Basic Big5 range', {
'utf' => (
24093.39640.38525.20043.33495.35028.20846.65292.
26389.30343.32771.26352.20271.24248.65108.
25885.25552.35998.20110.23391.38508.20846.65292.
24799.24218.23493.21566.20197.38477.65108
),
'big5' => (join('',
'«Ò°ª¶§¤§]¸Ç¤¼¡A®Ó¬Ó¦Ò¤ê§B±e¡Q',
'Äá´£s¤_©s³µ¤¼¡A±©©°±G§^¥H°¡Q',
)),
'big5-hkscs'=> (join('',
'«Ò°ª¶§¤§]¸Ç¤¼¡A®Ó¬Ó¦Ò¤ê§B±e¡Q',
'Äá´£s¤_©s³µ¤¼¡A±©©°±G§^¥H°¡Q',
)),
'cp950' => (join('',
'«Ò°ª¶§¤§]¸Ç¤¼¡A®Ó¬Ó¦Ò¤ê§B±e¡Q',
'Äá´£s¤_©s³µ¤¼¡A±©©°±G§^¥H°¡Q',
)),
});
run_tests('Hong Kong Extensions', {
'utf' => (
24863.35613.25152.26377.20351.29992.32.80.101.114.108.32.
22021.26379.21451.65292.32102.25105.21707.22021.
25903.25345.12289.24847.35211.21644.40723.21237.
22914.26524.32232.30908.26377.20219.20309.37679.28431.
65292.35531.21578.35380.25105.21707.12290
),
'big5-hkscs' => join('',
'·PÁ©Ҧ³¨Ï¥Î Perl ïªB¤Í¡Aµ¹§Ú]ï¤ä«ù¡B·N¨£©M¹ªÀy',
'¦pªG½s½X¦³¥ô¦ó¿ùº|¡A½Ð§i¶D§Ú]¡C'
),
});
sub run_tests {
my ($title, $tests) = @_;
my $utf = delete $tests->{'utf'};
# $enc = encoding, $str = content
foreach my $enc (sort keys %{$tests}) {
my $str = $tests->{$enc};
is(Encode::decode($enc, $str), $utf, "[$enc] decode - $title");
is(Encode::encode($enc, $utf), $str, "[$enc] encode - $title");
my $str2 = $str;
my $utf8 = Encode::encode('utf-8', $utf);
Encode::from_to($str2, $enc, 'utf-8');
is($str2, $utf8, "[$enc] from_to => utf8 - $title");
Encode::from_to($utf8, 'utf-8', $enc); # convert $utf8 as $enc
is($utf8, $str, "[$enc] utf8 => from_to - $title");
}
}
--- NEW FILE: jisx0208.enc ---
(This appears to be a binary file; contents omitted.)
--- NEW FILE: enc_data.t ---
# $Id: enc_data.t,v 1.1 2006-12-05 04:26:37 dslinux_cayenne Exp $
BEGIN {
require Config; import Config;
if ($Config{'extensions'} !~ /\bEncode\b/) {
print "1..0 # Skip: Encode was not built\n";
exit 0;
}
unless (find PerlIO::Layer 'perlio') {
print "1..0 # Skip: PerlIO was not built\n";
exit 0;
}
if (ord("A") == 193) {
print "1..0 # encoding pragma does not support EBCDIC platforms\n";
exit(0);
}
if ($] <= 5.008 and !$Config{perl_patchlevel}){
print "1..0 # Skip: Perl 5.8.1 or later required\n";
exit 0;
}
}
use strict;
use encoding 'euc-jp';
use Test::More tests => 4;
my @a;
while (<DATA>) {
chomp;
tr/¤¡-¤ó¥¡-¥ó/¥¡-¥ó¤¡-¤ó/;
push @a, $_;
}
is(scalar @a, 3);
is($a[0], "¥³¥ì¥ÏDATA¤Õ¤¡¤¤¤ë¤Ï¤ó¤É¤ë¥Î¤Æ¤¹¤È¥Ç¥¹¡£");
is($a[1], "ÆüËܸ쥬¥Á¥ã¥ó¥ÈÊÑ´¹¥Ç¥¥ë¥«");
is($a[2], "¥É¥¦¥«¥Î¤Æ¤¹¤È¥ò¥·¥Æ¥¤¥Þ¥¹¡£");
__DATA__
¤³¤ì¤ÏDATA¥Õ¥¡¥¤¥ë¥Ï¥ó¥É¥ë¤Î¥Æ¥¹¥È¤Ç¤¹¡£
ÆüËܸ줬¤Á¤ã¤ó¤ÈÊÑ´¹¤Ç¤¤ë¤«
¤É¤¦¤«¤Î¥Æ¥¹¥È¤ò¤·¤Æ¤¤¤Þ¤¹¡£
--- NEW FILE: big5-hkscs.enc ---
(This appears to be a binary file; contents omitted.)
--- NEW FILE: utf8strict.t ---
#!../perl
our $DEBUG = @ARGV;
our (%ORD, %SEQ, $NTESTS);
BEGIN {
if ($ENV{'PERL_CORE'}){
chdir 't';
unshift @INC, '../lib';
}
require Config; import Config;
if ($Config{'extensions'} !~ /\bEncode\b/) {
print "1..0 # Skip: Encode was not built\n";
exit 0;
}
if ($] <= 5.008 and !$Config{perl_patchlevel}){
print "1..0 # Skip: Perl 5.8.1 or later required\n";
exit 0;
}
# http://smontagu.damowmow.com/utf8test.html
%ORD = (
0x00000080 => 0, # 2.1.2
0x00000800 => 0, # 2.1.3
0x00010000 => 0, # 2.1.4
0x00200000 => 1, # 2.1.5
0x00400000 => 1, # 2.1.6
0x0000007F => 0, # 2.2.1 -- unmapped okay
0x000007FF => 0, # 2.2.2
0x0000FFFF => 1, # 2.2.3
0x001FFFFF => 1, # 2.2.4
0x03FFFFFF => 1, # 2.2.5
0x7FFFFFFF => 1, # 2.2.6
0x0000D800 => 1, # 5.1.1
0x0000DB7F => 1, # 5.1.2
0x0000D880 => 1, # 5.1.3
0x0000DBFF => 1, # 5.1.4
0x0000DC00 => 1, # 5.1.5
0x0000DF80 => 1, # 5.1.6
0x0000DFFF => 1, # 5.1.7
# 5.2 "Paird UTF-16 surrogates skipped
# because utf-8-strict raises exception at the first one
0x0000FFFF => 1, # 5.3.1
);
$NTESTS += scalar keys %ORD;
%SEQ = (
qq/ed 9f bf/ => 0, # 2.3.1
qq/ee 80 80/ => 0, # 2.3.2
qq/f4 8f bf bf/ => 0, # 2.3.3
qq/f4 90 80 80/ => 1, # 2.3.4 -- out of range so NG
# "3 Malformed sequences" are checked by perl.
# "4 Overlong sequences" are checked by perl.
);
$NTESTS += scalar keys %SEQ;
}
use strict;
use Encode;
use utf8;
use Test::More tests => $NTESTS;
local($SIG{__WARN__}) = sub { $DEBUG and $@ and print STDERR $@ };
my $d = find_encoding("utf-8-strict");
for my $u (sort keys %ORD){
my $c = chr($u);
eval { $d->encode($c,1) };
$DEBUG and $@ and warn $@;
my $t = $@ ? 1 : 0;
is($t, $ORD{$u}, sprintf "U+%04X", $u);
}
for my $s (sort keys %SEQ){
my $o = pack "C*" => map {hex} split /\s+/, $s;
eval { $d->decode($o,1) };
$DEBUG and $@ and warn $@;
my $t = $@ ? 1 : 0;
is($t, $SEQ{$s}, $s);
}
__END__
--- NEW FILE: enc_utf8.t ---
# $Id: enc_utf8.t,v 1.1 2006-12-05 04:26:37 dslinux_cayenne Exp $
# This is the twin of enc_eucjp.t .
BEGIN {
require Config; import Config;
if ($Config{'extensions'} !~ /\bEncode\b/) {
print "1..0 # Skip: Encode was not built\n";
exit 0;
}
unless (find PerlIO::Layer 'perlio') {
print "1..0 # Skip: PerlIO was not built\n";
exit 0;
}
if (ord("A") == 193) {
print "1..0 # encoding pragma does not support EBCDIC platforms\n";
exit(0);
}
}
use encoding 'utf8';
my @c = (127, 128, 255, 256);
print "1.." . (scalar @c + 1) . "\n";
my @f;
for my $i (0..$#c) {
my $file = filename("f$i");
push @f, $file;
open(F, ">$file") or die "$0: failed to open '$file' for writing: $!";
binmode(F, ":utf8");
print F chr($c[$i]);
close F;
}
my $t = 1;
for my $i (0..$#c) {
my $file = filename("f$i");
open(F, "<$file") or die "$0: failed to open '$file' for reading: $!";
binmode(F, ":utf8");
my $c = <F>;
my $o = ord($c);
print $o == $c[$i] ? "ok $t - utf8 I/O $c[$i]\n" : "not ok $t - utf8 I/O $c[$i]: $o != $c[$i]\n";
$t++;
}
my $f = filename("f" . @f);
push @f, $f;
open(F, ">$f") or die "$0: failed to open '$f' for writing: $!";
binmode(F, ":raw"); # Output raw bytes.
print F chr(128); # Output illegal UTF-8.
close F;
open(F, $f) or die "$0: failed to open '$f' for reading: $!";
binmode(F, ":encoding(utf-8)");
{
local $^W = 1;
local $SIG{__WARN__} = sub { $a = shift };
eval { <F> }; # This should get caught.
}
close F;
print $a =~ qr{^utf8 "\\x80" does not map to Unicode} ?
"ok $t - illegal utf8 input\n" : "not ok $t - illegal utf8 input: a = " . unpack("H*", $a) . "\n";
# On VMS temporary file names like "f0." may be more readable than "f0" since
# "f0" could be a logical name pointing elsewhere.
sub filename {
my $name = shift;
$name .= '.' if $^O eq 'VMS';
return $name;
}
END {
1 while unlink @f;
}
--- NEW FILE: gsm0338.t ---
BEGIN {
if ($ENV{'PERL_CORE'}){
chdir 't';
unshift @INC, '../lib';
}
require Config; import Config;
if ($Config{'extensions'} !~ /\bEncode\b/) {
print "1..0 # Skip: Encode was not built\n";
exit 0;
}
$| = 1;
}
use strict;
use Test::More tests => 21;
use Encode;
# The specification of GSM 03.38 is not awfully clear.
# (http://www.unicode.org/Public/MAPPINGS/ETSI/GSM0338.TXT)
# The various combinations of 0x00 and 0x1B as leading bytes
# are unclear, as is the semantics of those bytes as standalone
# or as final single bytes.
sub t { is(decode("gsm0338", my $t = $_[0]), $_[1]) }
# t("\x00", "\x00"); # ???
# "Round-trip".
t("\x41", "\x41");
t("\x01", "\xA3");
t("\x02", "\x24");
t("\x03", "\xA5");
t("\x09", "\xE7");
t("\x00\x00", "\x00\x00"); # Maybe?
t("\x00\x1B", "\x40\xA0"); # Maybe?
t("\x00\x41", "\x40\x41");
# t("\x1B", "\x1B"); # ???
# Escape with no special second byte is just a NBSP.
t("\x1B\x41", "\xA0\x41");
t("\x1B\x00", "\xA0\x40"); # Maybe?
# Special escape characters.
t("\x1B\x0A", "\x0C");
t("\x1B\x14", "\x5E");
t("\x1B\x28", "\x7B");
t("\x1B\x29", "\x7D");
t("\x1B\x2F", "\x5C");
t("\x1B\x3C", "\x5B");
t("\x1B\x3D", "\x7E");
t("\x1B\x3E", "\x5D");
t("\x1B\x40", "\x7C");
t("\x1B\x40", "\x7C");
t("\x1B\x65", "\x{20AC}");
--- NEW FILE: big5-eten.enc ---
(This appears to be a binary file; contents omitted.)
--- NEW FILE: guess.t ---
BEGIN {
if ($ENV{'PERL_CORE'}){
chdir 't';
unshift @INC, '../lib';
}
require Config; import Config;
if ($Config{'extensions'} !~ /\bEncode\b/) {
print "1..0 # Skip: Encode was not built\n";
exit 0;
}
if (ord("A") == 193) {
print "1..0 # Skip: EBCDIC\n";
exit 0;
}
$| = 1;
}
use strict;
use File::Basename;
use File::Spec;
use Encode qw(decode encode find_encoding _utf8_off);
#use Test::More qw(no_plan);
use Test::More tests => 29;
use_ok("Encode::Guess");
{
no warnings;
$Encode::Guess::DEBUG = shift || 0;
}
my $ascii = join('' => map {chr($_)}(0x21..0x7e));
my $latin1 = join('' => map {chr($_)}(0xa1..0xfe));
my $utf8on = join('' => map {chr($_)}(0x3000..0x30fe));
my $utf8off = $utf8on; _utf8_off($utf8off);
my $utf16 = encode('UTF-16', $utf8on);
my $utf32 = encode('UTF-32', $utf8on);
is(guess_encoding($ascii)->name, 'ascii', 'ascii');
like(guess_encoding($latin1), qr/No appropriate encoding/io, 'no ascii');
is(guess_encoding($latin1, 'latin1')->name, 'iso-8859-1', 'iso-8859-1');
is(guess_encoding($utf8on)->name, 'utf8', 'utf8 w/ flag');
is(guess_encoding($utf8off)->name, 'utf8', 'utf8 w/o flag');
is(guess_encoding($utf16)->name, 'UTF-16', 'UTF-16');
is(guess_encoding($utf32)->name, 'UTF-32', 'UTF-32');
my $jisx0201 = File::Spec->catfile(dirname(__FILE__), 'jisx0201.utf');
my $jisx0208 = File::Spec->catfile(dirname(__FILE__), 'jisx0208.utf');
my $jisx0212 = File::Spec->catfile(dirname(__FILE__), 'jisx0212.utf');
open my $fh, $jisx0208 or die "$jisx0208: $!";
binmode($fh);
$utf8off = join('' => <$fh>);
close $fh;
$utf8on = decode('utf8', $utf8off);
my @jp = qw(7bit-jis shiftjis euc-jp);
Encode::Guess->set_suspects(@jp);
for my $jp (@jp){
my $test = encode($jp, $utf8on);
is(guess_encoding($test)->name, $jp, "JP:$jp");
}
is (decode('Guess', encode('euc-jp', $utf8on)), $utf8on, "decode('Guess')");
eval{ encode('Guess', $utf8on) };
like($@, qr/not defined/io, "no encode()");
my %CJKT =
(
'euc-cn' => File::Spec->catfile(dirname(__FILE__), 'gb2312.utf'),
'euc-jp' => File::Spec->catfile(dirname(__FILE__), 'jisx0208.utf'),
'euc-kr' => File::Spec->catfile(dirname(__FILE__), 'ksc5601.utf'),
'big5-eten' => File::Spec->catfile(dirname(__FILE__), 'big5-eten.utf'),
);
Encode::Guess->set_suspects(keys %CJKT);
for my $name (keys %CJKT){
open my $fh, $CJKT{$name} or die "$CJKT{$name}: $!";
binmode($fh);
$utf8off = join('' => <$fh>);
close $fh;
my $test = encode($name, decode('utf8', $utf8off));
is(guess_encoding($test)->name, $name, "CJKT:$name");
}
my $ambiguous = "\x{5c0f}\x{98fc}\x{5f3e}";
my $english = "The quick brown fox jumps over the black lazy dog.";
for my $utf (qw/UTF-16 UTF-32/){
for my $bl (qw/BE LE/){
my $test = encode("$utf$bl" => $english);
is(guess_encoding($test)->name, "$utf$bl", "$utf$bl");
}
}
for my $bl (qw/BE LE/){
my $test = encode("UTF-16$bl" => $ambiguous);
my $result = guess_encoding($test);
ok(! ref($result), "UTF-16$bl:$result");
}
Encode::Guess->set_suspects();
for my $jp (@jp){
# intentionally set $1 a priori -- see Changes
my $test = "English";
'$1' =~ m/^(.*)/o;
is(guess_encoding($test, ($jp))->name, 'ascii',
"ascii vs $jp (\$1 messed)");
$test = encode($jp, $test . "\n\x{65e5}\x{672c}\x{8a9e}");
is(guess_encoding($test, ($jp))->name,
$jp, "$jp vs ascii (\$1 messed)");
}
__END__;
--- NEW FILE: unibench.pl ---
#!./perl
use strict;
use Encode;
use Benchmark qw(:all);
my $Count = shift @ARGV;
$Count ||= 16;
my @sizes = @ARGV || (1, 4, 16);
my %utf8_seed;
for my $i (0x00..0xff){
my $c = chr($i);
$utf8_seed{BMP} .= ($c =~ /^\p{IsPrint}/o) ? $c : " ";
}
utf8::upgrade($utf8_seed{BMP});
for my $i (0x00..0xff){
my $c = chr(0x10000+$i);
$utf8_seed{HIGH} .= ($c =~ /^\p{IsPrint}/o) ? $c : " ";
}
utf8::upgrade($utf8_seed{HIGH});
my %S;
for my $i (@sizes){
my $sz = 256 * $i;
for my $cp (qw(BMP HIGH)){
$S{utf8}{$sz}{$cp} = $utf8_seed{$cp} x $i;
$S{utf16}{$sz}{$cp} = encode('UTF-16BE', $S{utf8}{$sz}{$cp});
}
}
for my $i (@sizes){
my $sz = $i * 256;
my $count = $Count * int(256/$i);
for my $cp (qw(BMP HIGH)){
for my $op (qw(encode decode)){
my ($meth, $from, $to) = ($op eq 'encode') ?
(\&encode, 'utf8', 'utf16') : (\&decode, 'utf16', 'utf8');
my $XS = sub {
Encode::Unicode::set_transcoder("xs");
$meth->('UTF-16BE', $S{$from}{$sz}{$cp})
eq $S{$to}{$sz}{$cp}
or die "$op,$from,$to,$sz,$cp";
};
my $modern = sub {
Encode::Unicode::set_transcoder("modern");
$meth->('UTF-16BE', $S{$from}{$sz}{$cp})
eq $S{$to}{$sz}{$cp}
or die "$op,$from,$to,$sz,$cp";
};
my $classic = sub {
Encode::Unicode::set_transcoder("classic");
$meth->('UTF-16BE', $S{$from}{$sz}{$cp})
eq $S{$to}{$sz}{$cp} or
die "$op,$from,$to,$sz,$cp";
};
print "---- $op length=$sz/range=$cp ----\n";
my $r = timethese($count,
{
"XS" => $XS,
"Modern" => $modern,
"Classic" => $classic,
},
'none',
);
cmpthese($r);
}
}
}
--- NEW FILE: jisx0212.enc ---
(This appears to be a binary file; contents omitted.)
--- NEW FILE: ksc5601.utf ---
(This appears to be a binary file; contents omitted.)
--- NEW FILE: jisx0201.utf ---
(This appears to be a binary file; contents omitted.)
--- NEW FILE: Aliases.t ---
#!../perl
BEGIN {
if ($ENV{'PERL_CORE'}){
chdir 't';
unshift @INC, '../lib';
}
require Config; import Config;
if ($Config{'extensions'} !~ /\bEncode\b/) {
print "1..0 # Skip: Encode was not built\n";
exit 0;
}
}
use strict;
use Encode;
use Encode::Alias;
my %a2c;
my $ON_EBCDIC;
sub init_a2c{
%a2c = (
'US-ascii' => 'ascii',
'ISO-646-US' => 'ascii',
'UTF-8' => 'utf-8-strict',
'UCS-2' => 'UCS-2BE',
'UCS2' => 'UCS-2BE',
'iso-10646-1' => 'UCS-2BE',
'ucs2-le' => 'UCS-2LE',
'ucs2-be' => 'UCS-2BE',
'utf16' => 'UTF-16',
'utf32' => 'UTF-32',
'utf16-be' => 'UTF-16BE',
'utf32-be' => 'UTF-32BE',
'utf16-le' => 'UTF-16LE',
'utf32-le' => 'UTF-32LE',
'UCS4-BE' => 'UTF-32BE',
'UCS-4-LE' => 'UTF-32LE',
'cyrillic' => 'iso-8859-5',
'arabic' => 'iso-8859-6',
'greek' => 'iso-8859-7',
'hebrew' => 'iso-8859-8',
'thai' => 'iso-8859-11',
'tis620' => 'iso-8859-11',
'WinLatin1' => 'cp1252',
'WinLatin2' => 'cp1250',
'WinCyrillic' => 'cp1251',
'WinGreek' => 'cp1253',
'WinTurkish' => 'cp1254',
'WinHebrew' => 'cp1255',
'WinArabic' => 'cp1256',
'WinBaltic' => 'cp1257',
'WinVietnamese' => 'cp1258',
'koi8r' => 'koi8-r',
'koi8u' => 'koi8-u',
'ja_JP.euc' => $ON_EBCDIC ? '' : 'euc-jp',
'x-euc-jp' => $ON_EBCDIC ? '' : 'euc-jp',
'zh_CN.euc' => $ON_EBCDIC ? '' : 'euc-cn',
'x-euc-cn' => $ON_EBCDIC ? '' : 'euc-cn',
'ko_KR.euc' => $ON_EBCDIC ? '' : 'euc-kr',
'x-euc-kr' => $ON_EBCDIC ? '' : 'euc-kr',
'ujis' => $ON_EBCDIC ? '' : 'euc-jp',
'Shift_JIS' => $ON_EBCDIC ? '' : 'shiftjis',
'x-sjis' => $ON_EBCDIC ? '' : 'shiftjis',
'jis' => $ON_EBCDIC ? '' : '7bit-jis',
'big-5' => $ON_EBCDIC ? '' : 'big5-eten',
'zh_TW.Big5' => $ON_EBCDIC ? '' : 'big5-eten',
'tca-big5' => $ON_EBCDIC ? '' : 'big5-eten',
'big5-hk' => $ON_EBCDIC ? '' : 'big5-hkscs',
'hkscs-big5' => $ON_EBCDIC ? '' : 'big5-hkscs',
'GB_2312-80' => $ON_EBCDIC ? '' : 'euc-cn',
'KS_C_5601-1987' => $ON_EBCDIC ? '' : 'cp949',
#
'gb12345-raw' => $ON_EBCDIC ? '' : 'gb12345-raw',
'gb2312-raw' => $ON_EBCDIC ? '' : 'gb2312-raw',
'jis0201-raw' => $ON_EBCDIC ? '' : 'jis0201-raw',
'jis0208-raw' => $ON_EBCDIC ? '' : 'jis0208-raw',
'jis0212-raw' => $ON_EBCDIC ? '' : 'jis0212-raw',
'ksc5601-raw' => $ON_EBCDIC ? '' : 'ksc5601-raw',
);
for my $i (1..11,13..16){
$a2c{"ISO 8859 $i"} = "iso-8859-$i";
}
for my $i (1..10){
$a2c{"ISO Latin $i"} = "iso-8859-$Encode::Alias::Latin2iso[$i]";
}
for my $k (keys %Encode::Alias::Winlatin2cp){
my $v = $Encode::Alias::Winlatin2cp{$k};
$a2c{"Win" . ucfirst($k)} = "cp" . $v;
$a2c{"IBM-$v"} = $a2c{"MS-$v"} = "cp" . $v;
$a2c{"cp-" . $v} = "cp" . $v;
}
my @a2c = keys %a2c;
for my $k (@a2c){
$a2c{uc($k)} = $a2c{$k};
$a2c{lc($k)} = $a2c{$k};
$a2c{lcfirst($k)} = $a2c{$k};
$a2c{ucfirst($k)} = $a2c{$k};
}
}
BEGIN{
$ON_EBCDIC = ord("A") == 193;
@ARGV and $ON_EBCDIC = $ARGV[0] eq 'EBCDIC';
$Encode::ON_EBCDIC = $ON_EBCDIC;
init_a2c();
}
if ($ON_EBCDIC){
delete @Encode::ExtModule{
qw(euc-cn gb2312 gb12345 gbk cp936 iso-ir-165 MacChineseSimp
euc-jp iso-2022-jp 7bit-jis shiftjis MacJapanese cp932
euc-kr ksc5601 cp949 MacKorean
big5 big5-hkscs cp950 MacChineseTrad
gb18030 big5plus euc-tw)
};
}
use Test::More tests => (scalar keys %a2c) * 4;
print "# alias test; \$ON_EBCDIC == $ON_EBCDIC\n";
foreach my $a (keys %a2c){
my $e = Encode::find_encoding($a);
is((defined($e) and $e->name), $a2c{$a},$a)
or warn "alias was $a";;
}
# now we override some of the aliases and see if it works fine
define_alias(
qr/ascii/i => 'WinLatin1',
qr/cyrillic/i => 'WinCyrillic',
qr/arabic/i => 'WinArabic',
qr/greek/i => 'WinGreek',
qr/hebrew/i => 'WinHebrew'
);
print "# alias test with alias overrides\n";
foreach my $a (keys %a2c){
my $e = Encode::find_encoding($a);
is((defined($e) and $e->name), $a2c{$a}, "Override $a")
or warn "alias was $a";
}
print "# alias undef test\n";
Encode::Alias->undef_aliases;
foreach my $a (keys %a2c){
my $e = Encode::find_encoding($a);
ok(!defined($e) || $e->name =~ /-raw$/o,"Undef $a")
or warn "alias was $a";
}
print "# alias reinit test\n";
Encode::Alias->init_aliases;
init_a2c();
foreach my $a (keys %a2c){
my $e = Encode::find_encoding($a);
is((defined($e) and $e->name), $a2c{$a}, "Reinit $a")
or warn "alias was $a";
}
__END__
for my $k (keys %a2c){
$k =~ /[A-Z]/ and next;
print "$k => $a2c{$k}\n";
}
--- NEW FILE: big5-eten.utf ---
(This appears to be a binary file; contents omitted.)
--- NEW FILE: encoding.t ---
BEGIN {
require Config; import Config;
if ($Config{'extensions'} !~ /\bEncode\b/) {
print "1..0 # Skip: Encode was not built\n";
exit 0;
}
unless (find PerlIO::Layer 'perlio') {
print "1..0 # Skip: PerlIO was not built\n";
exit 0;
}
if (ord("A") == 193) {
print "1..0 # encoding pragma does not support EBCDIC platforms\n";
exit(0);
}
}
print "1..31\n";
use encoding "latin1"; # ignored (overwritten by the next line)
use encoding "greek"; # iso 8859-7 (no "latin" alias, surprise...)
# "greek" is "ISO 8859-7", and \xDF in ISO 8859-7 is
# \x{3AF} in Unicode (GREEK SMALL LETTER IOTA WITH TONOS),
# instead of \xDF in Unicode (LATIN SMALL LETTER SHARP S)
$a = "\xDF";
$b = "\x{100}";
print "not " unless ord($a) == 0x3af;
print "ok 1\n";
print "not " unless ord($b) == 0x100;
print "ok 2\n";
my $c;
$c = $a . $b;
print "not " unless ord($c) == 0x3af;
print "ok 3\n";
print "not " unless length($c) == 2;
print "ok 4\n";
print "not " unless ord(substr($c, 1, 1)) == 0x100;
print "ok 5\n";
print "not " unless ord(chr(0xdf)) == 0x3af; # spooky
print "ok 6\n";
print "not " unless ord(pack("C", 0xdf)) == 0x3af;
print "ok 7\n";
# we didn't break pack/unpack, I hope
print "not " unless unpack("C", pack("C", 0xdf)) == 0xdf;
print "ok 8\n";
# the first octet of UTF-8 encoded 0x3af
print "not " unless unpack("C", chr(0xdf)) == 0xce;
print "ok 9\n";
print "not " unless unpack("U", pack("U", 0xdf)) == 0xdf;
print "ok 10\n";
print "not " unless unpack("U", chr(0xdf)) == 0x3af;
print "ok 11\n";
# charnames must still work
use charnames ':full';
print "not " unless ord("\N{LATIN SMALL LETTER SHARP S}") == 0xdf;
print "ok 12\n";
# combine
$c = "\xDF\N{LATIN SMALL LETTER SHARP S}" . chr(0xdf);
print "not " unless ord($c) == 0x3af;
print "ok 13\n";
print "not " unless ord(substr($c, 1, 1)) == 0xdf;
print "ok 14\n";
print "not " unless ord(substr($c, 2, 1)) == 0x3af;
print "ok 15\n";
# regex literals
print "not " unless "\xDF" =~ /\x{3AF}/;
print "ok 16\n";
print "not " unless "\x{3AF}" =~ /\xDF/;
print "ok 17\n";
print "not " unless "\xDF" =~ /\xDF/;
print "ok 18\n";
print "not " unless "\x{3AF}" =~ /\x{3AF}/;
print "ok 19\n";
# eq, cmp
my ($byte,$bytes,$U,$Ub,$g1,$g2,$l) = (
pack("C*", 0xDF ), # byte
pack("C*", 0xDF, 0x20), # ($bytes2 cmp $U) > 0
pack("U*", 0x3AF), # $U eq $byte
pack("U*", 0xDF ), # $Ub would eq $bytev w/o use encoding
pack("U*", 0x3B1), # ($g1 cmp $byte) > 0; === chr(0xe1)
pack("U*", 0x3AF, 0x20), # ($g2 cmp $byte) > 0;
pack("U*", 0x3AB), # ($l cmp $byte) < 0; === chr(0xdb)
);
# all the tests in this section that compare a byte encoded string
# ato UTF-8 encoded are run in all possible vairants
# all of the eq, ne, cmp operations tested,
# $v z $u tested as well as $u z $v
sub alleq($$){
my ($a,$b) = (shift, shift);
$a eq $b && $b eq $a &&
!( $a ne $b ) && !( $b ne $a ) &&
( $a cmp $b ) == 0 && ( $b cmp $a ) == 0;
}
sub anyeq($$){
my ($a,$b) = (shift, shift);
$a eq $b || $b eq $a ||
!( $a ne $b ) || !( $b ne $a ) ||
( $a cmp $b ) == 0 || ( $b cmp $a ) == 0;
}
sub allgt($$){
my ($a,$b) = (shift, shift);
( $a cmp $b ) == 1 && ( $b cmp $a ) == -1;
}
#match the correct UTF-8 string
print "not " unless alleq($byte, $U);
print "ok 20\n";
#do not match a wrong UTF-8 string
print "not " if anyeq($byte, $Ub);
print "ok 21\n";
#string ordering
print "not " unless allgt ( $g1, $byte ) &&
allgt ( $g2, $byte ) &&
allgt ( $byte, $l ) &&
allgt ( $bytes, $U );
print "ok 22\n";
# upgrade, downgrade
my ($u,$v,$v2);
$u = $v = $v2 = pack("C*", 0xDF);
utf8::upgrade($v); #explicit upgrade
$v2 = substr( $v2."\x{410}", 0, -1); #implicit upgrade
# implicit upgrade === explicit upgrade
print "not " if do{{use bytes; $v ne $v2}} || $v ne $v2;
print "ok 23\n";
# utf8::upgrade is transparent and does not break equality
print "not " unless alleq( $u, $v );
print "ok 24\n";
$u = $v = pack("C*", 0xDF);
utf8::upgrade($v);
#test for a roundtrip, we should get back from where we left
eval {utf8::downgrade( $v )};
print "not " if $@ !~ /^Wide / || do{{use bytes; $u eq $v}} || $u ne $v;
print "ok 25\n";
# some more eq, cmp
$byte=pack("C*", 0xDF);
print "not " unless pack("U*", 0x3AF) eq $byte;
print "ok 26\n";
print "not " if chr(0xDF) cmp $byte;
print "ok 27\n";
print "not " unless ((pack("U*", 0x3B0) cmp $byte) == 1) &&
((pack("U*", 0x3AE) cmp $byte) == -1) &&
((pack("U*", 0x3AF, 0x20) cmp $byte) == 1) &&
((pack("U*", 0x3AF) cmp pack("C*",0xDF,0x20))==-1);
print "ok 28\n";
{
# Used to core dump in 5.7.3
no warnings; # so test goes noiselessly
print ord(undef) == 0 ? "ok 29\n" : "not ok 29\n";
}
{
my %h1;
my %h2;
$h1{"\xdf"} = 41;
$h2{"\x{3af}"} = 42;
print $h1{"\x{3af}"} == 41 ? "ok 30\n" : "not ok 30\n";
print $h2{"\xdf"} == 42 ? "ok 31\n" : "not ok 31\n";
}
--- NEW FILE: gb2312.utf ---
(This appears to be a binary file; contents omitted.)
--- NEW FILE: Unicode.t ---
#
# $Id: Unicode.t,v 1.1 2006-12-05 04:26:37 dslinux_cayenne Exp $
#
# This script is written entirely in ASCII, even though quoted literals
# do include non-BMP unicode characters -- Are you happy, jhi?
#
BEGIN {
require Config; import Config;
if ($Config{'extensions'} !~ /\bEncode\b/) {
print "1..0 # Skip: Encode was not built\n";
exit 0;
}
if (ord("A") == 193) {
print "1..0 # Skip: EBCDIC\n";
exit 0;
}
$| = 1;
}
use strict;
#use Test::More 'no_plan';
use Test::More tests => 37;
use Encode qw(encode decode);
#
# see
# http://www.unicode.org/unicode/reports/tr19/
#
my $dankogai = "\x{5c0f}\x{98fc}\x{3000}\x{5f3e}";
my $nasty = "$dankogai\x{1abcd}";
my $fallback = "$dankogai\x{fffd}";
#hi: (0x1abcd - 0x10000) / 0x400 + 0xD800 = 0xd82a
#lo: (0x1abcd - 0x10000) % 0x400 + 0xDC00 = 0xdfcd
my $n_16be =
pack("C*", map {hex($_)} qw<5c 0f 98 fc 30 00 5f 3e d8 2a df cd>);
my $n_16le =
pack("C*", map {hex($_)} qw<0f 5c fc 98 00 30 3e 5f 2a d8 cd df>);
my $f_16be =
pack("C*", map {hex($_)} qw<5c 0f 98 fc 30 00 5f 3e ff fd>);
my $f_16le =
pack("C*", map {hex($_)} qw<0f 5c fc 98 00 30 3e 5f fd ff>);
my $n_32be =
pack("C*", map {hex($_)}
qw<00 00 5c 0f 00 00 98 fc 00 00 30 00 00 00 5f 3e 00 01 ab cd>);
my $n_32le =
pack("C*", map {hex($_)}
qw<0f 5c 00 00 fc 98 00 00 00 30 00 00 3e 5f 00 00 cd ab 01 00>);
my $n_16bb = pack('n', 0xFeFF) . $n_16be;
my $n_16lb = pack('v', 0xFeFF) . $n_16le;
my $n_32bb = pack('N', 0xFeFF) . $n_32be;
my $n_32lb = pack('V', 0xFeFF) . $n_32le;
is($n_16be, encode('UTF-16BE', $nasty), qq{encode UTF-16BE});
is($n_16le, encode('UTF-16LE', $nasty), qq{encode UTF-16LE});
is($n_32be, encode('UTF-32BE', $nasty), qq{encode UTF-32BE});
is($n_32le, encode('UTF-32LE', $nasty), qq{encode UTF-16LE});
is($nasty, decode('UTF-16BE', $n_16be), qq{decode UTF-16BE});
is($nasty, decode('UTF-16LE', $n_16le), qq{decode UTF-16LE});
is($nasty, decode('UTF-32BE', $n_32be), qq{decode UTF-32BE});
is($nasty, decode('UTF-32LE', $n_32le), qq{decode UTF-32LE});
is($n_16bb, encode('UTF-16', $nasty), qq{encode UTF-16});
is($n_32bb, encode('UTF-32', $nasty), qq{encode UTF-32});
is($nasty, decode('UTF-16', $n_16bb), qq{decode UTF-16, bom=be});
is($nasty, decode('UTF-16', $n_16lb), qq{decode UTF-16, bom=le});
is($nasty, decode('UTF-32', $n_32bb), qq{decode UTF-32, bom=be});
is($nasty, decode('UTF-32', $n_32lb), qq{decode UTF-32, bom=le});
is(decode('UCS-2BE', $n_16be), $fallback, "decode UCS-2BE: fallback");
is(decode('UCS-2LE', $n_16le), $fallback, "decode UCS-2LE: fallback");
eval { decode('UCS-2BE', $n_16be, 1) };
is (index($@,'UCS-2BE:'), 0, "decode UCS-2BE: exception");
eval { decode('UCS-2LE', $n_16le, 1) };
is (index($@,'UCS-2LE:'), 0, "decode UCS-2LE: exception");
is(encode('UCS-2BE', $nasty), $f_16be, "encode UCS-2BE: fallback");
is(encode('UCS-2LE', $nasty), $f_16le, "encode UCS-2LE: fallback");
eval { encode('UCS-2BE', $nasty, 1) };
is(index($@, 'UCS-2BE'), 0, "encode UCS-2BE: exception");
eval { encode('UCS-2LE', $nasty, 1) };
is(index($@, 'UCS-2LE'), 0, "encode UCS-2LE: exception");
#
# SvGROW test for (en|de)code_xs
#
SKIP: {
my $utf8 = '';
for my $j (0,0x10){
for my $i (0..0xffff){
$j == 0 and (0xD800 <= $i && $i <= 0xDFFF) and next;
$utf8 .= ord($j+$i);
}
for my $major ('UTF-16', 'UTF-32'){
for my $minor ('BE', 'LE'){
my $enc = $major.$minor;
is(decode($enc, encode($enc, $utf8)), $utf8, "$enc RT");
}
}
}
};
#
# CJKT vs. UTF-7
#
use File::Spec;
use File::Basename;
my $dir = dirname(__FILE__);
opendir my $dh, $dir or die "$dir:$!";
my @file = sort grep {/\.utf$/o} readdir $dh;
closedir $dh;
for my $file (@file){
my $path = File::Spec->catfile($dir, $file);
open my $fh, '<', $path or die "$path:$!";
my $content;
if (PerlIO::Layer->find('perlio')){
binmode $fh => ':utf8';
$content = join('' => <$fh>);
}else{ # ugh!
binmode $fh;
$content = join('' => <$fh>);
Encode::_utf8_on($content)
}
close $fh;
is(decode("UTF-7", encode("UTF-7", $content)), $content,
"UTF-7 RT:$file");
}
1;
__END__
--- NEW FILE: Encoder.t ---
#
# $Id: Encoder.t,v 1.1 2006-12-05 04:26:37 dslinux_cayenne Exp $
#
BEGIN {
require Config; import Config;
if ($Config{'extensions'} !~ /\bEncode\b/) {
print "1..0 # Skip: Encode was not built\n";
exit 0;
}
$| = 1;
}
use strict;
#use Test::More 'no_plan';
use Test::More tests => 516;
use Encode::Encoder qw(encoder);
use MIME::Base64;
package Encode::Base64;
use base 'Encode::Encoding';
__PACKAGE__->Define('base64');
use MIME::Base64;
sub encode{
my ($obj, $data) = @_;
return encode_base64($data);
}
sub decode{
my ($obj, $data) = @_;
return decode_base64($data);
}
package main;
my $e = encoder("foo", "ascii");
ok ($e->data("bar"));
is ($e->data, "bar");
ok ($e->encoding("latin1"));
is ($e->encoding, "iso-8859-1");
my $data = '';
for my $i (0..255){
no warnings;
$data .= chr($i);
my $base64 = encode_base64($data);
is(encoder($data)->base64, $base64, "encode");
is(encoder($base64)->bytes('base64'), $data, "decode");
}
1;
__END__
More information about the dslinux-commit
mailing list