dslinux/user/perl/ext/Unicode/Normalize Changes Makefile.PL Normalize.pm Normalize.xs README mkheader

cayenne dslinux_cayenne at user.in-berlin.de
Tue Dec 5 05:26:59 CET 2006


Update of /cvsroot/dslinux/dslinux/user/perl/ext/Unicode/Normalize
In directory antilope:/tmp/cvs-serv7729/ext/Unicode/Normalize

Added Files:
	Changes Makefile.PL Normalize.pm Normalize.xs README mkheader 
Log Message:
Adding fresh perl source to HEAD to branch from

--- NEW FILE: Makefile.PL ---
require 5.006001;
use ExtUtils::MakeMaker;

my $clean = {};

if (-f "Normalize.xs") {
    print STDERR "Making header files for XS...\n";

    do 'mkheader' or die $@ || "mkheader: $!";

    $clean = { FILES => 'unfcan.h unfcmb.h unfcmp.h unfcpt.h unfexc.h' };
}

WriteMakefile(
    'INSTALLDIRS'	=> $] >= 5.007002 ? 'perl' : 'site',
    'NAME'		=> 'Unicode::Normalize',
    'VERSION_FROM'	=> 'Normalize.pm', # finds $VERSION
    'clean'		=> $clean,
    'PREREQ_PM'	  	=> {
	Carp		=> 0,
	constant	=> 0,
	DynaLoader	=> 0,
	Exporter	=> 0,
	File::Copy	=> 0,
	File::Spec	=> 0,
	strict		=> 0,
	Test		=> 0,
	warnings	=> 0,
    },
);

--- NEW FILE: Normalize.xs ---

#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"

/* These 5 files are prepared by mkheader */
#include "unfcmb.h"
#include "unfcan.h"
#include "unfcpt.h"
#include "unfcmp.h"
#include "unfexc.h"

/* Perl 5.6.1 ? */
#ifndef uvuni_to_utf8
#define uvuni_to_utf8   uv_to_utf8
#endif /* uvuni_to_utf8 */

/* Perl 5.6.1 ? */
#ifndef utf8n_to_uvuni
#define utf8n_to_uvuni  utf8_to_uv
#endif /* utf8n_to_uvuni */

/* UTF8_ALLOW_BOM is used before Perl 5.8.0 */
#ifdef UTF8_ALLOW_BOM
#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FFFF)
#else
#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
#endif

/* if utf8n_to_uvuni() sets retlen to 0 (?) */
#define ErrRetlenIsZero "panic (Unicode::Normalize): zero-length character"

/* utf8_hop() hops back before start. Maybe broken UTF-8 */
#define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start"

/* At present, char > 0x10ffff are unaffected without complaint, right? */
#define VALID_UTF_MAX    (0x10ffff)
#define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))

/* HANGUL_H */
#define Hangul_SBase  0xAC00
#define Hangul_SFinal 0xD7A3
#define Hangul_SCount  11172

#define Hangul_NCount    588

#define Hangul_LBase  0x1100
#define Hangul_LFinal 0x1112
#define Hangul_LCount     19

#define Hangul_VBase  0x1161
#define Hangul_VFinal 0x1175
#define Hangul_VCount     21

#define Hangul_TBase  0x11A7
#define Hangul_TFinal 0x11C2
#define Hangul_TCount     28

#define Hangul_IsS(u)  ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal))
#define Hangul_IsN(u)  (((u) - Hangul_SBase) % Hangul_TCount == 0)
#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
#define Hangul_IsL(u)  ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal))
#define Hangul_IsV(u)  ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal))
#define Hangul_IsT(u)  ((Hangul_TBase  < (u)) && ((u) <= Hangul_TFinal))
/* HANGUL_H */

/* this is used for canonical ordering of combining characters (c.c.). */
typedef struct {
    U8 cc;	/* combining class */
    UV uv;	/* codepoint */
    STRLEN pos; /* position */
} UNF_cc;

static int compare_cc (const void *a, const void *b)
{
    int ret_cc;
    ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc;
    if (ret_cc)
	return ret_cc;

    return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos )
	 - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos );
}

static U8* dec_canonical (UV uv)
{
    U8 ***plane, **row;
    if (OVER_UTF_MAX(uv))
	return NULL;
    plane = (U8***)UNF_canon[uv >> 16];
    if (! plane)
	return NULL;
    row = plane[(uv >> 8) & 0xff];
    return row ? row[uv & 0xff] : NULL;
}

static U8* dec_compat (UV uv)
{
    U8 ***plane, **row;
    if (OVER_UTF_MAX(uv))
	return NULL;
    plane = (U8***)UNF_compat[uv >> 16];
    if (! plane)
	return NULL;
    row = plane[(uv >> 8) & 0xff];
    return row ? row[uv & 0xff] : NULL;
}

static UV composite_uv (UV uv, UV uv2)
{
    UNF_complist ***plane, **row, *cell, *i;

    if (! uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
	return 0;

    if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
	uv  -= Hangul_LBase; /* lindex */
	uv2 -= Hangul_VBase; /* vindex */
	return(Hangul_SBase + (uv * Hangul_VCount + uv2) * Hangul_TCount);
    }
    if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
	uv2 -= Hangul_TBase; /* tindex */
	return(uv + uv2);
    }
    plane = UNF_compos[uv >> 16];
    if (! plane)
	return 0;
    row = plane[(uv >> 8) & 0xff];
    if (! row)
	return 0;
    cell = row[uv & 0xff];
    if (! cell)
	return 0;
    for (i = cell; i->nextchar; i++) {
	if (uv2 == i->nextchar)
	    return i->composite;
    }
    return 0;
}

static U8 getCombinClass (UV uv)
{
    U8 **plane, *row;
    if (OVER_UTF_MAX(uv))
	return 0;
    plane = (U8**)UNF_combin[uv >> 16];
    if (! plane)
	return 0;
    row = plane[(uv >> 8) & 0xff];
    return row ? row[uv & 0xff] : 0;
}

static void sv_cat_decompHangul (SV* sv, UV uv)
{
    UV sindex, lindex, vindex, tindex;
    U8 *t, tmp[3 * UTF8_MAXLEN + 1];

    if (! Hangul_IsS(uv))
	return;

    sindex =  uv - Hangul_SBase;
    lindex =  sindex / Hangul_NCount;
    vindex = (sindex % Hangul_NCount) / Hangul_TCount;
    tindex =  sindex % Hangul_TCount;

    t = tmp;
    t = uvuni_to_utf8(t, (lindex + Hangul_LBase));
    t = uvuni_to_utf8(t, (vindex + Hangul_VBase));
    if (tindex)
	t = uvuni_to_utf8(t, (tindex + Hangul_TBase));
    *t = '\0';
    sv_catpvn(sv, (char *)tmp, t - tmp);
}

static void sv_cat_uvuni (SV* sv, UV uv)
{
    U8 *t, tmp[UTF8_MAXLEN + 1];

    t = tmp;
    t = uvuni_to_utf8(t, uv);
    *t = '\0';
    sv_catpvn(sv, (char *)tmp, t - tmp);
}

static char * sv_2pvunicode(SV *sv, STRLEN *lp)
{
    char *s;
    STRLEN len;
    s = (char*)SvPV(sv,len);
    if (!SvUTF8(sv)) {
	SV* tmpsv = sv_mortalcopy(sv);
	if (!SvPOK(tmpsv))
	    (void)sv_pvn_force(tmpsv,&len);
	sv_utf8_upgrade(tmpsv);
	s = (char*)SvPV(tmpsv,len);
    }
    *lp = len;
    return s;
}

MODULE = Unicode::Normalize	PACKAGE = Unicode::Normalize

SV*
decompose(src, compat = &PL_sv_no)
    SV * src
    SV * compat
  PROTOTYPE: $;$
  PREINIT:
    SV *dst;
    STRLEN srclen, retlen;
    U8 *s, *e, *p, *r;
    UV uv;
    bool iscompat;
  CODE:
    iscompat = SvTRUE(compat);
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    dst = newSV(1);
    (void)SvPOK_only(dst);
    SvUTF8_on(dst);

    for (p = s; p < e; p += retlen) {
	uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
	if (!retlen)
	    croak(ErrRetlenIsZero);

	if (Hangul_IsS(uv))
	    sv_cat_decompHangul(dst, uv);
	else {
	    r = iscompat ? dec_compat(uv) : dec_canonical(uv);
	    if (r)
		sv_catpv(dst, (char *)r);
	    else
		sv_cat_uvuni(dst, uv);
	}
    }
    RETVAL = dst;
  OUTPUT:
    RETVAL



SV*
reorder(src)
    SV * src
  PROTOTYPE: $
  PREINIT:
    SV *dst;
    STRLEN srclen, dstlen, retlen, stk_cc_max;
    U8 *s, *e, *p, *d, curCC;
    UV uv, uvlast;
    UNF_cc * stk_cc;
    STRLEN i, cc_pos;
    bool valid_uvlast;
  CODE:
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    dstlen = srclen + 1;
    dst = newSV(dstlen);
    (void)SvPOK_only(dst);
    SvUTF8_on(dst);
    d = (U8*)SvPVX(dst);

    stk_cc_max = 10; /* enough as an initial value? */
    New(0, stk_cc, stk_cc_max, UNF_cc);

    for (p = s; p < e;) {
	uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
	if (!retlen)
	    croak(ErrRetlenIsZero);
	p += retlen;

	curCC = getCombinClass(uv);
	if (curCC == 0) {
	    d = uvuni_to_utf8(d, uv);
	    continue;
	}

	cc_pos = 0;
	stk_cc[cc_pos].cc  = curCC;
	stk_cc[cc_pos].uv  = uv;
	stk_cc[cc_pos].pos = cc_pos;

	valid_uvlast = FALSE;
	while (p < e) {
	    uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
	    if (!retlen)
		croak(ErrRetlenIsZero);
	    p += retlen;

	    curCC = getCombinClass(uv);
	    if (curCC == 0) {
		uvlast = uv;
		valid_uvlast = TRUE;
		break;
	    }

	    cc_pos++;
	    if (stk_cc_max <= cc_pos) { /* extend if need */
		stk_cc_max = cc_pos + 1;
		Renew(stk_cc, stk_cc_max, UNF_cc);
	    }
	    stk_cc[cc_pos].cc  = curCC;
	    stk_cc[cc_pos].uv  = uv;
	    stk_cc[cc_pos].pos = cc_pos;
	}

	/* reordered if there are two c.c.'s */
	if (cc_pos) {
	    qsort((void*)stk_cc, cc_pos + 1, sizeof(UNF_cc), compare_cc);
	}

	for (i = 0; i <= cc_pos; i++) {
	    d = uvuni_to_utf8(d, stk_cc[i].uv);
	}
	if (valid_uvlast)
	{
	    d = uvuni_to_utf8(d, uvlast);
	}
    }
    *d = '\0';
    SvCUR_set(dst, d - (U8*)SvPVX(dst));
    Safefree(stk_cc);
    RETVAL = dst;
  OUTPUT:
    RETVAL



SV*
compose(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    composeContiguous = 1
  PREINIT:
    SV  *dst, *tmp;
    U8  *s, *p, *e, *d, *t, *tmp_start, curCC, preCC;
    UV uv, uvS, uvComp;
    STRLEN srclen, dstlen, tmplen, retlen;
    bool beginning = TRUE;
  CODE:
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    dstlen = srclen + 1;
    dst = newSV(dstlen);
    (void)SvPOK_only(dst);
    SvUTF8_on(dst);
    d = (U8*)SvPVX(dst);

  /* for uncomposed combining char */
    tmp = sv_2mortal(newSV(dstlen));
    (void)SvPOK_only(tmp);
    SvUTF8_on(tmp);

    for (p = s; p < e;) {
	if (beginning) {
	    uvS = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
	    if (!retlen)
		croak(ErrRetlenIsZero);
	    p += retlen;

            if (getCombinClass(uvS)) { /* no Starter found yet */
		d = uvuni_to_utf8(d, uvS);
		continue;
	    }
            beginning = FALSE;
	}

    /* Starter */
	t = tmp_start = (U8*)SvPVX(tmp);
	preCC = 0;

    /* to the next Starter */
	while (p < e) {
	    uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
	    if (!retlen)
		croak(ErrRetlenIsZero);
	    p += retlen;

	    curCC = getCombinClass(uv);

	    if (preCC && preCC == curCC) {
		preCC = curCC;
		t = uvuni_to_utf8(t, uv);
	    } else {
		uvComp = composite_uv(uvS, uv);

		if (uvComp && ! isExclusion(uvComp) &&
			(ix ? (t == tmp_start) : (preCC <= curCC))) {
		    STRLEN leftcur, rightcur, dstcur;
		    leftcur  = UNISKIP(uvComp);
		    rightcur = UNISKIP(uvS) + UNISKIP(uv);

		    if (leftcur > rightcur) {
			dstcur = d - (U8*)SvPVX(dst);
			dstlen += leftcur - rightcur;
			d = (U8*)SvGROW(dst,dstlen) + dstcur;
		    }
		    /* preCC not changed to curCC */
		    uvS = uvComp;
		} else if (! curCC && p < e) { /* blocked */
		    break;
		} else {
		    preCC = curCC;
		    t = uvuni_to_utf8(t, uv);
		}
	    }
	}
	d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */
	tmplen = t - tmp_start;
	if (tmplen) { /* uncomposed combining char */
	    t = (U8*)SvPVX(tmp);
	    while (tmplen--)
		*d++ = *t++;
	}
	uvS = uv;
    } /* for */
    *d = '\0';
    SvCUR_set(dst, d - (U8*)SvPVX(dst));
    RETVAL = dst;
  OUTPUT:
    RETVAL


void
checkNFD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKD = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    UV uv;
  CODE:
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    preCC = 0;
    for (p = s; p < e; p += retlen) {
	uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
	if (!retlen)
	    croak(ErrRetlenIsZero);

	curCC = getCombinClass(uv);
	if (preCC > curCC && curCC != 0) /* canonical ordering violated */
	    XSRETURN_NO;
	if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
	    XSRETURN_NO;
	preCC = curCC;
    }
    XSRETURN_YES;



void
checkNFC(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKC = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    UV uv;
    bool isMAYBE;
  CODE:
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    preCC = 0;
    isMAYBE = FALSE;
    for (p = s; p < e; p += retlen) {
	uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
	if (!retlen)
	    croak(ErrRetlenIsZero);

	curCC = getCombinClass(uv);

	if (preCC > curCC && curCC != 0) /* canonical ordering violated */
	    XSRETURN_NO;

	/* get NFC/NFKC property */
	if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */
	    ; /* YES */
	else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
	    XSRETURN_NO;
	else if (isComp2nd(uv))
	    isMAYBE = TRUE;
	else if (ix) {
	    char *canon, *compat;
	  /* NFKC_NO when having compatibility mapping. */
	    canon  = (char *) dec_canonical(uv);
	    compat = (char *) dec_compat(uv);
	    if (compat && !(canon && strEQ(canon, compat)))
		XSRETURN_NO;
	} /* end of get NFC/NFKC property */

	preCC = curCC;
    }
    if (isMAYBE)
	XSRETURN_UNDEF;
    else
	XSRETURN_YES;



void
checkFCD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkFCC = 1
  PREINIT:
    STRLEN srclen, retlen, canlen, canret;
    U8 *s, *e, *p, curCC, preCC;
    UV uv, uvLead, uvTrail;
    U8 *sCan, *pCan, *eCan;
    bool isMAYBE;
  CODE:
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    preCC = 0;
    isMAYBE = FALSE;
    for (p = s; p < e; p += retlen) {
	uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
	if (!retlen)
	    croak(ErrRetlenIsZero);

	sCan = (U8*) dec_canonical(uv);

	if (sCan) {
	    canlen = (STRLEN)strlen((char *) sCan);
	    uvLead = utf8n_to_uvuni(sCan, canlen, &canret, AllowAnyUTF);
	}
	else {
	    uvLead = uv;
	}

	curCC = getCombinClass(uvLead);

	if (curCC != 0 && curCC < preCC) /* canonical ordering violated */
	    XSRETURN_NO;

	if (ix) {
	    if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
		XSRETURN_NO;
	    else if (isComp2nd(uv))
		isMAYBE = TRUE;
	}

	if (sCan) {
	    eCan = sCan + canlen;
	    pCan = utf8_hop(eCan, -1);
	    if (pCan < sCan)
		croak(ErrHopBeforeStart);
	    uvTrail = utf8n_to_uvuni(pCan, eCan - pCan, &canret, AllowAnyUTF);
	    preCC = getCombinClass(uvTrail);
	}
	else {
	    preCC = curCC;
	}
    }
    if (isMAYBE)
	XSRETURN_UNDEF;
    else
	XSRETURN_YES;



U8
getCombinClass(uv)
    UV uv
  PROTOTYPE: $

bool
isExclusion(uv)
    UV uv
  PROTOTYPE: $

bool
isSingleton(uv)
    UV uv
  PROTOTYPE: $

bool
isNonStDecomp(uv)
    UV uv
  PROTOTYPE: $

bool
isComp2nd(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_MAYBE  = 1
    isNFKC_MAYBE = 2



void
isNFD_NO(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFKD_NO = 1
  CODE:
    if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
	XSRETURN_YES; /* NFD_NO or NFKD_NO */
    else
	XSRETURN_NO;



void
isComp_Ex(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_NO  = 0
    isNFKC_NO = 1
  CODE:
    if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
	XSRETURN_YES; /* NFC_NO or NFKC_NO */
    else if (ix) {
	char *canon, *compat;
	canon  = (char *) dec_canonical(uv);
	compat = (char *) dec_compat(uv);
	if (compat && (!canon || strNE(canon, compat)))
	    XSRETURN_YES; /* NFC_NO or NFKC_NO */
	else
	    XSRETURN_NO;
    }
    else
	XSRETURN_NO;



SV*
getComposite(uv, uv2)
    UV uv
    UV uv2
  PROTOTYPE: $$
  PREINIT:
    UV composite;
  CODE:
    composite = composite_uv(uv, uv2);
    RETVAL = composite ? newSVuv(composite) : &PL_sv_undef;
  OUTPUT:
    RETVAL



SV*
getCanon(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    getCompat = 1
  PREINIT:
    U8 * rstr;
  CODE:
    if (Hangul_IsS(uv)) {
	SV * dst;
	dst = newSV(1);
	(void)SvPOK_only(dst);
	sv_cat_decompHangul(dst, uv);
	RETVAL = dst;
    } else {
	rstr = ix ? dec_compat(uv) : dec_canonical(uv);
	if (!rstr)
	    XSRETURN_UNDEF;
	RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));
    }
    SvUTF8_on(RETVAL);
  OUTPUT:
    RETVAL


void
splitOnLastStarter(src)
    SV * src
  PREINIT:
    SV *svp;
    STRLEN srclen, retlen;
    U8 *s, *e, *p;
    UV uv;
  PPCODE:
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    for (p = e; s < p; ) {
	p = utf8_hop(p, -1);
	if (p < s)
	    croak(ErrHopBeforeStart);
	uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
	if (getCombinClass(uv) == 0) /* Last Starter found */
	    break;
    }

    svp = sv_2mortal(newSVpvn((char*)s, p - s));
    SvUTF8_on(svp);
    XPUSHs(svp);

    svp = sv_2mortal(newSVpvn((char*)p, e - p));
    SvUTF8_on(svp);
    XPUSHs(svp);


--- NEW FILE: Normalize.pm ---
package Unicode::Normalize;

BEGIN {
    unless ("A" eq pack('U', 0x41)) {
	die "Unicode::Normalize cannot stringify a Unicode code point\n";
    }
}

use 5.006;
use strict;
use warnings;
use Carp;

no warnings 'utf8';

our $VERSION = '0.32';
our $PACKAGE = __PACKAGE__;

require Exporter;
require DynaLoader;

our @ISA = qw(Exporter DynaLoader);
our @EXPORT = qw( NFC NFD NFKC NFKD );
our @EXPORT_OK = qw(
    normalize decompose reorder compose
    checkNFD checkNFKD checkNFC checkNFKC check
    getCanon getCompat getComposite getCombinClass
    isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
    isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
    FCD checkFCD FCC checkFCC composeContiguous
    splitOnLastStarter
);
our %EXPORT_TAGS = (
    all       => [ @EXPORT, @EXPORT_OK ],
    normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
    check     => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
    fast      => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ],
);

######

bootstrap Unicode::Normalize $VERSION;

######

sub pack_U {
    return pack('U*', @_);
}

sub unpack_U {
    return unpack('U*', pack('U*').shift);
}


##
## normalization forms
##

use constant COMPAT => 1;

sub NFD  ($) { reorder(decompose($_[0])) }
sub NFKD ($) { reorder(decompose($_[0], COMPAT)) }
sub NFC  ($) { compose(reorder(decompose($_[0]))) }
sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) }

sub FCD ($) {
    my $str = shift;
    return checkFCD($str) ? $str : NFD($str);
}
sub FCC ($) { composeContiguous(reorder(decompose($_[0]))) }

our %formNorm = (
    NFC  => \&NFC,	C  => \&NFC,
    NFD  => \&NFD,	D  => \&NFD,
    NFKC => \&NFKC,	KC => \&NFKC,
    NFKD => \&NFKD,	KD => \&NFKD,
    FCD  => \&FCD,	FCC => \&FCC,
);

sub normalize($$)
{
    my $form = shift;
    my $str = shift;
    return exists $formNorm{$form}
	? $formNorm{$form}->($str)
	: croak $PACKAGE."::normalize: invalid form name: $form";
}


##
## quick check
##

our %formCheck = (
    NFC  => \&checkNFC, 	C  => \&checkNFC,
    NFD  => \&checkNFD, 	D  => \&checkNFD,
    NFKC => \&checkNFKC,	KC => \&checkNFKC,
    NFKD => \&checkNFKD,	KD => \&checkNFKD,
    FCD  => \&checkFCD, 	FCC => \&checkFCC,
);

sub check($$)
{
    my $form = shift;
    my $str = shift;
    return exists $formCheck{$form}
	? $formCheck{$form}->($str)
	: croak $PACKAGE."::check: invalid form name: $form";
}

1;
__END__

=head1 NAME

Unicode::Normalize - Unicode Normalization Forms

=head1 SYNOPSIS

(1) using function names exported by default:

  use Unicode::Normalize;

  $NFD_string  = NFD($string);  # Normalization Form D
  $NFC_string  = NFC($string);  # Normalization Form C
  $NFKD_string = NFKD($string); # Normalization Form KD
  $NFKC_string = NFKC($string); # Normalization Form KC

(2) using function names exported on request:

  use Unicode::Normalize 'normalize';

  $NFD_string  = normalize('D',  $string);  # Normalization Form D
  $NFC_string  = normalize('C',  $string);  # Normalization Form C
  $NFKD_string = normalize('KD', $string);  # Normalization Form KD
  $NFKC_string = normalize('KC', $string);  # Normalization Form KC

=head1 DESCRIPTION

Parameters:

C<$string> is used as a string under character semantics
(see F<perlunicode>).

C<$codepoint> should be an unsigned integer
representing a Unicode code point.

Note: Between XSUB and pure Perl, there is an incompatibility
about the interpretation of C<$codepoint> as a decimal number.
XSUB converts C<$codepoint> to an unsigned integer, but pure Perl does not.
Do not use a floating point nor a negative sign in C<$codepoint>.

=head2 Normalization Forms

=over 4

=item C<$NFD_string = NFD($string)>

returns the Normalization Form D (formed by canonical decomposition).

=item C<$NFC_string = NFC($string)>

returns the Normalization Form C (formed by canonical decomposition
followed by canonical composition).

=item C<$NFKD_string = NFKD($string)>

returns the Normalization Form KD (formed by compatibility decomposition).

=item C<$NFKC_string = NFKC($string)>

returns the Normalization Form KC (formed by compatibility decomposition
followed by B<canonical> composition).

=item C<$FCD_string = FCD($string)>

If the given string is in FCD ("Fast C or D" form; cf. UTN #5),
returns it without modification; otherwise returns an FCD string.

Note: FCD is not always unique, then plural forms may be equivalent
each other. C<FCD()> will return one of these equivalent forms.

=item C<$FCC_string = FCC($string)>

returns the FCC form ("Fast C Contiguous"; cf. UTN #5).

Note: FCC is unique, as well as four normalization forms (NF*).

=item C<$normalized_string = normalize($form_name, $string)>

As C<$form_name>, one of the following names must be given.

  'C'  or 'NFC'  for Normalization Form C  (UAX #15)
  'D'  or 'NFD'  for Normalization Form D  (UAX #15)
  'KC' or 'NFKC' for Normalization Form KC (UAX #15)
  'KD' or 'NFKD' for Normalization Form KD (UAX #15)

  'FCD'          for "Fast C or D" Form  (UTN #5)
  'FCC'          for "Fast C Contiguous" (UTN #5)

=back

=head2 Decomposition and Composition

=over 4

=item C<$decomposed_string = decompose($string)>

=item C<$decomposed_string = decompose($string, $useCompatMapping)>

Decomposes the specified string and returns the result.

If the second parameter (a boolean) is omitted or false, decomposes it
using the Canonical Decomposition Mapping.
If true, decomposes it using the Compatibility Decomposition Mapping.

The string returned is not always in NFD/NFKD.
Reordering may be required.

    $NFD_string  = reorder(decompose($string));       # eq. to NFD()
    $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()

=item C<$reordered_string  = reorder($string)>

Reorders the combining characters and the like in the canonical ordering
and returns the result.

E.g., when you have a list of NFD/NFKD strings,
you can get the concatenated NFD/NFKD string from them, saying

    $concat_NFD  = reorder(join '', @NFD_strings);
    $concat_NFKD = reorder(join '', @NFKD_strings);

=item C<$composed_string   = compose($string)>

Returns the string where composable pairs are composed.

E.g., when you have a NFD/NFKD string,
you can get its NFC/NFKC string, saying

    $NFC_string  = compose($NFD_string);
    $NFKC_string = compose($NFKD_string);

=back

=head2 Quick Check

(see Annex 8, UAX #15; and F<DerivedNormalizationProps.txt>)

The following functions check whether the string is in that normalization form.

The result returned will be:

    YES     The string is in that normalization form.
    NO      The string is not in that normalization form.
    MAYBE   Dubious. Maybe yes, maybe no.

=over 4

=item C<$result = checkNFD($string)>

returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.

=item C<$result = checkNFC($string)>

returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
C<undef> if C<MAYBE>.

=item C<$result = checkNFKD($string)>

returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.

=item C<$result = checkNFKC($string)>

returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
C<undef> if C<MAYBE>.

=item C<$result = checkFCD($string)>

returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.

=item C<$result = checkFCC($string)>

returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
C<undef> if C<MAYBE>.

If a string is not in FCD, it must not be in FCC.
So C<checkFCC($not_FCD_string)> should return C<NO>.

=item C<$result = check($form_name, $string)>

returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
C<undef> if C<MAYBE>.

As C<$form_name>, one of the following names must be given.

  'C'  or 'NFC'  for Normalization Form C  (UAX #15)
  'D'  or 'NFD'  for Normalization Form D  (UAX #15)
  'KC' or 'NFKC' for Normalization Form KC (UAX #15)
  'KD' or 'NFKD' for Normalization Form KD (UAX #15)

  'FCD'          for "Fast C or D" Form  (UTN #5)
  'FCC'          for "Fast C Contiguous" (UTN #5)

=back

B<Note>

In the cases of NFD, NFKD, and FCD, the answer must be
either C<YES> or C<NO>. The answer C<MAYBE> may be returned
in the cases of NFC, NFKC, and FCC.

A C<MAYBE> string should contain at least one combining character
or the like. For example, C<COMBINING ACUTE ACCENT> has
the MAYBE_NFC/MAYBE_NFKC property.

Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.

If you want to check exactly, compare the string with its NFC/NFKC/FCC.

    if ($string eq NFC($string)) {
	# $string is exactly normalized in NFC;
    } else {
	# $string is not normalized in NFC;
    }

    if ($string eq NFKC($string)) {
	# $string is exactly normalized in NFKC;
    } else {
	# $string is not normalized in NFKC;
    }

=head2 Character Data

These functions are interface of character data used internally.
If you want only to get Unicode normalization forms, you don't need
call them yourself.

=over 4

=item C<$canonical_decomposed = getCanon($codepoint)>

If the character of the specified codepoint is canonically
decomposable (including Hangul Syllables),
returns the B<completely decomposed> string canonically equivalent to it.

If it is not decomposable, returns C<undef>.

=item C<$compatibility_decomposed = getCompat($codepoint)>

If the character of the specified codepoint is compatibility
decomposable (including Hangul Syllables),
returns the B<completely decomposed> string compatibility equivalent to it.

If it is not decomposable, returns C<undef>.

=item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>

If two characters here and next (as codepoints) are composable
(including Hangul Jamo/Syllables and Composition Exclusions),
returns the codepoint of the composite.

If they are not composable, returns C<undef>.

=item C<$combining_class = getCombinClass($codepoint)>

Returns the combining class of the character as an integer.

=item C<$is_exclusion = isExclusion($codepoint)>

Returns a boolean whether the character of the specified codepoint
is a composition exclusion.

=item C<$is_singleton = isSingleton($codepoint)>

Returns a boolean whether the character of the specified codepoint is
a singleton.

=item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>

Returns a boolean whether the canonical decomposition
of the character of the specified codepoint
is a Non-Starter Decomposition.

=item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>

Returns a boolean whether the character of the specified codepoint
may be composed with the previous one in a certain composition
(including Hangul Compositions, but excluding
Composition Exclusions and Non-Starter Decompositions).

=back

=head1 EXPORT

C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.

C<normalize> and other some functions: on request.

=head1 CAVEATS

=over 4

=item Perl's version vs. Unicode version

Since this module refers to perl core's Unicode database in the directory
F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of
normalization implemented by this module depends on your perl's version.

    perl's version         implemented Unicode version
       5.6.1                  3.0.1
       5.7.2                  3.1.0
       5.7.3                  3.1.1 (same normalized form as that of 3.1.0)
       5.8.0                  3.2.0
     5.8.1-5.8.3              4.0.0
     5.8.4-5.8.6 (latest)     4.0.1 (same normalized form as that of 4.0.0)

=item Correction of decomposition mapping

In older Unicode versions, a small number of characters (all of which are
CJK compatibility ideographs as far as they have been found) may have
an erroneous decomposition mapping (see F<NormalizationCorrections.txt>).
Anyhow, this module will neither refer to F<NormalizationCorrections.txt>
nor provide any specific version of normalization. Therefore this module
running on an older perl with an older Unicode database may use
the erroneous decomposition mapping blindly conforming to the Unicode database.

=item Revised definition of canonical composition

In Unicode 4.1.0, the definition D2 of canonical composition (which
affects NFC and NFKC) has been changed (see Public Review Issue #29
and recent UAX #15). This module has used the newer definition
since the version 0.07 (Oct 31, 2001).
This module does not support normalization according to the older
definition, even if the Unicode version implemented by perl is
lower than 4.1.0.

=back

=head1 AUTHOR

SADAHIRO Tomoyuki <SADAHIRO at cpan.org>

Copyright(C) 2001-2005, SADAHIRO Tomoyuki. Japan. All rights reserved.

This module is free software; you can redistribute it
and/or modify it under the same terms as Perl itself.

=head1 SEE ALSO

=over 4

=item http://www.unicode.org/reports/tr15/

Unicode Normalization Forms - UAX #15

=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt

Derived Normalization Properties

=item http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt

Normalization Corrections

=item http://www.unicode.org/review/pr-29.html

Public Review Issue #29: Normalization Issue

=item http://www.unicode.org/notes/tn5/

Canonical Equivalence in Applications - UTN #5

=back

=cut

--- NEW FILE: mkheader ---
#!perl
#
# This auxiliary script makes five header files
# used for building XSUB of Unicode::Normalize.
#
# Usage:
#    <do 'mkheader'> in perl, or <perl mkheader> in command line
#
# Input files:
#    unicore/CombiningClass.pl (or unicode/CombiningClass.pl)
#    unicore/Decomposition.pl (or unicode/Decomposition.pl)
#    unicore/CompositionExclusions.txt (or unicode/CompExcl.txt)
#
# Output files:
#    unfcan.h
#    unfcpt.h
#    unfcmb.h
#    unfcmp.h
#    unfexc.h
#
use 5.006;
use strict;
use warnings;
use Carp;
use File::Spec;

BEGIN {
    unless ("A" eq pack('U', 0x41)) {
	die "Unicode::Normalize cannot stringify a Unicode code point\n";
    }
}

our $PACKAGE = 'Unicode::Normalize, mkheader';

our $Combin = do "unicore/CombiningClass.pl"
    || do "unicode/CombiningClass.pl"
    || croak "$PACKAGE: CombiningClass.pl not found";

our $Decomp = do "unicore/Decomposition.pl"
    || do "unicode/Decomposition.pl"
    || croak "$PACKAGE: Decomposition.pl not found";

our %Combin;	# $codepoint => $number    : combination class
our %Canon;	# $codepoint => \@codepoints : canonical decomp.
our %Compat;	# $codepoint => \@codepoints : compat. decomp.
# after _U_stringify(), ($codepoint => $hexstring) for %Canon and %Compat
our %Exclus;	# $codepoint => 1          : composition exclusions
our %Single;	# $codepoint => 1          : singletons
our %NonStD;	# $codepoint => 1          : non-starter decompositions

our %Comp1st;	# $codepoint => $listname  : may be composed with a next char.
our %Comp2nd;	# $codepoint => 1          : may be composed with a prev char.
our %CompList;	# $listname,$2nd  => $codepoint : composite

our $prefix = "UNF_";
our $structname = "${prefix}complist";

########## definition of Hangul constants ##########
use constant SBase  => 0xAC00;
use constant SFinal => 0xD7A3; # SBase -1 + SCount
use constant SCount =>  11172; # LCount * NCount
use constant NCount =>    588; # VCount * TCount
use constant LBase  => 0x1100;
use constant LFinal => 0x1112;
use constant LCount =>     19;
use constant VBase  => 0x1161;
use constant VFinal => 0x1175;
use constant VCount =>     21;
use constant TBase  => 0x11A7;
use constant TFinal => 0x11C2;
use constant TCount =>     28;

sub decomposeHangul {
    my $SIndex = $_[0] - SBase;
    my $LIndex = int( $SIndex / NCount);
    my $VIndex = int(($SIndex % NCount) / TCount);
    my $TIndex =      $SIndex % TCount;
    my @ret = (
       LBase + $LIndex,
       VBase + $VIndex,
      $TIndex ? (TBase + $TIndex) : (),
    );
    wantarray ? @ret : pack('U*', @ret);
     # any element in @ret greater than 0xFF, so no need of u2n conversion.
}

########## getting full decomposion ##########
{
    my($f, $fh);
    foreach my $d (@INC) {
	$f = File::Spec->catfile($d, "unicore", "CompositionExclusions.txt");
	last if open($fh, $f);
	$f = File::Spec->catfile($d, "unicode", "CompExcl.txt");
	last if open($fh, $f);
	$f = undef;
    }
    croak "$PACKAGE: neither unicore/CompositionExclusions.txt "
	. "nor unicode/CompExcl.txt is found in @INC" unless defined $f;

    while (<$fh>) {
	next if /^#/ or /^$/;
	s/#.*//;
	$Exclus{ hex($1) } = 1 if /([0-9A-Fa-f]+)/;
    }
    close $fh;
}

##
## converts string "hhhh hhhh hhhh" to a numeric list
##
sub _getHexArray { map hex, $_[0] =~ /([0-9A-Fa-f]+)/g }

while ($Combin =~ /(.+)/g) {
    my @tab = split /\t/, $1;
    my $ini = hex $tab[0];
    if ($tab[1] eq '') {
	$Combin{ $ini } = $tab[2];
    } else {
	$Combin{ $_ } = $tab[2] foreach $ini .. hex($tab[1]);
    }
}

while ($Decomp =~ /(.+)/g) {
    my @tab = split /\t/, $1;
    my $compat = $tab[2] =~ s/<[^>]+>//;
    my $dec = [ _getHexArray($tab[2]) ]; # decomposition
    my $ini = hex($tab[0]); # initial decomposable character

    my $listname =
	@$dec == 2 ? sprintf("${structname}_%06x", $dec->[0]) : 'USELESS';
		# %04x is bad since it'd place _3046 after _1d157.

    if ($tab[1] eq '') {
	$Compat{ $ini } = $dec;

	if (! $compat) {
	    $Canon{ $ini } = $dec;

	    if (@$dec == 2) {
		if ($Combin{ $dec->[0] }) {
		    $NonStD{ $ini } = 1;
		} else {
		    $CompList{ $listname }{ $dec->[1] } = $ini;
		    $Comp1st{ $dec->[0] } = $listname;
		    $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$ini};
		}
	    } elsif (@$dec == 1) {
		$Single{ $ini } = 1;
	    } else {
		croak("Weird Canonical Decomposition of U+$tab[0]");
	    }
	}
    } else {
	foreach my $u ($ini .. hex($tab[1])) {
	    $Compat{ $u } = $dec;

	    if (! $compat) {
		$Canon{ $u } = $dec;

		if (@$dec == 2) {
		    if ($Combin{ $dec->[0] }) {
			$NonStD{ $u } = 1;
		    } else {
			$CompList{ $listname }{ $dec->[1] } = $u;
			$Comp1st{ $dec->[0] } = $listname;
			$Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u};
		    }
		} elsif (@$dec == 1) {
		    $Single{ $u } = 1;
		} else {
		    croak("Weird Canonical Decomposition of U+$tab[0]");
		}
	    }
	}
    }
}

# modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo
foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) {
    $Comp2nd{$j} = 1;
}

sub getCanonList {
    my @src = @_;
    my @dec = map {
	(SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
	    : $Canon{$_} ? @{ $Canon{$_} } : $_
		} @src;
    return join(" ", at src) eq join(" ", at dec) ? @dec : getCanonList(@dec);
    # condition @src == @dec is not ok.
}

sub getCompatList {
    my @src = @_;
    my @dec = map {
	(SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
	    : $Compat{$_} ? @{ $Compat{$_} } : $_
		} @src;
    return join(" ", at src) eq join(" ", at dec) ? @dec : getCompatList(@dec);
    # condition @src == @dec is not ok.
}

# exhaustive decomposition
foreach my $key (keys %Canon) {
    $Canon{$key}  = [ getCanonList($key) ];
}

# exhaustive decomposition
foreach my $key (keys %Compat) {
    $Compat{$key} = [ getCompatList($key) ];
}

sub _pack_U {
    return pack('U*', @_);
}

sub _U_stringify {
    sprintf '"%s"', join '',
	map sprintf("\\x%02x", $_), unpack 'C*', _pack_U(@_);
}

foreach my $hash (\%Canon, \%Compat) {
    foreach my $key (keys %$hash) {
	$hash->{$key} = _U_stringify( @{ $hash->{$key} } );
    }
}

########## writing header files ##########

my @boolfunc = (
    {
	name => "Exclusion",
	type => "bool",
	hash => \%Exclus,
    },
    {
	name => "Singleton",
	type => "bool",
	hash => \%Single,
    },
    {
	name => "NonStDecomp",
	type => "bool",
	hash => \%NonStD,
    },
    {
	name => "Comp2nd",
	type => "bool",
	hash => \%Comp2nd,
    },
);

my $file = "unfexc.h";
open FH, ">$file" or croak "$PACKAGE: $file can't be made";
binmode FH; select FH;

    print << 'EOF';
/*
 * This file is auto-generated by mkheader.
 * Any changes here will be lost!
 */
EOF

foreach my $tbl (@boolfunc) {
    my @temp = sort {$a <=> $b} keys %{$tbl->{hash}};
    my $type = $tbl->{type};
    my $name = $tbl->{name};
    print "$type is$name (UV uv)\n{\nreturn\n\t";

    while (@temp) {
	my $cur = shift @temp;
	if (@temp && $cur + 1 == $temp[0]) {
	    print "($cur <= uv && uv <= ";
	    while (@temp && $cur + 1 == $temp[0]) {
		$cur = shift @temp;
	    }
	    print "$cur)";
	    print "\n\t|| " if @temp;
	} else {
	    print "uv == $cur";
	    print "\n\t|| " if @temp;
	}
    }
    print "\n\t? TRUE : FALSE;\n}\n\n";
}

close FH;

####################################

my $compinit =
    "typedef struct { UV nextchar; UV composite; } $structname;\n\n";

foreach my $i (sort keys %CompList) {
    $compinit .= "$structname $i [] = {\n";
    $compinit .= join ",\n",
	map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}),
	    sort {$a <=> $b } keys %{ $CompList{$i} };
    $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel
}

my @tripletable = (
    {
	file => "unfcmb",
	name => "combin",
	type => "STDCHAR",
	hash => \%Combin,
	null =>  0,
    },
    {
	file => "unfcan",
	name => "canon",
	type => "char*",
	hash => \%Canon,
	null => "NULL",
    },
    {
	file => "unfcpt",
	name => "compat",
	type => "char*",
	hash => \%Compat,
	null => "NULL",
    },
    {
	file => "unfcmp",
	name => "compos",
	type => "$structname *",
	hash => \%Comp1st,
	null => "NULL",
	init => $compinit,
    },
);

foreach my $tbl (@tripletable) {
    my $file = "$tbl->{file}.h";
    my $head = "${prefix}$tbl->{name}";
    my $type = $tbl->{type};
    my $hash = $tbl->{hash};
    my $null = $tbl->{null};
    my $init = $tbl->{init};

    open FH, ">$file" or croak "$PACKAGE: $file can't be made";
    binmode FH; select FH;
    my %val;

    print FH << 'EOF';
/*
 * This file is auto-generated by mkheader.
 * Any changes here will be lost!
 */
EOF

    print $init if defined $init;

    foreach my $uv (keys %$hash) {
	croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv)
	    unless $uv <= 0x10FFFF;
	my @c = unpack 'CCCC', pack 'N', $uv;
	$val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv};
    }

    foreach my $p (sort { $a <=> $b } keys %val) {
	next if ! $val{ $p };
	for (my $r = 0; $r < 256; $r++) {
	    next if ! $val{ $p }{ $r };
	    printf "$type ${head}_%02x_%02x [256] = {\n", $p, $r;
	    for (my $c = 0; $c < 256; $c++) {
		print "\t", defined $val{$p}{$r}{$c}
		    ? "($type)".$val{$p}{$r}{$c}
		    : $null;
		print ','  if $c != 255;
		print "\n" if $c % 8 == 7;
	    }
	    print "};\n\n";
	}
    }
    foreach my $p (sort { $a <=> $b } keys %val) {
	next if ! $val{ $p };
	printf "$type* ${head}_%02x [256] = {\n", $p;
	for (my $r = 0; $r < 256; $r++) {
	    print $val{ $p }{ $r }
		? sprintf("${head}_%02x_%02x", $p, $r)
		: "NULL";
	    print ','  if $r != 255;
	    print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0;
	}
	print "};\n\n";
    }
    print "$type** $head [] = {\n";
    for (my $p = 0; $p <= 0x10; $p++) {
	print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL";
	print ','  if $p != 0x10;
	print "\n";
    }
    print "};\n\n";
    close FH;
}

1;
__END__

--- NEW FILE: README ---
Unicode/Normalize version 0.28
===================================

Unicode::Normalize - Unicode Normalization Forms

SYNOPSIS

  use Unicode::Normalize;

  $NFD_string  = NFD($string);  # Normalization Form D
  $NFC_string  = NFC($string);  # Normalization Form C
  $NFKD_string = NFKD($string); # Normalization Form KD
  $NFKC_string = NFKC($string); # Normalization Form KC

  or

  use Unicode::Normalize 'normalize';

  $NFD_string  = normalize('D',  $string);  # Normalization Form D
  $NFC_string  = normalize('C',  $string);  # Normalization Form C
  $NFKD_string = normalize('KD', $string);  # Normalization Form KD
  $NFKC_string = normalize('KC', $string);  # Normalization Form KC


INSTALLATION

Perl 5.6.1 or later.
(Perl 5.8.0 or later is recommended.)

To install this module (XSUB: needs a C compiler), type the following:

   perl Makefile.PL
   make
   make test
   make install

If you want to install pure Perl (i.e. no-XSUB),
type the following (!! "disableXS" must run before "Makefile.PL" !!):

   perl disableXS
   perl Makefile.PL
   make
   make test
   make install

After building no-XSUB, if you decide to install XSUB,
type the following (!! "enableXS" must run before "Makefile.PL" !!):

   make clean
   perl enableXS
   perl Makefile.PL
   make
   make test
   make install

DEPENDENCIES

This module requires other modules and libraries following:

Carp
Exporter
File::Copy
File::Spec

unicore/CombiningClass.pl         (or unicode/CombiningClass.pl)
unicore/Decomposition.pl          (or unicode/Decomposition.pl)
unicore/CompositionExclusions.txt (or unicode/CompExcl.txt)

CAVEAT

(1) In the perl-current, unicore/CompExcl.txt
  is renamed unicore/CompositionExclusions.txt.

(2) After these unicore/*.* files are updated.

  In the case of an XS edition:
    You must rebuild the module,
    as the data will be compiled on building.

  In the case of a pure Perl edition:
    Rebuilding is not necessary,
    as the data will be read on requirement.

(3) Pure Perl edition, Normalize.pmN, may work without any other file
    in this distribution (it must be renamed Normalize.pm, though)

COPYRIGHT AND LICENCE

  SADAHIRO Tomoyuki <SADAHIRO at cpan.org>

  http://homepage1.nifty.com/nomenclator/perl/

  Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.

  This module is free software; you can redistribute it
  and/or modify it under the same terms as Perl itself.

--- NEW FILE: Changes ---
Revision history for Perl extension Unicode::Normalize.

0.32  Tue Apr  5 22:47:09 2005
    - Some literal and grammatical errors in POD are fixed.

0.31  Tue Apr  5 21:43:20 2005
    - CAVEATS in POD is added.
    - Some test cases from Unicode Public Review Issue #29
      (Normalization Issue) are added to norm.t and test.t.
    - do 'mkheader' returns true so that Makefile.PL will catch error.
    - META.yml is added.

0.30  Sun May  2 14:35:00 2004
    - XSUB: (perl 5.8.1 or later) improved utf8 upgrade of non-POK
      (private POK) values like tied scalars, overloaded objects, etc.

0.28  Sat Nov 22 23:46:24 2003
    - XSUB: even if string contains a malformed, "short" Unicode character,
      decompose() and reorder() will be safe. Garbage will be no longer added.
    - added null.t and short.t.
    - now truely added illegal.t (in 0.27, forgot to change MANIFEST).

0.27  Sun Nov 16 13:16:21 2003
    - Illegal code points (surrogate and noncharacter) will be allowed
      (keep your code with <no warnings 'utf8';>);
      but porting is not successful in the case of ((Pure Perl) and
      (Perl 5.7.3 or before)).
    - added illegal.t.

0.26  Sat Nov 15 21:52:30 2003
    - doc fix: s/FCD(?= is unique)/FCC/;

0.25  Mon Oct  6 22:26:03 2003
    - added form.t and proto.t.

0.24  Sat Oct  4 17:57:10 2003
    - supports FCD and FCC (UTN #5):
      FCD(), normalize('FCD'), checkFCD(), check('FCD');
      FCC(), normalize('FCC'), checkFCC(), check('FCC').
    - changed INSTALLATION (cf. README).
      * Initial state of the distribution is changed to XSUB.  To build
        pure Perl, type <perl disableXS> before <perl Makefile.PL>.
      * The purePerl-XSUB converter is now provided as two perl
        script files, named "enableXS" and "disableXS".
        (no longer <perl Makefile.PL xs> and <perl Makefile.PL noxs>.)
      * simplified Makefile.PL.
    - added fcdc.t and split.t.

0.23  Sat Jun 28 20:38:10 2003
    - bug fix: \0-terminate in compose() in XS.
    - tweak in pure perl: forced $codepoint to numeric (i.e. "+0065" to 65)
    - tweak of POD and README.

0.22  Mon Jun 09 22:23:10 2003
    - internal tweak (again): pack_U() and unpack_U().

0.21  Thu Apr 02 23:12:54 2003
    - internal tweak: for (?un)pack 'U'.

0.20  Sun Mar 02 13:29:25 2003
    - decompose Hangul syllables in a decomposition mapping.

0.18  ... unreleased
    - synchronization with bleadperl.
    - Change 16262: by me

0.17  Sun Apr 28 23:13:32 2002
    - now normalize('NFC',$1) should work.
    - Some croak()'s are added in mkheader.
    - synchronization with bleadperl.
    - Change 15596: by me
    - Change 16136: by pudge

0.16  Thu Mar 21 13:36:14 2002
    - synchronization with bleadperl.
    - Change 15318: by jhi
    - Change 15319: by jhi

0.15  Tue Mar 19 22:04:07 2002
    - Quick check is implemented.
    - decompose(), reorder(), and compose() are documented.
    - The Non-XS version is also independent of Lingua::KO::Hangul::Util.

0.14  Sat Feb 02 20:40:14 2002
    - synchronization with bleadperl.
    - Change 14128: by Arthur
    - Change 14129: by jhi
    - Change 14156: 
    - Change 14199: by Nikola Knezevic
    - Change 14308: by Benjamin Goldberg
    - Change 14370: by jhi

0.13  Sat Dec 01 11:42:43 2001
    - modify Makefile.PL to enable rebuild.
      (This problem is pointed out by David Dyck.)
    - Change 13388: by Jarkko Hietaniemi.

0.12  Wed Nov 29 22:49:02 2001
    - documentation in .pod is appended to .pm and the .pod is removed.
     (only POD in NON-XS refers to Lingua::KO::Hangul::Util.)

0.11  Sat Nov 24 10:18:38 2001
    - documentation of some functions for character data.
    - Change 12909: by Jarkko Hietaniemi.
    - Change 13228: by Peter Prymmer.

0.10  Sat Nov 03 16:30:20 2001
    - The XS version is now independent of Lingua::KO::Hangul::Util.
      (though the Non-XS version still requires that.)

0.09  Fri Nov 02 22:39:30 2001
    - remove pTHX_.

0.08  Thu Nov 01 23:20:42 2001
    - use Lingua::KO::Hangul::Util 0.06 and remove "hangul.h".

0.07  Wed Oct 31 22:06:42 2001
    - modify internal. decompose() - reorder() - compose().

0.06  Sun Oct 28 14:28:46 2001
    - an XS version.
    (but the Non-XS version is also supported.)

0.05  Wed Oct 10 22:02:15 2001 (not released)
    - %Compos contains unnecessary singletons
      (though it did not cause any bug, only useless).
      They will not be stored.

0.04  Wed Aug 15 19:02:41 2001
    - fix: NFD("") and NFKD("") must return "", not but undef.

0.03  Fri Aug 10 22:44:18 2001
    - rename the module name to Unicode::Normalize.
    - normalize takes two arguments.

0.02  Thu Aug  9 22:56:36 2001
    - add function normalize

0.01  Mon Aug  6 21:45:11 2001
    - original version; created by h2xs 1.21 with options
          -A -X -n Text::Unicode::Normalize





More information about the dslinux-commit mailing list