+#!/bin/sh
+
+# --------------------------------------------------------------------------
+# This is the script to create the unicode chars property table
+# Written by Dimitry Golubovsky (dimitry@golubovsky.org) as part
+# of the Partial Unicode Support patch
+#
+# Adopted for use with GHC.
+# License: see libraries/base/LICENSE
+#
+# -------------------------------------------------------------------------
+
+# The script reads the file from the standard input,
+# and outputs C code into the standard output.
+# The C code contains the chars property table, and basic functions
+# to access properties.
+
+# Output the file header
+
+echo "/*-------------------------------------------------------------------------"
+echo "This is an automatically generated file: do not edit"
+echo "Generated by `basename $0` at `date`"
+echo "-------------------------------------------------------------------------*/"
+echo
+echo "#include \"WCsubst.h\""
+
+# Define structures
+
+cat <<EOF
+
+/* Unicode general categories, listed in the same order as in the Unicode
+ * standard -- this must be the same order as in GHC.Unicode.
+ */
+
+enum {
+ NUMCAT_LU, /* Letter, Uppercase */
+ NUMCAT_LL, /* Letter, Lowercase */
+ NUMCAT_LT, /* Letter, Titlecase */
+ NUMCAT_LM, /* Letter, Modifier */
+ NUMCAT_LO, /* Letter, Other */
+ NUMCAT_MN, /* Mark, Non-Spacing */
+ NUMCAT_MC, /* Mark, Spacing Combining */
+ NUMCAT_ME, /* Mark, Enclosing */
+ NUMCAT_ND, /* Number, Decimal */
+ NUMCAT_NL, /* Number, Letter */
+ NUMCAT_NO, /* Number, Other */
+ NUMCAT_PC, /* Punctuation, Connector */
+ NUMCAT_PD, /* Punctuation, Dash */
+ NUMCAT_PS, /* Punctuation, Open */
+ NUMCAT_PE, /* Punctuation, Close */
+ NUMCAT_PI, /* Punctuation, Initial quote */
+ NUMCAT_PF, /* Punctuation, Final quote */
+ NUMCAT_PO, /* Punctuation, Other */
+ NUMCAT_SM, /* Symbol, Math */
+ NUMCAT_SC, /* Symbol, Currency */
+ NUMCAT_SK, /* Symbol, Modifier */
+ NUMCAT_SO, /* Symbol, Other */
+ NUMCAT_ZS, /* Separator, Space */
+ NUMCAT_ZL, /* Separator, Line */
+ NUMCAT_ZP, /* Separator, Paragraph */
+ NUMCAT_CC, /* Other, Control */
+ NUMCAT_CF, /* Other, Format */
+ NUMCAT_CS, /* Other, Surrogate */
+ NUMCAT_CO, /* Other, Private Use */
+ NUMCAT_CN /* Other, Not Assigned */
+};
+
+struct _convrule_
+{
+ unsigned int category;
+ unsigned int catnumber;
+ int possible;
+ int updist;
+ int lowdist;
+ int titledist;
+};
+
+struct _charblock_
+{
+ int start;
+ int length;
+ const struct _convrule_ *rule;
+};
+
+EOF
+
+# Convert the stdin file to the C table
+
+awk '
+BEGIN {
+ FS=";"
+ catidx=0
+ rulidx=0
+ blockidx=0
+ cblckidx=0
+ sblckidx=0
+ blockb=-1
+ blockl=0
+ digs="0123456789ABCDEF"
+ for(i=0;i<16;i++)
+ {
+ hex[substr(digs,i+1,1)]=i;
+ }
+}
+function em1(a)
+{
+ if(a=="") return "-1"
+ return "0x"a
+}
+function h2d(a)
+{
+ l=length(a)
+ acc=0
+ for(i=1;i<=l;i++)
+ {
+ acc=acc*16+hex[substr(a,i,1)];
+ }
+ return acc
+}
+function dumpblock()
+{
+ blkd=blockb ", " blockl ", &rule" rules[blockr]
+ blocks[blockidx]=blkd
+ blockidx++
+ if(blockb<=256) lat1idx++
+ split(blockr,rsp,",")
+ if(substr(rsp[3],2,1)=="1")
+ {
+ cblcks[cblckidx]=blkd
+ cblckidx++
+ }
+ if(rsp[1]=="GENCAT_ZS")
+ {
+ sblcks[sblckidx]=blkd
+ sblckidx++
+ }
+ blockb=self
+ blockl=1
+ blockr=rule
+}
+{
+ name=$2
+ cat=toupper($3)
+ self=h2d($1)
+ up=h2d($13)
+ low=h2d($14)
+ title=h2d($15)
+ convpos=1
+ if((up==0)&&(low==0)&&(title==0)) convpos=0
+ if(up==0) up=self
+ if(low==0) low=self
+ if(title==0) title=self
+ updist=up-self
+ lowdist=low-self
+ titledist=title-self
+ rule="GENCAT_"cat", NUMCAT_"cat", "((convpos==1)?
+ ("1, " updist ", " lowdist ", " titledist):
+ ("0, 0, 0, 0"))
+ if(cats[cat]=="")
+ {
+ cats[cat]=(2^catidx);
+ catidx++;
+ }
+ if(rules[rule]=="")
+ {
+ rules[rule]=rulidx;
+ rulidx++;
+ }
+ if(blockb==-1)
+ {
+ blockb=self
+ blockl=1
+ blockr=rule
+ }
+ else
+ {
+ if (index(name,"First>")!=0)
+ {
+ dumpblock()
+ }
+ else if (index(name,"Last>")!=0)
+ {
+ blockl+=(self-blockb)
+ }
+ else if((self==blockb+blockl)&&(rule==blockr)) blockl++
+ else
+ {
+ dumpblock()
+ }
+ }
+}
+END {
+ dumpblock()
+ for(c in cats) print "#define GENCAT_"c" "cats[c]
+ print "#define MAX_UNI_CHAR " self
+ print "#define NUM_BLOCKS " blockidx
+ print "#define NUM_CONVBLOCKS " cblckidx
+ print "#define NUM_SPACEBLOCKS " sblckidx
+ print "#define NUM_LAT1BLOCKS " lat1idx
+ print "#define NUM_RULES " rulidx
+ for(r in rules)
+ {
+ printf "static const struct _convrule_ rule" rules[r] "={" r "};\n"
+ }
+ print "static const struct _charblock_ allchars[]={"
+ for(i=0;i<blockidx;i++)
+ {
+ printf "\t{" blocks[i] "}"
+ print (i<(blockidx-1))?",":""
+ }
+ print "};"
+ print "static const struct _charblock_ convchars[]={"
+ for(i=0;i<cblckidx;i++)
+ {
+ printf "\t{" cblcks[i] "}"
+ print (i<(cblckidx-1))?",":""
+ }
+ print "};"
+ print "static const struct _charblock_ spacechars[]={"
+ for(i=0;i<sblckidx;i++)
+ {
+ printf "\t{" sblcks[i] "}"
+ print (i<(sblckidx-1))?",":""
+ }
+ print "};"
+}
+'
+# Output the C procedures code
+
+cat <<EOF
+
+/*
+ Obtain the reference to character rule by doing
+ binary search over the specified array of blocks.
+ To make checkattr shorter, the address of
+ nullrule is returned if the search fails:
+ this rule defines no category and no conversion
+ distances. The compare function returns 0 when
+ key->start is within the block. Otherwise
+ result of comparison of key->start and start of the
+ current block is returned as usual.
+*/
+
+static const struct _convrule_ nullrule={0,NUMCAT_CN,0,0,0,0};
+
+int blkcmp(const void *vk,const void *vb)
+{
+ const struct _charblock_ *key,*cur;
+ key=vk;
+ cur=vb;
+ if((key->start>=cur->start)&&(key->start<(cur->start+cur->length)))
+ {
+ return 0;
+ }
+ if(key->start>cur->start) return 1;
+ return -1;
+}
+
+static const struct _convrule_ *getrule(
+ const struct _charblock_ *blocks,
+ int numblocks,
+ int unichar)
+{
+ struct _charblock_ key={unichar,1,(void *)0};
+ struct _charblock_ *cb=bsearch(&key,blocks,numblocks,sizeof(key),blkcmp);
+ if(cb==(void *)0) return &nullrule;
+ return cb->rule;
+}
+
+
+
+/*
+ Check whether a character (internal code) has certain attributes.
+ Attributes (category flags) may be ORed. The function ANDs
+ character category flags and the mask and returns the result.
+ If the character belongs to one of the categories requested,
+ the result will be nonzero.
+*/
+
+inline static int checkattr(int c,unsigned int catmask)
+{
+ return (catmask & (getrule(allchars,(c<256)?NUM_LAT1BLOCKS:NUM_BLOCKS,c)->category));
+}
+
+inline static int checkattr_s(int c,unsigned int catmask)
+{
+ return (catmask & (getrule(spacechars,NUM_SPACEBLOCKS,c)->category));
+}
+
+/*
+ Define predicate functions for some combinations of categories.
+*/
+
+#define unipred(p,m) \\
+int p(int c) \\
+{ \\
+ return checkattr(c,m); \\
+}
+
+#define unipred_s(p,m) \\
+int p(int c) \\
+{ \\
+ return checkattr_s(c,m); \\
+}
+
+/*
+ Make these rules as close to Hugs as possible.
+*/
+
+unipred(u_iswcntrl,GENCAT_CC)
+unipred(u_iswprint,~(GENCAT_ZL|GENCAT_ZP|GENCAT_CC|GENCAT_CF|GENCAT_CS|GENCAT_CO))
+unipred_s(u_iswspace,GENCAT_ZS)
+unipred(u_iswupper,(GENCAT_LU|GENCAT_LT))
+unipred(u_iswlower,GENCAT_LL)
+unipred(u_iswalpha,(GENCAT_LL|GENCAT_LU|GENCAT_LT|GENCAT_LM|GENCAT_LO))
+unipred(u_iswdigit,GENCAT_ND)
+
+unipred(u_iswalnum,(GENCAT_LT|GENCAT_LU|GENCAT_LL|GENCAT_LM|GENCAT_LO|
+ GENCAT_MC|GENCAT_ME|GENCAT_MN|
+ GENCAT_NO|GENCAT_ND|GENCAT_NL))
+
+#define caseconv(p,to) \\
+int p(int c) \\
+{ \\
+ const struct _convrule_ *rule=getrule(convchars,NUM_CONVBLOCKS,c);\\
+ if(rule==&nullrule) return c;\\
+ return c+rule->##to;\\
+}
+
+caseconv(u_towupper,updist)
+caseconv(u_towlower,lowdist)
+caseconv(u_towtitle,titledist)
+
+int u_gencat(int c)
+{
+ return getrule(allchars,NUM_BLOCKS,c)->catnumber;
+}
+
+EOF