[project @ 2005-03-14 15:57:57 by simonmar]
authorsimonmar <unknown>
Mon, 14 Mar 2005 15:57:57 +0000 (15:57 +0000)
committersimonmar <unknown>
Mon, 14 Mar 2005 15:57:57 +0000 (15:57 +0000)
Add the script used to generate WCsubst.c

cbits/ubconfc [new file with mode: 0644]

diff --git a/cbits/ubconfc b/cbits/ubconfc
new file mode 100644 (file)
index 0000000..6344ecc
--- /dev/null
@@ -0,0 +1,339 @@
+#!/bin/sh
+
+# --------------------------------------------------------------------------
+# This is the script to create the unicode chars property table 
+# Written by Dimitry Golubovsky (dimitry@golubovsky.org) as part
+# of the Partial Unicode Support patch
+#
+# Adopted for use with GHC.
+# License: see libraries/base/LICENSE
+#
+# -------------------------------------------------------------------------
+
+#      The script reads the file from the standard input,
+#      and outputs C code into the standard output.
+#      The C code contains the chars property table, and basic functions
+#      to access properties.
+
+#      Output the file header
+
+echo "/*-------------------------------------------------------------------------"
+echo "This is an automatically generated file: do not edit"
+echo "Generated by `basename $0` at `date`"
+echo "-------------------------------------------------------------------------*/"
+echo
+echo "#include \"WCsubst.h\""
+
+#      Define structures
+
+cat <<EOF
+
+/* Unicode general categories, listed in the same order as in the Unicode
+ * standard -- this must be the same order as in GHC.Unicode.
+ */
+
+enum {
+    NUMCAT_LU,  /* Letter, Uppercase */
+    NUMCAT_LL,  /* Letter, Lowercase */
+    NUMCAT_LT,  /* Letter, Titlecase */
+    NUMCAT_LM,  /* Letter, Modifier */
+    NUMCAT_LO,  /* Letter, Other */
+    NUMCAT_MN,  /* Mark, Non-Spacing */
+    NUMCAT_MC,  /* Mark, Spacing Combining */
+    NUMCAT_ME,  /* Mark, Enclosing */
+    NUMCAT_ND,  /* Number, Decimal */
+    NUMCAT_NL,  /* Number, Letter */
+    NUMCAT_NO,  /* Number, Other */
+    NUMCAT_PC,  /* Punctuation, Connector */
+    NUMCAT_PD,  /* Punctuation, Dash */
+    NUMCAT_PS,  /* Punctuation, Open */
+    NUMCAT_PE,  /* Punctuation, Close */
+    NUMCAT_PI,  /* Punctuation, Initial quote */
+    NUMCAT_PF,  /* Punctuation, Final quote */
+    NUMCAT_PO,  /* Punctuation, Other */
+    NUMCAT_SM,  /* Symbol, Math */
+    NUMCAT_SC,  /* Symbol, Currency */
+    NUMCAT_SK,  /* Symbol, Modifier */
+    NUMCAT_SO,  /* Symbol, Other */
+    NUMCAT_ZS,  /* Separator, Space */
+    NUMCAT_ZL,  /* Separator, Line */
+    NUMCAT_ZP,  /* Separator, Paragraph */
+    NUMCAT_CC,  /* Other, Control */
+    NUMCAT_CF,  /* Other, Format */
+    NUMCAT_CS,  /* Other, Surrogate */
+    NUMCAT_CO,  /* Other, Private Use */
+    NUMCAT_CN   /* Other, Not Assigned */
+};
+
+struct _convrule_ 
+{ 
+       unsigned int category;
+       unsigned int catnumber;
+       int possible;
+       int updist;
+       int lowdist; 
+       int titledist;
+};
+
+struct _charblock_ 
+{ 
+       int start;
+       int length;
+       const struct _convrule_ *rule;
+};
+
+EOF
+
+#      Convert the stdin file to the C table
+
+awk '
+BEGIN {
+       FS=";"
+       catidx=0
+       rulidx=0
+       blockidx=0
+       cblckidx=0
+       sblckidx=0
+       blockb=-1
+       blockl=0
+       digs="0123456789ABCDEF"
+       for(i=0;i<16;i++)
+       {
+               hex[substr(digs,i+1,1)]=i;
+       }
+}
+function em1(a)
+{
+       if(a=="") return "-1"
+       return "0x"a
+}
+function h2d(a)
+{
+       l=length(a)
+       acc=0
+       for(i=1;i<=l;i++)
+       {
+               acc=acc*16+hex[substr(a,i,1)];
+       }
+       return acc
+}
+function dumpblock()
+{
+       blkd=blockb ", " blockl ", &rule" rules[blockr]
+       blocks[blockidx]=blkd
+       blockidx++
+       if(blockb<=256) lat1idx++
+       split(blockr,rsp,",")
+       if(substr(rsp[3],2,1)=="1")
+       {
+               cblcks[cblckidx]=blkd
+               cblckidx++
+       }
+       if(rsp[1]=="GENCAT_ZS")
+       {
+               sblcks[sblckidx]=blkd
+               sblckidx++
+       }
+       blockb=self
+       blockl=1
+       blockr=rule
+}
+{
+       name=$2
+       cat=toupper($3)
+       self=h2d($1)
+       up=h2d($13)
+       low=h2d($14)
+       title=h2d($15)
+       convpos=1
+       if((up==0)&&(low==0)&&(title==0)) convpos=0
+       if(up==0) up=self
+       if(low==0) low=self
+       if(title==0) title=self
+       updist=up-self
+       lowdist=low-self
+       titledist=title-self
+       rule="GENCAT_"cat", NUMCAT_"cat", "((convpos==1)?
+                               ("1, " updist ", " lowdist ", " titledist):
+                               ("0, 0, 0, 0"))
+       if(cats[cat]=="")
+       {
+               cats[cat]=(2^catidx);
+               catidx++;
+       }
+       if(rules[rule]=="")
+       {
+               rules[rule]=rulidx;
+               rulidx++;
+       }
+       if(blockb==-1)
+       {
+               blockb=self
+               blockl=1
+               blockr=rule
+       }
+       else
+       {
+               if (index(name,"First>")!=0)
+               {
+                       dumpblock()
+               }
+               else if (index(name,"Last>")!=0)
+               {
+                       blockl+=(self-blockb)
+               }
+               else if((self==blockb+blockl)&&(rule==blockr)) blockl++
+               else
+               {
+                       dumpblock()
+               }
+       }
+}
+END {
+       dumpblock()
+       for(c in cats) print "#define GENCAT_"c" "cats[c]
+       print "#define MAX_UNI_CHAR " self
+       print "#define NUM_BLOCKS " blockidx
+       print "#define NUM_CONVBLOCKS " cblckidx
+       print "#define NUM_SPACEBLOCKS " sblckidx
+       print "#define NUM_LAT1BLOCKS " lat1idx
+        print "#define NUM_RULES " rulidx
+       for(r in rules)
+       {
+               printf "static const struct _convrule_ rule" rules[r] "={" r "};\n"
+       }
+       print "static const struct _charblock_ allchars[]={"
+       for(i=0;i<blockidx;i++)
+       {
+               printf "\t{" blocks[i] "}"
+               print (i<(blockidx-1))?",":"" 
+       }
+       print "};"
+       print "static const struct _charblock_ convchars[]={"
+       for(i=0;i<cblckidx;i++)
+       {
+               printf "\t{" cblcks[i] "}"
+               print (i<(cblckidx-1))?",":""
+       }
+        print "};"
+        print "static const struct _charblock_ spacechars[]={"
+        for(i=0;i<sblckidx;i++)
+        {       
+                printf "\t{" sblcks[i] "}"
+                print (i<(sblckidx-1))?",":""
+        }       
+       print "};"
+}
+'
+#      Output the C procedures code
+
+cat <<EOF
+
+/*
+       Obtain the reference to character rule by doing
+       binary search over the specified array of blocks.
+       To make checkattr shorter, the address of
+       nullrule is returned if the search fails:
+       this rule defines no category and no conversion
+       distances. The compare function returns 0 when
+       key->start is within the block. Otherwise
+       result of comparison of key->start and start of the
+       current block is returned as usual.
+*/
+
+static const struct _convrule_ nullrule={0,NUMCAT_CN,0,0,0,0};
+
+int blkcmp(const void *vk,const void *vb)
+{
+       const struct _charblock_ *key,*cur;
+       key=vk;
+       cur=vb;
+       if((key->start>=cur->start)&&(key->start<(cur->start+cur->length)))
+       {
+               return 0;
+       }
+       if(key->start>cur->start) return 1;
+       return -1;
+}
+
+static const struct _convrule_ *getrule(
+       const struct _charblock_ *blocks,
+       int numblocks,
+       int unichar)
+{
+       struct _charblock_ key={unichar,1,(void *)0};
+       struct _charblock_ *cb=bsearch(&key,blocks,numblocks,sizeof(key),blkcmp);
+       if(cb==(void *)0) return &nullrule;
+       return cb->rule;
+}
+       
+
+
+/*
+       Check whether a character (internal code) has certain attributes.
+       Attributes (category flags) may be ORed. The function ANDs
+       character category flags and the mask and returns the result.
+       If the character belongs to one of the categories requested,
+       the result will be nonzero.
+*/
+
+inline static int checkattr(int c,unsigned int catmask)
+{
+       return (catmask & (getrule(allchars,(c<256)?NUM_LAT1BLOCKS:NUM_BLOCKS,c)->category));
+}
+
+inline static int checkattr_s(int c,unsigned int catmask)
+{
+        return (catmask & (getrule(spacechars,NUM_SPACEBLOCKS,c)->category));
+}
+
+/*
+       Define predicate functions for some combinations of categories.
+*/
+
+#define unipred(p,m) \\
+int p(int c) \\
+{ \\
+       return checkattr(c,m); \\
+}
+
+#define unipred_s(p,m) \\
+int p(int c) \\
+{ \\
+        return checkattr_s(c,m); \\
+}
+
+/*
+       Make these rules as close to Hugs as possible.
+*/
+
+unipred(u_iswcntrl,GENCAT_CC)
+unipred(u_iswprint,~(GENCAT_ZL|GENCAT_ZP|GENCAT_CC|GENCAT_CF|GENCAT_CS|GENCAT_CO))
+unipred_s(u_iswspace,GENCAT_ZS)
+unipred(u_iswupper,(GENCAT_LU|GENCAT_LT))
+unipred(u_iswlower,GENCAT_LL)
+unipred(u_iswalpha,(GENCAT_LL|GENCAT_LU|GENCAT_LT|GENCAT_LM|GENCAT_LO))
+unipred(u_iswdigit,GENCAT_ND)
+
+unipred(u_iswalnum,(GENCAT_LT|GENCAT_LU|GENCAT_LL|GENCAT_LM|GENCAT_LO|
+                   GENCAT_MC|GENCAT_ME|GENCAT_MN|
+                   GENCAT_NO|GENCAT_ND|GENCAT_NL))
+
+#define caseconv(p,to) \\
+int p(int c) \\
+{ \\
+       const struct _convrule_ *rule=getrule(convchars,NUM_CONVBLOCKS,c);\\
+       if(rule==&nullrule) return c;\\
+       return c+rule->##to;\\
+}
+
+caseconv(u_towupper,updist)
+caseconv(u_towlower,lowdist)
+caseconv(u_towtitle,titledist)
+
+int u_gencat(int c)
+{
+       return getrule(allchars,NUM_BLOCKS,c)->catnumber;
+}
+
+EOF