[project @ 2005-10-22 00:37:01 by ross]
[haskell-directory.git] / cbits / ubconfc
1 #!/bin/sh
2
3 # --------------------------------------------------------------------------
4 # This is the script to create the unicode chars property table 
5 # Written by Dimitry Golubovsky (dimitry@golubovsky.org) as part
6 # of the Partial Unicode Support patch
7 #
8 # Adopted for use with GHC.
9 # License: see libraries/base/LICENSE
10 #
11 # -------------------------------------------------------------------------
12
13 #       The script reads the file from the standard input,
14 #       and outputs C code into the standard output.
15 #       The C code contains the chars property table, and basic functions
16 #       to access properties.
17
18 #       Output the file header
19
20 echo "/*-------------------------------------------------------------------------"
21 echo "This is an automatically generated file: do not edit"
22 echo "Generated by `basename $0` at `date`"
23 echo "-------------------------------------------------------------------------*/"
24 echo
25 echo "#include \"WCsubst.h\""
26
27 #       Define structures
28
29 cat <<EOF
30
31 /* Unicode general categories, listed in the same order as in the Unicode
32  * standard -- this must be the same order as in GHC.Unicode.
33  */
34
35 enum {
36     NUMCAT_LU,  /* Letter, Uppercase */
37     NUMCAT_LL,  /* Letter, Lowercase */
38     NUMCAT_LT,  /* Letter, Titlecase */
39     NUMCAT_LM,  /* Letter, Modifier */
40     NUMCAT_LO,  /* Letter, Other */
41     NUMCAT_MN,  /* Mark, Non-Spacing */
42     NUMCAT_MC,  /* Mark, Spacing Combining */
43     NUMCAT_ME,  /* Mark, Enclosing */
44     NUMCAT_ND,  /* Number, Decimal */
45     NUMCAT_NL,  /* Number, Letter */
46     NUMCAT_NO,  /* Number, Other */
47     NUMCAT_PC,  /* Punctuation, Connector */
48     NUMCAT_PD,  /* Punctuation, Dash */
49     NUMCAT_PS,  /* Punctuation, Open */
50     NUMCAT_PE,  /* Punctuation, Close */
51     NUMCAT_PI,  /* Punctuation, Initial quote */
52     NUMCAT_PF,  /* Punctuation, Final quote */
53     NUMCAT_PO,  /* Punctuation, Other */
54     NUMCAT_SM,  /* Symbol, Math */
55     NUMCAT_SC,  /* Symbol, Currency */
56     NUMCAT_SK,  /* Symbol, Modifier */
57     NUMCAT_SO,  /* Symbol, Other */
58     NUMCAT_ZS,  /* Separator, Space */
59     NUMCAT_ZL,  /* Separator, Line */
60     NUMCAT_ZP,  /* Separator, Paragraph */
61     NUMCAT_CC,  /* Other, Control */
62     NUMCAT_CF,  /* Other, Format */
63     NUMCAT_CS,  /* Other, Surrogate */
64     NUMCAT_CO,  /* Other, Private Use */
65     NUMCAT_CN   /* Other, Not Assigned */
66 };
67
68 struct _convrule_ 
69
70         unsigned int category;
71         unsigned int catnumber;
72         int possible;
73         int updist;
74         int lowdist; 
75         int titledist;
76 };
77
78 struct _charblock_ 
79
80         int start;
81         int length;
82         const struct _convrule_ *rule;
83 };
84
85 EOF
86
87 #       Convert the stdin file to the C table
88
89 awk '
90 BEGIN {
91         FS=";"
92         catidx=0
93         rulidx=0
94         blockidx=0
95         cblckidx=0
96         sblckidx=0
97         blockb=-1
98         blockl=0
99         digs="0123456789ABCDEF"
100         for(i=0;i<16;i++)
101         {
102                 hex[substr(digs,i+1,1)]=i;
103         }
104 }
105 function em1(a)
106 {
107         if(a=="") return "-1"
108         return "0x"a
109 }
110 function h2d(a)
111 {
112         l=length(a)
113         acc=0
114         for(i=1;i<=l;i++)
115         {
116                 acc=acc*16+hex[substr(a,i,1)];
117         }
118         return acc
119 }
120 function dumpblock()
121 {
122         blkd=blockb ", " blockl ", &rule" rules[blockr]
123         blocks[blockidx]=blkd
124         blockidx++
125         if(blockb<=256) lat1idx++
126         split(blockr,rsp,",")
127         if(substr(rsp[3],2,1)=="1")
128         {
129                 cblcks[cblckidx]=blkd
130                 cblckidx++
131         }
132         if(rsp[1]=="GENCAT_ZS")
133         {
134                 sblcks[sblckidx]=blkd
135                 sblckidx++
136         }
137         blockb=self
138         blockl=1
139         blockr=rule
140 }
141 {
142         name=$2
143         cat=toupper($3)
144         self=h2d($1)
145         up=h2d($13)
146         low=h2d($14)
147         title=h2d($15)
148         convpos=1
149         if((up==0)&&(low==0)&&(title==0)) convpos=0
150         if(up==0) up=self
151         if(low==0) low=self
152         if(title==0) title=self
153         updist=up-self
154         lowdist=low-self
155         titledist=title-self
156         rule="GENCAT_"cat", NUMCAT_"cat", "((convpos==1)?
157                                 ("1, " updist ", " lowdist ", " titledist):
158                                 ("0, 0, 0, 0"))
159         if(cats[cat]=="")
160         {
161                 cats[cat]=(2^catidx);
162                 catidx++;
163         }
164         if(rules[rule]=="")
165         {
166                 rules[rule]=rulidx;
167                 rulidx++;
168         }
169         if(blockb==-1)
170         {
171                 blockb=self
172                 blockl=1
173                 blockr=rule
174         }
175         else
176         {
177                 if (index(name,"First>")!=0)
178                 {
179                         dumpblock()
180                 }
181                 else if (index(name,"Last>")!=0)
182                 {
183                         blockl+=(self-blockb)
184                 }
185                 else if((self==blockb+blockl)&&(rule==blockr)) blockl++
186                 else
187                 {
188                         dumpblock()
189                 }
190         }
191 }
192 END {
193         dumpblock()
194         for(c in cats) print "#define GENCAT_"c" "cats[c]
195         print "#define MAX_UNI_CHAR " self
196         print "#define NUM_BLOCKS " blockidx
197         print "#define NUM_CONVBLOCKS " cblckidx
198         print "#define NUM_SPACEBLOCKS " sblckidx
199         print "#define NUM_LAT1BLOCKS " lat1idx
200         print "#define NUM_RULES " rulidx
201         for(r in rules)
202         {
203                 printf "static const struct _convrule_ rule" rules[r] "={" r "};\n"
204         }
205         print "static const struct _charblock_ allchars[]={"
206         for(i=0;i<blockidx;i++)
207         {
208                 printf "\t{" blocks[i] "}"
209                 print (i<(blockidx-1))?",":"" 
210         }
211         print "};"
212         print "static const struct _charblock_ convchars[]={"
213         for(i=0;i<cblckidx;i++)
214         {
215                 printf "\t{" cblcks[i] "}"
216                 print (i<(cblckidx-1))?",":""
217         }
218         print "};"
219         print "static const struct _charblock_ spacechars[]={"
220         for(i=0;i<sblckidx;i++)
221         {       
222                 printf "\t{" sblcks[i] "}"
223                 print (i<(sblckidx-1))?",":""
224         }       
225         print "};"
226 }
227 '
228 #       Output the C procedures code
229
230 cat <<EOF
231
232 /*
233         Obtain the reference to character rule by doing
234         binary search over the specified array of blocks.
235         To make checkattr shorter, the address of
236         nullrule is returned if the search fails:
237         this rule defines no category and no conversion
238         distances. The compare function returns 0 when
239         key->start is within the block. Otherwise
240         result of comparison of key->start and start of the
241         current block is returned as usual.
242 */
243
244 static const struct _convrule_ nullrule={0,NUMCAT_CN,0,0,0,0};
245
246 int blkcmp(const void *vk,const void *vb)
247 {
248         const struct _charblock_ *key,*cur;
249         key=vk;
250         cur=vb;
251         if((key->start>=cur->start)&&(key->start<(cur->start+cur->length)))
252         {
253                 return 0;
254         }
255         if(key->start>cur->start) return 1;
256         return -1;
257 }
258
259 static const struct _convrule_ *getrule(
260         const struct _charblock_ *blocks,
261         int numblocks,
262         int unichar)
263 {
264         struct _charblock_ key={unichar,1,(void *)0};
265         struct _charblock_ *cb=bsearch(&key,blocks,numblocks,sizeof(key),blkcmp);
266         if(cb==(void *)0) return &nullrule;
267         return cb->rule;
268 }
269         
270
271
272 /*
273         Check whether a character (internal code) has certain attributes.
274         Attributes (category flags) may be ORed. The function ANDs
275         character category flags and the mask and returns the result.
276         If the character belongs to one of the categories requested,
277         the result will be nonzero.
278 */
279
280 inline static int checkattr(int c,unsigned int catmask)
281 {
282         return (catmask & (getrule(allchars,(c<256)?NUM_LAT1BLOCKS:NUM_BLOCKS,c)->category));
283 }
284
285 inline static int checkattr_s(int c,unsigned int catmask)
286 {
287         return (catmask & (getrule(spacechars,NUM_SPACEBLOCKS,c)->category));
288 }
289
290 /*
291         Define predicate functions for some combinations of categories.
292 */
293
294 #define unipred(p,m) \\
295 int p(int c) \\
296 { \\
297         return checkattr(c,m); \\
298 }
299
300 #define unipred_s(p,m) \\
301 int p(int c) \\
302 { \\
303         return checkattr_s(c,m); \\
304 }
305
306 /*
307         Make these rules as close to Hugs as possible.
308 */
309
310 unipred(u_iswcntrl,GENCAT_CC)
311 unipred(u_iswprint, \
312 (GENCAT_MC | GENCAT_NO | GENCAT_SK | GENCAT_ME | GENCAT_ND | \
313   GENCAT_PO | GENCAT_LT | GENCAT_PC | GENCAT_SM | GENCAT_ZS | \
314   GENCAT_LU | GENCAT_PD | GENCAT_SO | GENCAT_PE | GENCAT_PF | \
315   GENCAT_PS | GENCAT_SC | GENCAT_LL | GENCAT_LM | GENCAT_PI | \
316   GENCAT_NL | GENCAT_MN | GENCAT_LO))
317 unipred_s(u_iswspace,GENCAT_ZS)
318 unipred(u_iswupper,(GENCAT_LU|GENCAT_LT))
319 unipred(u_iswlower,GENCAT_LL)
320 unipred(u_iswalpha,(GENCAT_LL|GENCAT_LU|GENCAT_LT|GENCAT_LM|GENCAT_LO))
321 unipred(u_iswdigit,GENCAT_ND)
322
323 unipred(u_iswalnum,(GENCAT_LT|GENCAT_LU|GENCAT_LL|GENCAT_LM|GENCAT_LO|
324                     GENCAT_MC|GENCAT_ME|GENCAT_MN|
325                     GENCAT_NO|GENCAT_ND|GENCAT_NL))
326
327 #define caseconv(p,to) \\
328 int p(int c) \\
329 { \\
330         const struct _convrule_ *rule=getrule(convchars,NUM_CONVBLOCKS,c);\\
331         if(rule==&nullrule) return c;\\
332         return c+rule->##to;\\
333 }
334
335 caseconv(u_towupper,updist)
336 caseconv(u_towlower,lowdist)
337 caseconv(u_towtitle,titledist)
338
339 int u_gencat(int c)
340 {
341         return getrule(allchars,NUM_BLOCKS,c)->catnumber;
342 }
343
344 EOF