3 # --------------------------------------------------------------------------
4 # This is the script to create the unicode chars property table
5 # Written by Dimitry Golubovsky (dimitry@golubovsky.org) as part
6 # of the Partial Unicode Support patch
8 # Adapted for use with GHC.
9 # License: see libraries/base/LICENSE
11 # -------------------------------------------------------------------------
13 # The script reads the file from the standard input,
14 # and outputs C code into the standard output.
15 # The C code contains the chars property table, and basic functions
16 # to access properties.
18 # Output the file header
# Emit the banner comment that opens the generated C file, followed by the
# include line for WCsubst.h.
# The script name is passed to basename as a quoted "$0" so that a path
# containing whitespace or glob characters is not split or expanded.
20 echo "/*-------------------------------------------------------------------------"
21 echo "This is an automatically generated file: do not edit"
22 echo "Generated by $(basename "$0") at $(date)"
23 echo "-------------------------------------------------------------------------*/"
25 echo "#include \"WCsubst.h\""
31 /* Unicode general categories, listed in the same order as in the Unicode
32 * standard -- this must be the same order as in GHC.Unicode.
36 NUMCAT_LU, /* Letter, Uppercase */
37 NUMCAT_LL, /* Letter, Lowercase */
38 NUMCAT_LT, /* Letter, Titlecase */
39 NUMCAT_LM, /* Letter, Modifier */
40 NUMCAT_LO, /* Letter, Other */
41 NUMCAT_MN, /* Mark, Non-Spacing */
42 NUMCAT_MC, /* Mark, Spacing Combining */
43 NUMCAT_ME, /* Mark, Enclosing */
44 NUMCAT_ND, /* Number, Decimal */
45 NUMCAT_NL, /* Number, Letter */
46 NUMCAT_NO, /* Number, Other */
47 NUMCAT_PC, /* Punctuation, Connector */
48 NUMCAT_PD, /* Punctuation, Dash */
49 NUMCAT_PS, /* Punctuation, Open */
50 NUMCAT_PE, /* Punctuation, Close */
51 NUMCAT_PI, /* Punctuation, Initial quote */
52 NUMCAT_PF, /* Punctuation, Final quote */
53 NUMCAT_PO, /* Punctuation, Other */
54 NUMCAT_SM, /* Symbol, Math */
55 NUMCAT_SC, /* Symbol, Currency */
56 NUMCAT_SK, /* Symbol, Modifier */
57 NUMCAT_SO, /* Symbol, Other */
58 NUMCAT_ZS, /* Separator, Space */
59 NUMCAT_ZL, /* Separator, Line */
60 NUMCAT_ZP, /* Separator, Paragraph */
61 NUMCAT_CC, /* Other, Control */
62 NUMCAT_CF, /* Other, Format */
63 NUMCAT_CS, /* Other, Surrogate */
64 NUMCAT_CO, /* Other, Private Use */
65 NUMCAT_CN /* Other, Not Assigned */
70 unsigned int category; /* GENCAT_xx bit flag of the general category */
71 unsigned int catnumber; /* NUMCAT_xx enumerator of the general category */
82 const struct _convrule_ *rule; /* rule shared by every character of the block */
87 # Convert the stdin file to the C table
# digs lists the sixteen hexadecimal digits in ascending order of value.
99 digs="0123456789ABCDEF"
# hex[] maps the digit found at 1-based position i+1 of digs to i, that is,
# to its numeric value (assuming the enclosing loop runs i over 0..15).
102 hex[substr(digs,i+1,1)]=i;
# An empty argument is rendered as the text -1.
107 if(a=="") return "-1"
# Fold in the next hex digit: shift the accumulator by one hex place and add
# the value that hex[] holds for the i-th character of a.
116 acc=acc*16+hex[substr(a,i,1)];
# Describe the finished block as "start, length, &rule<N>", where <N> is the
# index that rules[] holds for the rule string of the block.
122 blkd=blockb ", " blockl ", &rule" rules[blockr]
123 blocks[blockidx]=blkd
# Count the blocks whose start is at most 256; the count is later printed as
# NUM_LAT1BLOCKS, which the generated C code uses to bound lookups of
# characters below 256.
125 if(blockb<=256) lat1idx++
# Split the rule string into its comma-separated fields.
126 split(blockr,rsp,",")
# The third field reads " 1" (a separating space, then the flag) for rules
# that carry conversion distances; those blocks are also stored in cblcks[].
127 if(substr(rsp[3],2,1)=="1")
129 cblcks[cblckidx]=blkd
# Blocks whose category flag is GENCAT_ZS are also stored in sblcks[].
132 if(rsp[1]=="GENCAT_ZS")
134 sblcks[sblckidx]=blkd
# With no upper, lower or title mapping at all, clear convpos so that the
# rule built below does not take the branch that stores distances.
149 if((up==0)&&(low==0)&&(title==0)) convpos=0
# A zero title mapping is replaced by the character itself.
152 if(title==0) title=self
# Rule text: category flag, category number, then (when convpos is 1) the
# flag 1 followed by the upper, lower and title distances.
156 rule="GENCAT_"cat", NUMCAT_"cat", "((convpos==1)?
157 ("1, " updist ", " lowdist ", " titledist):
161 cats[cat]=(2^catidx); # flag value for this category is 2 to the power catidx
# Names containing First> and Last> are tested separately from ordinary
# entries.
177 if (index(name,"First>")!=0)
181 else if (index(name,"Last>")!=0)
183 blockl+=(self-blockb) # grow the block by the distance from its start to self
185 else if((self==blockb+blockl)&&(rule==blockr)) blockl++ # next contiguous code point with the same rule: extend the block
# Emit one flag macro per category recorded in cats[], then the size
# constants. MAX_UNI_CHAR is the value of self left by the last input record.
194 for(c in cats) print "#define GENCAT_"c" "cats[c]
195 print "#define MAX_UNI_CHAR " self
196 print "#define NUM_BLOCKS " blockidx
197 print "#define NUM_CONVBLOCKS " cblckidx
198 print "#define NUM_SPACEBLOCKS " sblckidx
199 print "#define NUM_LAT1BLOCKS " lat1idx
200 print "#define NUM_RULES " rulidx
# Emit one rule object per rule string, named after its index in rules[].
# Data is passed as printf arguments instead of being spliced into the format
# string, so a % sign in the input can never act as a format directive.
203 printf "static const struct _convrule_ rule%s={%s};\n", rules[r], r
# Table of all blocks; entries are separated by commas, none after the last.
205 print "static const struct _charblock_ allchars[]={"
206 for(i=0;i<blockidx;i++)
208 printf "\t{%s}", blocks[i]
209 print (i<(blockidx-1))?",":""
# Table of the blocks stored in cblcks[] (rules with conversion distances).
212 print "static const struct _charblock_ convchars[]={"
213 for(i=0;i<cblckidx;i++)
215 printf "\t{%s}", cblcks[i]
216 print (i<(cblckidx-1))?",":""
# Table of the blocks stored in sblcks[] (category GENCAT_ZS).
219 print "static const struct _charblock_ spacechars[]={"
220 for(i=0;i<sblckidx;i++)
222 printf "\t{%s}", sblcks[i]
223 print (i<(sblckidx-1))?",":""
228 # Output the C procedures code
233 Obtain the reference to character rule by doing
234 binary search over the specified array of blocks.
235 To make checkattr shorter, the address of
236 nullrule is returned if the search fails:
237 this rule defines no category and no conversion
238 distances. The compare function returns 0 when
239 key->start is within the block. Otherwise, the
240 result of comparing key->start with the start of the
241 current block is returned as usual.
/* Rule handed back by getrule when no block matches: the category flags are
   zero, the category number is NUMCAT_CN and all remaining fields are zero. */
244 static const struct _convrule_ nullrule={0,NUMCAT_CN,0,0,0,0};
/* Comparison callback handed to bsearch in getrule; bsearch passes the search
   key as the first argument and the candidate array element as the second. */
246 int blkcmp(const void *vk,const void *vb)
248 const struct _charblock_ *key,*cur;
251 if((key->start>=cur->start)&&(key->start<(cur->start+cur->length))) /* key->start lies inside [start, start+length) of cur */
255 if(key->start>cur->start) return 1; /* key starts after the start of cur */
/* Binary-search the block table passed in blocks for the block that contains
   unichar. The key is a one-character block starting at unichar with a null
   rule pointer. When bsearch finds nothing, the address of nullrule is
   returned. The search result is held through a pointer to const because it
   points into the const table supplied by the caller. */
259 static const struct _convrule_ *getrule(
260 const struct _charblock_ *blocks,
264 struct _charblock_ key={unichar,1,(void *)0};
265 const struct _charblock_ *cb=bsearch(&key,blocks,numblocks,sizeof(key),blkcmp);
266 if(cb==(void *)0) return &nullrule;
273 Check whether a character (internal code) has certain attributes.
274 Attributes (category flags) may be ORed. The function ANDs
275 character category flags and the mask and returns the result.
276 If the character belongs to one of the categories requested,
277 the result will be nonzero.
/* Return catmask ANDed with the category flags of c, looked up in allchars.
   Characters below 256 are searched only among the first NUM_LAT1BLOCKS
   blocks; every other character is searched among all NUM_BLOCKS blocks.
   The storage-class specifier is written first, since placing it after
   another specifier is an obsolescent form (C11 6.11.5). */
280 static inline int checkattr(int c,unsigned int catmask)
282 return (catmask & (getrule(allchars,(c<256)?NUM_LAT1BLOCKS:NUM_BLOCKS,c)->category));
/* Return catmask ANDed with the category flags of c, looked up in the
   spacechars table (NUM_SPACEBLOCKS blocks) instead of allchars.
   The storage-class specifier is written first, since placing it after
   another specifier is an obsolescent form (C11 6.11.5). */
285 static inline int checkattr_s(int c,unsigned int catmask)
287 return (catmask & (getrule(spacechars,NUM_SPACEBLOCKS,c)->category));
291 Define predicate functions for some combinations of categories.
/* unipred(p,m): generates the predicate named p, whose result is
   checkattr(c,m) for its argument c. */
294 #define unipred(p,m) \\
297 return checkattr(c,m); \\
/* unipred_s(p,m): like unipred, but the generated predicate p returns
   checkattr_s(c,m), which consults the spacechars table. */
300 #define unipred_s(p,m) \\
303 return checkattr_s(c,m); \\
307 Make these rules as close to Hugs as possible.
/* Control characters: category CC only. */
310 unipred(u_iswcntrl,GENCAT_CC)
/* Printable: every letter, mark, number, punctuation and symbol category,
   plus the space separator ZS. The ZL, ZP, CC, CF, CS, CO and CN categories
   are not included. */
311 unipred(u_iswprint, \
312 (GENCAT_MC | GENCAT_NO | GENCAT_SK | GENCAT_ME | GENCAT_ND | \
313 GENCAT_PO | GENCAT_LT | GENCAT_PC | GENCAT_SM | GENCAT_ZS | \
314 GENCAT_LU | GENCAT_PD | GENCAT_SO | GENCAT_PE | GENCAT_PF | \
315 GENCAT_PS | GENCAT_SC | GENCAT_LL | GENCAT_LM | GENCAT_PI | \
316 GENCAT_NL | GENCAT_MN | GENCAT_LO))
/* The space test is generated with unipred_s and so uses checkattr_s. */
317 unipred_s(u_iswspace,GENCAT_ZS)
/* Titlecase letters count as upper case. */
318 unipred(u_iswupper,(GENCAT_LU|GENCAT_LT))
319 unipred(u_iswlower,GENCAT_LL)
/* Alphabetic: all five letter categories. */
320 unipred(u_iswalpha,(GENCAT_LL|GENCAT_LU|GENCAT_LT|GENCAT_LM|GENCAT_LO))
/* Digits: decimal numbers (ND) only. */
321 unipred(u_iswdigit,GENCAT_ND)
/* Alphanumeric: all letter categories, all three mark categories and all
   three number categories. */
323 unipred(u_iswalnum,(GENCAT_LT|GENCAT_LU|GENCAT_LL|GENCAT_LM|GENCAT_LO|
324 GENCAT_MC|GENCAT_ME|GENCAT_MN|
325 GENCAT_NO|GENCAT_ND|GENCAT_NL))
/* caseconv(p,to): generates the conversion function named p. The rule for c
   is looked up in convchars; when the lookup yields nullrule, c is returned
   unchanged, otherwise the distance stored in rule member to is added to c.
   The member name is substituted directly after the arrow: pasting it onto
   the arrow with the ## operator does not form a valid preprocessing token
   and is rejected by current compilers. */
327 #define caseconv(p,to) \\
330 const struct _convrule_ *rule=getrule(convchars,NUM_CONVBLOCKS,c);\\
331 if(rule==&nullrule) return c;\\
332 return c+rule->to;\\
/* One conversion function per distance member: upper, lower and title. */
335 caseconv(u_towupper,updist)
336 caseconv(u_towlower,lowdist)
337 caseconv(u_towtitle,titledist)
341 return getrule(allchars,NUM_BLOCKS,c)->catnumber; /* category number of c; always searches all NUM_BLOCKS blocks of allchars */