1 \section[regex]{Haskell binding to the GNU regex library}
3 What follows is a straightforward binding to the functions
4 provided by the GNU regex library (the GNU group of functions with Perl
8 {-# OPTIONS -#include "cbits/ghcRegex.h" #-}
24 import Array ( array, bounds, (!) )
25 import PrelArr ( MutableByteArray(..), Array(..) )
26 import PrelGHC ( MutableByteArray# )
32 First, the higher level matching structure that the functions herein
36 -- GroupBounds hold the interval where a group
37 -- matched inside a string, e.g.
39 -- matching "reg(exp)" "a regexp" returns the pair (5,7) for the
40 -- (exp) group. (PackedString indices start from 0)
42 type GroupBounds = (Int, Int)
45 = REmatch (Array Int GroupBounds) -- for $1, ... $n
46 GroupBounds -- for $` (everything before match)
47 GroupBounds -- for $& (entire matched string)
48 GroupBounds -- for $' (everything after)
49 GroupBounds -- for $+ (matched by last bracket)
52 Prior to any matching (or searching), the regular expression
53 have to compiled into an internal form, the pattern buffer.
54 Represent the pattern buffer as a Haskell heap object:
57 data PatBuffer = PatBuffer# (MutableByteArray# RealWorld)
58 instance CCallable PatBuffer
59 instance CReturnable PatBuffer
61 createPatBuffer :: Bool -> IO PatBuffer
63 createPatBuffer insensitive
64 = _casm_ ``%r = (int)sizeof(struct re_pattern_buffer);'' >>= \ sz ->
65 stToIO (newCharArray (0,sz)) >>= \ (MutableByteArray _ pbuf#) ->
67 pbuf = PatBuffer# pbuf#
71 See comment re: fastmap below
73 ((_casm_ ``%r = (char *)malloc(256*sizeof(char));'')::IO Addr) >>= \ tmap ->
75 Set up the translate table so that any lowercase
76 char. gets mapped to an uppercase one. Beacuse quoting
77 inside CAsmStrings is Problematic, we pass in the ordinal values
83 ((char *)%0)[i] = (char)i;
84 for(i=(int)%1;i <=(int)%2;i++)
85 ((char *)%0)[i] = i - ((int)%1 - (int)%3);
86 }'' tmap (ord 'a') (ord 'z') (ord 'A') >>
87 _casm_ ``((struct re_pattern_buffer *)%0)->translate = %1; '' pbuf tmap
89 _casm_ ``((struct re_pattern_buffer *)%0)->translate = 0; '' pbuf) >>
91 Use a fastmap to speed things up, would like to have the fastmap
92 in the Haskell heap, but it will get GCed before we can say regexp,
93 as the reference to it is buried inside a ByteArray :-(
95 ((_casm_ ``%r = (char *)malloc(256*sizeof(char));'')::IO Addr) >>= \ fmap ->
96 _casm_ `` ((struct re_pattern_buffer *)%0)->fastmap = %1; '' pbuf fmap >>
98 We want the compiler of the pattern to alloc. memory
101 _casm_ `` ((struct re_pattern_buffer *)%0)->buffer = 0; '' pbuf >>
102 _casm_ `` ((struct re_pattern_buffer *)%0)->allocated = 0; '' pbuf >>
106 @re_compile_pattern@ converts a regular expression into a pattern buffer,
109 Q: should we lift the syntax bits configuration up to the Haskell
113 re_compile_pattern :: PackedString -- pattern to compile
114 -> Bool -- True <=> assume single-line mode
115 -> Bool -- True <=> case-insensitive
118 re_compile_pattern str single_line_mode insensitive
119 = createPatBuffer insensitive >>= \ pbuf ->
120 (if single_line_mode then -- match a multi-line buffer
121 _casm_ ``re_syntax_options = RE_PERL_SINGLELINE_SYNTAX;''
123 _casm_ ``re_syntax_options = RE_PERL_MULTILINE_SYNTAX;'') >>
125 _casm_ `` (int)re_compile_pattern((char *)%0,
127 (struct re_pattern_buffer *)%2);''
128 (unpackPS str) (lengthPS str) pbuf >>= \ () ->
130 -- No checking for how the compilation of the pattern went yet.
137 Each call to re_match uses a new re_registers structures, so we need
138 to ask the regex library to allocate enough memory to store the
139 registers in each time. That's what the line '... REGS_UNALLOCATED'
143 re_match :: PatBuffer -- compiled regexp
144 -> PackedString -- string to match
145 -> Int -- start position
146 -> Bool -- True <=> record results in registers
147 -> IO (Maybe REmatch)
149 re_match pbuf str start reg
150 = ((if reg then -- record result of match in registers
151 _casm_ ``%r = (struct re_registers *)malloc(sizeof(struct re_registers *));''
153 _casm_ ``%r = (struct re_registers *)NULL;'')::IO Addr) >>= \ regs ->
154 _casm_ ``((struct re_pattern_buffer *)%0)->regs_allocated = REGS_UNALLOCATED;
155 %r=(int)re_match((struct re_pattern_buffer *)%0,
159 (struct re_registers *)%4);'' pbuf
163 regs >>= \ match_res ->
164 if match_res == (-2) then
165 error "re_match: Internal error"
166 else if match_res < 0 then
167 _casm_ ``free((struct re_registers *)%0); '' regs >>
170 build_re_match start (lengthPS str) regs >>= \ arr ->
171 _casm_ ``free(((struct re_registers *)%0)->start);
172 free(((struct re_registers *)%0)->end);
173 free((struct re_registers *)%0); '' regs >>
177 Matching on 2 strings is useful when you're dealing with multiple
178 buffers, which is something that could prove useful for PackedStrings,
179 as we don't want to stuff the contents of a file into one massive heap
180 chunk, but load (smaller chunks) on demand.
183 re_match2 :: PatBuffer
189 -> IO (Maybe REmatch)
191 re_match2 pbuf str1 str2 start stop reg
192 = ((if reg then -- record result of match in registers
193 _casm_ ``%r = (struct re_registers *)malloc(sizeof(struct re_registers *));''
195 _casm_ ``%r = (struct re_registers *)NULL;'')::IO Addr) >>= \ regs ->
196 _casm_ ``%r=(int)re_match_2((struct re_pattern_buffer *)%0,
202 (struct re_registers *)%6,
210 stop >>= \ match_res ->
211 if match_res == (-2) then
212 error "re_match2: Internal error"
213 else if match_res < 0 then
214 _casm_ ``free((struct re_registers *)%0); '' regs >>
217 build_re_match start stop regs >>= \ arr ->
218 _casm_ ``free((struct re_registers *)%0); '' regs >>
222 Find all the matches in a string:
224 re_search :: PatBuffer -- the compiled regexp
225 -> PackedString -- the string to search
226 -> Int -- start index
228 -> Bool -- record result of match in registers
229 -> IO (Maybe REmatch)
231 re_search pbuf str start range reg
232 = (if reg then -- record result of match in registers
233 _casm_ ``%r = (struct re_registers *)malloc(sizeof(struct re_registers *));''
235 _casm_ ``%r = (struct re_registers *)NULL;'') >>= \ regs ->
236 _casm_ ``%r=(int)re_search((struct re_pattern_buffer *)%0,
241 (struct re_registers *)%5);'' pbuf
246 regs >>= \ match_res ->
247 if match_res== (-1) then
248 _casm_ `` free((struct re_registers *)%0); '' regs >>
252 (st,en) = if range > start then
257 build_re_match st en regs >>= \ arr ->
258 _casm_ ``free((struct re_registers *)%0); '' regs >>
262 Double buffer search:
264 re_search2 :: PatBuffer
271 -> IO (Maybe REmatch)
273 re_search2 pbuf str1 str2 start range stop reg
275 = (if reg then -- record result of match in registers
276 _casm_ ``%r = (struct re_registers *)malloc(sizeof(struct re_registers *));''
278 _casm_ ``%r = (struct re_registers *)NULL;'') >>= \ regs ->
279 _casm_ ``%r=(int)re_search_2((struct re_pattern_buffer *)%0,
286 (struct re_registers *)%7,
295 stop >>= \ match_res ->
296 if match_res== (-1) then
297 _casm_ `` free((struct re_registers *)%0); '' regs >>
301 (st,en) = if range > start then
306 build_re_match st en regs >>= \ arr ->
307 _casm_ `` free((struct re_registers *)%0); '' regs >>
312 build_re_match :: Int
317 build_re_match str_start str_end regs
318 = _casm_ ``%r=(int)(*(struct re_registers *)%0).num_regs;'' regs >>= \ len ->
319 match_reg_to_array regs len >>= \ (match_start,match_end,arr) ->
323 bef = (str_start,match_start) -- $'
324 aft = (match_end,str_end) -- $`
326 mtch = (match_start,match_end) -- $&
334 match_reg_to_array regs len
335 = trundleIO regs (0,[]) len >>= \ (no,ls) ->
341 ((a,b):xs) -> (a,b,xs)
346 array (1,max 1 (no-1))
347 [ (i, x) | (i,x) <- zip [1..] ls'])
352 -> IO (Int,[(Int,Int)])
354 trundleIO regs (i,acc) len
355 | i==len = return (i,reverse acc)
357 = _casm_ ``%r = (int)(((struct re_registers *)%0)->start)[(int)%1];'' regs i >>= \ start ->
358 _casm_ ``%r = (int)(((struct re_registers *)%0)->end)[(int)%1];'' regs i >>= \ end ->
360 acc' = (start,end):acc
362 if (start == (-1)) && (end == (-1)) then
363 return (i,reverse acc)
365 trundleIO regs (i+1,acc') len