1 \section[regex]{Haskell binding to the GNU regex library}
3 What follows is a straightforward binding to the functions
4 provided by the GNU regex library (the GNU group of functions with Perl
26 First, the higher level matching structure that the functions herein return:
31 -- GroupBounds hold the interval where a group
32 -- matched inside a string, e.g.
34 -- matching "reg(exp)" "a regexp" returns the pair (5,7) for the
35 -- (exp) group. (_PackedString indices start from 0)
38 type GroupBounds = (Int, Int)
42 = REmatch (Array Int GroupBounds) -- for $1, ... $n
43 GroupBounds -- for $` (everything before match)
44 GroupBounds -- for $& (entire matched string)
45 GroupBounds -- for $' (everything after)
46 GroupBounds -- for $+ (matched by last bracket)
47 {- debugging deriving Text -}
51 Prior to any matching (or searching), the regular expression
52 have to compiled into an internal form, the pattern buffer.
53 Represent the pattern buffer as a Haskell heap object:
58 data PatBuffer = PatBuffer# (MutableByteArray# _RealWorld)
59 instance _CCallable PatBuffer
60 instance _CReturnable PatBuffer
62 createPatBuffer :: Bool
64 createPatBuffer insensitive
65 = _casm_ `` %r = (int)sizeof(struct re_pattern_buffer); '' `thenPrimIO` \ sz ->
66 newCharArray (0,sz) `thenPrimIO` \ (_MutableByteArray _ pbuf#) ->
68 pbuf = PatBuffer# pbuf#
72 See comment re: fastmap below
74 ((_casm_ `` %r = (char *)malloc(256*sizeof(char)); '')::PrimIO _Addr) `thenPrimIO` \ tmap ->
76 Set up the translate table so that any lowercase
77 char. gets mapped to an uppercase one. Beacuse quoting
78 inside CAsmStrings is Problematic, we pass in the ordinal values
84 ((char *)%0)[i] = (char)i;
85 for(i=(int)%1;i <=(int)%2;i++)
86 ((char *)%0)[i] = i - ((int)%1 - (int)%3);
87 %r = 0; } '' tmap (ord 'a') (ord 'z') (ord 'A') `seqPrimIO`
88 _casm_ `` { ((struct re_pattern_buffer *)%0)->translate = %1; %r = 0; } '' pbuf tmap
90 _casm_ `` { ((struct re_pattern_buffer *)%0)->translate = 0; %r = 0; } '' pbuf) `seqPrimIO`
92 Use a fastmap to speed things up, would like to have the fastmap
93 in the Haskell heap, but it will get GCed before we can say regexp,
94 as the reference to it is buried inside a ByteArray :-(
96 ((_casm_ `` %r = (char *)malloc(256*sizeof(char)); '')::PrimIO _Addr) `thenPrimIO` \ fmap ->
97 _casm_ `` { ((struct re_pattern_buffer *)%0)->fastmap = %1; %r = 0; } '' pbuf fmap `seqPrimIO`
99 We want the compiler of the pattern to alloc. memory
102 _casm_ `` { ((struct re_pattern_buffer *)%0)->buffer = 0; %r = 0;} '' pbuf `seqPrimIO`
103 _casm_ `` { ((struct re_pattern_buffer *)%0)->allocated = 0; %r = 0;} '' pbuf `seqPrimIO`
108 @re_compile_pattern@ converts a regular expression into a pattern buffer,
111 Q: should we lift the syntax bits configuration up to the Haskell
116 re_compile_pattern :: _PackedString
120 re_compile_pattern str single_line_mode insensitive
121 = createPatBuffer insensitive `thenPrimIO` \ pbuf ->
122 (if single_line_mode then -- match a multi-line buffer
123 _casm_ `` %r = re_syntax_options = RE_PERL_SINGLELINE_SYNTAX; ''
125 _casm_ `` %r = re_syntax_options = RE_PERL_MULTILINE_SYNTAX; '') `seqPrimIO`
127 _casm_ `` %r=(int)re_compile_pattern((char *)%0,
129 (struct re_pattern_buffer *)%2); '' (_unpackPS str)
131 pbuf `thenPrimIO` \ err ->
133 -- No checking for how the compilation of the pattern went yet.
143 re_match :: PatBuffer
147 -> PrimIO (Maybe REmatch)
152 = ((if reg then -- record result of match in registers
153 _casm_ `` %r = (struct re_registers *)malloc(sizeof(struct re_registers *)); ''
155 _casm_ `` %r = (struct re_registers *)NULL; '')::PrimIO _Addr) `thenPrimIO` \ regs ->
156 _casm_ `` %r=(int)re_match((struct re_pattern_buffer *)%0,
160 (struct re_registers *)%4); '' pbuf
164 regs `thenPrimIO` \ match_res ->
165 if match_res == (-2) then
166 error "re_match: Internal error"
167 else if match_res < 0 then
168 _casm_ `` { free((struct re_registers *)%0); %r = 0; } '' regs `seqPrimIO`
171 build_re_match start (_lengthPS str) regs `thenPrimIO` \ arr ->
172 _casm_ `` { free((struct re_registers *)%0); %r = 0; } '' regs `seqPrimIO`
173 returnPrimIO (Just arr)
177 Matching on 2 strings is useful when you're dealing with multiple
178 buffers, which is something that could prove useful for PackedStrings,
179 as we don't want to stuff the contents of a file into one massive heap
180 chunk, but load (smaller chunks) on demand.
184 re_match2 :: PatBuffer
190 -> PrimIO (Maybe REmatch)
197 = ((if reg then -- record result of match in registers
198 _casm_ `` %r = (struct re_registers *)malloc(sizeof(struct re_registers *)); ''
200 _casm_ `` %r = (struct re_registers *)NULL; '')::PrimIO _Addr) `thenPrimIO` \ regs ->
201 _casm_ `` %r=(int)re_match_2((struct re_pattern_buffer *)%0,
207 (struct re_registers *)%6,
215 stop `thenPrimIO` \ match_res ->
216 if match_res == (-2) then
217 error "re_match2: Internal error"
218 else if match_res < 0 then
219 _casm_ `` { free((struct re_registers *)%0); %r = 0; } '' regs `seqPrimIO`
222 build_re_match start stop regs `thenPrimIO` \ arr ->
223 _casm_ `` { free((struct re_registers *)%0); %r = 0; } '' regs `seqPrimIO`
224 returnPrimIO (Just arr)
229 Find all the matches in a string.
233 re_search :: PatBuffer
238 -> PrimIO (Maybe REmatch)
239 re_search pbuf -- the compiled regexp
240 str -- the string to search
243 reg -- record result of match in registers
244 = (if reg then -- record result of match in registers
245 _casm_ `` %r = (struct re_registers *)malloc(sizeof(struct re_registers *)); ''
247 _casm_ `` %r = (struct re_registers *)NULL; '') `thenPrimIO` \ regs ->
248 _casm_ `` %r=(int)re_search((struct re_pattern_buffer *)%0,
253 (struct re_registers *)%5); '' pbuf
258 regs `thenPrimIO` \ match_res ->
259 if match_res== (-1) then
260 _casm_ `` { free((struct re_registers *)%0); %r = 0; } '' regs `seqPrimIO`
264 (st,en) = if range > start then
269 build_re_match st en regs `thenPrimIO` \ arr ->
270 _casm_ `` { free((struct re_registers *)%0); %r = 0; } '' regs `seqPrimIO`
271 returnPrimIO (Just arr)
279 re_search2 :: PatBuffer
286 -> PrimIO (Maybe REmatch)
294 = (if reg then -- record result of match in registers
295 _casm_ `` %r = (struct re_registers *)malloc(sizeof(struct re_registers *)); ''
297 _casm_ `` %r = (struct re_registers *)NULL; '') `thenPrimIO` \ regs ->
298 _casm_ `` %r=(int)re_search_2((struct re_pattern_buffer *)%0,
305 (struct re_registers *)%7,
314 stop `thenPrimIO` \ match_res ->
315 if match_res== (-1) then
316 _casm_ `` { free((struct re_registers *)%0); %r = 0; } '' regs `seqPrimIO`
320 (st,en) = if range > start then
325 build_re_match st en regs `thenPrimIO` \ arr ->
326 _casm_ `` { free((struct re_registers *)%0); %r = 0; } '' regs `seqPrimIO`
327 returnPrimIO (Just arr)
333 build_re_match :: Int
337 build_re_match str_start
340 = _casm_ `` %r=(int)(*(struct re_registers *)%0).num_regs; '' regs `thenPrimIO` \ len ->
341 match_reg_to_array regs len `thenPrimIO` \ (match_start,match_end,arr) ->
345 bef = (str_start,match_start) -- $'
346 aft = (match_end,str_end) -- $`
348 mtch = (match_start,match_end) -- $&
350 returnPrimIO (REmatch arr
356 match_reg_to_array regs len
357 = trundleIO regs (0,[]) len `thenPrimIO` \ (no,ls) ->
363 ((a,b):xs) -> (a,b,xs)
368 array (1,max 1 (no-1))
369 [ i := x | (i,x) <- zip [1..] ls'])
374 -> PrimIO (Int,[(Int,Int)])
375 trundleIO regs (i,acc) len
376 | i==len = returnPrimIO (i,reverse acc)
378 = _casm_ ``%r = (int)(((struct re_registers *)%0)->start)[(int)%1]; '' regs i `thenPrimIO` \ start ->
379 _casm_ ``%r = (int)(((struct re_registers *)%0)->end)[(int)%1]; '' regs i `thenPrimIO` \ end ->
381 acc' = (start,end):acc
383 if (start == (-1)) && (end == (-1)) then
384 returnPrimIO (i,reverse acc)
386 trundleIO regs (i+1,acc') len