1 \section[match]{PackedString functions for matching}
3 This module provides regular expression matching and substitution
4 at the PackedString level. It is built on top of the GNU Regex
5 library, modified to handle Perl regular expression syntax.
6 For a complete description of the Perl syntax, see `man perlre`
7 or have a gander in (Programming|Learning) Perl. Here's
10 ^ matches the beginning of line
12 \b matches word boundary
13 \B matches non-word boundary
14 \w matches a word (alphanumeric) character
15 \W matches a non-word character
17 \D matches a non-digit
19 \S matches non-whitespace
20 \A matches beginning of buffer
21 \Z matches end-of-buffer
22 . matches any (bar newline in single-line mode)
23 + matches 1 or more times
24 * matches 0 or more times
26 {n,m} matches >=n and <=m atoms
27 {n,} matches at least n times
29 [..] matches any character member of char class.
30 (..) if the pattern inside the parens matches, then the ith group is bound
32 \digit matches whatever the ith group matched.
77 _tailPS and _dropPS in PS.lhs are not to my liking, use
82 _dropPS' x str = _substrPS str x (_lengthPS str)
88 _substrPS x 1 (_lengthPS x)
93 \subsection[ps-matching]{PackedString matching}
95 Posix matching, returning an array of the intervals that
96 the individual groups matched within the string.
100 matchPS :: _PackedString -- reg. exp
101 -> _PackedString -- string to match
104 matchPS reg str flags
106 insensitive = 'i' `elem` flags
107 mode = 's' `elem` flags
109 unsafePerformPrimIO (
110 re_compile_pattern reg mode insensitive `thenPrimIO` \ pat ->
111 re_match pat str 0 True)
114 match2PS :: _PackedString -- reg. exp
115 -> _PackedString -- string1 to match
116 -> _PackedString -- string2 to match
119 match2PS reg str1 str2 flags
121 insensitive = 'i' `elem` flags
122 mode = 's' `elem` flags
123 len1 = _lengthPS str1
124 len2 = _lengthPS str2
126 unsafePerformPrimIO (
127 re_compile_pattern reg mode insensitive `thenPrimIO` \ pat ->
128 re_match2 pat str1 str2 0 (len1+len2) True)
132 PackedString front-end to searching with GNU Regex
136 searchPS :: _PackedString -- reg. exp
137 -> _PackedString -- string to match
140 searchPS reg str flags
142 insensitive = 'i' `elem` flags
143 mode = 's' `elem` flags
145 unsafePerformPrimIO (
146 re_compile_pattern reg mode insensitive `thenPrimIO` \ pat ->
154 search2PS :: _PackedString -- reg. exp
155 -> _PackedString -- string1 to match
156 -> _PackedString -- string2 to match
159 search2PS reg str1 str2 flags
161 insensitive = 'i' `elem` flags
162 mode = 's' `elem` flags
163 len1 = _lengthPS str1
164 len2 = _lengthPS str2
167 unsafePerformPrimIO (
168 re_compile_pattern reg mode insensitive `thenPrimIO` \ pat ->
181 @_substrPS s st end@ cuts out the chunk in \tr{s} between \tr{st} and \tr{end}, inclusive.
182 The \tr{Regex} registers represent substrings by storing the start and the end point plus
183 one (st==end => empty string), so we use @_chunkPS@ instead.
188 _chunkPS :: _PackedString
191 _chunkPS str (st,end)
195 _substrPS str st (max 0 (end-1))
199 Perl-like match and substitute
203 substPS :: _PackedString -- reg. exp
204 -> _PackedString -- replacement
206 -> _PackedString -- string
214 global = 'g' `elem` flags
215 case_insensitive = 'i' `elem` flags
216 mode = 's' `elem` flags -- single-line mode
217 pat = unsafePerformPrimIO (
218 re_compile_pattern rexp mode case_insensitive)
223 = unsafePerformPrimIO (re_search pat str 0 (_lengthPS str) True)
227 Just matcher@(REmatch arr before match after lst) ->
230 prefix = _chunkPS str before
232 = if global && (st /= en) then
233 search (_dropPS' en str)
238 replace matcher repl str,
246 replace (REmatch arr before@(_,b_end) match after lst)
249 = _concatPS (reverse acc) -- ToDo: write a `reversed' version of concatPS
253 acc = replace' [] replacement False
255 single :: Char -> _PackedString
256 single x = _consPS x _nilPS
258 replace' :: [_PackedString]
262 replace' acc repl escaped
263 = if (_nullPS repl) then
268 x# = case x of { C# c -> c }
276 replace' ((single x):acc) xs (not escaped)
278 if (not escaped) then
283 (num,xs_num) = getNumber ((ord x') - ord '0') xs'
285 if (isDigit x') && (num<=b) then
286 replace' ((_chunkPS str ith_ival):acc) xs_num escaped
287 else if x' == '&' then
288 replace' ((_chunkPS str match):acc) xs' escaped
289 else if x' == '+' then
290 replace' ((_chunkPS str lst):acc) xs' escaped
291 else if x' == '`' then
292 replace' ((_chunkPS str (0,b_end)):acc) xs' escaped
293 else if x' == '\'' then
294 replace' ((_chunkPS str after):acc) xs' escaped
296 replace' acc xs escaped
298 replace' ((single x):acc) xs False
303 replace' ((single '\n'):acc)
305 replace' ((single '\f'):acc)
306 'r'# -> -- carriage return
307 replace' ((single '\r'):acc)
308 't'# -> -- (horiz) tab
309 replace' ((single '\t'):acc)
310 'v'# -> -- vertical tab
311 replace' ((single '\v'):acc)
312 'a'# -> -- alarm bell
313 replace' ((single '\a'):acc)
315 replace' ((single '\033'):acc)
317 replace' ((single x):acc)) xs False
319 replace' ((single x):acc) xs False
322 getNumber :: Int -> _PackedString -> (Int,_PackedString)
332 getNumber (acc*10+(ord x - ord '0')) xs
338 Just like substPS, but no prefix and suffix.
342 replacePS :: _PackedString -- reg. exp
343 -> _PackedString -- replacement
345 -> _PackedString -- string
353 global = 'g' `elem` flags
354 case_insensitive = 'i' `elem` flags
355 mode = 's' `elem` flags -- single-line mode
356 pat = unsafePerformPrimIO (
357 re_compile_pattern rexp mode case_insensitive)
362 = unsafePerformPrimIO (re_search pat str 0 (_lengthPS str) True)
366 Just matcher@(REmatch arr before match after lst) ->
367 replace matcher repl str
371 Picking matched groups out of string
375 getMatchesNo :: REmatch
377 getMatchesNo (REmatch arr _ _ _ _)
380 getMatchedGroup :: REmatch
384 getMatchedGroup (REmatch arr bef mtch after lst) nth str
386 (1,grps) = bounds arr
388 if (nth >= 1) && (nth <= grps) then
389 _chunkPS str (arr!nth)
391 error "getMatchedGroup: group out of range"
393 getWholeMatch :: REmatch
396 getWholeMatch (REmatch _ _ mtch _ _) str
399 getLastMatch :: REmatch
402 getLastMatch (REmatch _ _ _ _ lst) str
405 getAfterMatch :: REmatch
408 getAfterMatch (REmatch _ _ _ aft _) str
414 More or less straight translation of a brute-force string matching
415 function written in C. (Sedgewick ch. 18)
417 This is intended to provide much the same facilities as index/rindex in perl.
422 findPS :: _PackedString
431 | j>=m || i>=n = if j==m then (Just (i-m)) else Nothing
436 = if j<m && i<n && (_indexPS str i /= _indexPS substr j) then
443 rfindPS :: _PackedString
448 m = _lengthPS substr - 1
449 n = _lengthPS str - 1
452 | j<0 || i<0 = if j<0 then (Just (i+1)) else Nothing
457 = if j>=0 && i>=0 && (_indexPS str i /= _indexPS substr j) then
458 inner_loop (i+(m-j)-1) m
469 chopPS :: _PackedString -> _PackedString
470 chopPS str = if _nullPS str then
473 _chunkPS str (0,_lengthPS str-1)
477 Tries to match as much as possible of strA starting from the beginning of strB
478 (handy when matching fancy literals in parsers)
481 matchPrefixPS :: _PackedString
484 matchPrefixPS pref str
485 = matchPrefixPS' pref str 0
487 matchPrefixPS' pref str n
488 = if (_nullPS pref) || (_nullPS str) then
490 else if (_headPS pref) == (_headPS str) then
491 matchPrefixPS' (_tailPS pref) (_tailPS str) (n+1)