1 \section[match]{PackedString functions for matching}
3 This module provides regular expression matching and substitution
4 at the PackedString level. It is built on top of the GNU Regex
5 library modified to handle perl regular expression syntax.
6 For a complete description of the perl syntax, do `man perlre`
7 or have a gander in (Programming|Learning) Perl. Here's
10 ^ matches the beginning of line
12 \b matches word boundary
13 \B matches non-word boundary
14 \w matches a word (alpha-numeric) character
15 \W matches a non-word character
17 \D matches a non-digit
19 \S matches non-whitespace
20 \A matches beginning of buffer
21 \Z matches end-of-buffer
22 . matches any (bar newline in single-line mode)
23 + matches 1 or more times
24 * matches 0 or more times
26 {n,m} matches >=n and <=m atoms
27 {n,} matches at least n times
29 [..] matches any character member of char class.
30 (..) if the pattern inside the parens matches, then the ith group is bound
32 \digit matches whatever the ith group matched.
77 _tailPS and _dropPS in PS.lhs are not to my liking, use
-- _dropPS' x str: the part of str selected by _substrPS with start index x
-- and end index (_lengthPS str).
-- NOTE(review): presumably "drop the first x characters"; confirm how
-- _substrPS treats an end index equal to the string length.
82 _dropPS' x str = _substrPS str x (_lengthPS str)
88 _substrPS x 1 (_lengthPS x)
93 \subsection[ps-matching]{PackedString matching}
95 Posix matching, returning an array of the intervals that
96 the individual groups matched within the string.
100 matchPS :: _PackedString -- reg. exp
101 -> _PackedString -- string to match
104 matchPS reg str flags
106 insensitive = 'i' `elem` flags
107 mode = 's' `elem` flags
109 unsafePerformPrimIO (
110 re_compile_pattern reg mode insensitive `thenPrimIO` \ pat ->
111 re_match pat str 0 True)
114 match2PS :: _PackedString -- reg. exp
115 -> _PackedString -- string1 to match
116 -> _PackedString -- string2 to match
119 match2PS reg str1 str2 flags
121 insensitive = 'i' `elem` flags
122 mode = 's' `elem` flags
123 len1 = _lengthPS str1
124 len2 = _lengthPS str2
126 unsafePerformPrimIO (
127 re_compile_pattern reg mode insensitive `thenPrimIO` \ pat ->
128 re_match2 pat str1 str2 0 (len1+len2) True)
132 PackedString front-end to searching with GNU Regex
136 searchPS :: _PackedString -- reg. exp
137 -> _PackedString -- string to match
140 searchPS reg str flags
142 insensitive = 'i' `elem` flags
143 mode = 's' `elem` flags
145 unsafePerformPrimIO (
146 re_compile_pattern reg mode insensitive `thenPrimIO` \ pat ->
154 search2PS :: _PackedString -- reg. exp
155 -> _PackedString -- string to match
156 -> _PackedString -- string to match
159 search2PS reg str1 str2 flags
161 insensitive = 'i' `elem` flags
162 mode = 's' `elem` flags
163 len1 = _lengthPS str1
164 len2 = _lengthPS str2
167 unsafePerformPrimIO (
168 re_compile_pattern reg mode insensitive `thenPrimIO` \ pat ->
181 @_substrPS s st end@ cuts out the chunk in \tr{s} between \tr{st} and \tr{end}, inclusive.
182 The \tr{Regex} registers represent substrings by storing the start and the end point plus
183 one (st==end => empty string), so we use @_chunkPS@ instead.
188 _chunkPS :: _PackedString
191 _chunkPS str (st,end)
195 _substrPS str st (max 0 (end-1))
199 Perl-like match and substitute
203 substPS :: _PackedString -- reg. exp
204 -> _PackedString -- replacement
206 -> _PackedString -- string
214 global = 'g' `elem` flags
215 case_insensitive = 'i' `elem` flags
216 mode = 's' `elem` flags -- single-line mode
217 pat = unsafePerformPrimIO (
218 re_compile_pattern rexp mode case_insensitive)
223 = unsafePerformPrimIO (re_search pat str 0 (_lengthPS str) True)
227 Just matcher@(REmatch arr before match after lst) ->
230 prefix = _chunkPS str before
232 = if global && (st /= en) then
233 search (_dropPS' en str)
238 replace matcher repl str,
246 replace (REmatch arr before@(_,b_end) match after lst)
249 = _concatPS (reverse acc) -- ToDo: write a `reversed' version of concatPS
253 acc = replace' [] replacement False
-- single x: a packed string built by consing the character x onto _nilPS
-- (presumably the empty packed string), i.e. a one-character string.
-- Used by replace' to push literal characters onto its accumulator.
255 single :: Char -> _PackedString
256 single x = _consPS x _nilPS
258 replace' :: [_PackedString]
262 replace' acc repl escaped
263 = if (_nullPS repl) then
267 x@(C# x#) = _headPS repl
275 replace' ((single x):acc) xs (not escaped)
277 if (not escaped) then
282 (num,xs_num) = getNumber ((ord x') - ord '0') xs'
284 if (isDigit x') && (num<=b) then
285 replace' ((_chunkPS str ith_ival):acc) xs_num escaped
286 else if x' == '&' then
287 replace' ((_chunkPS str match):acc) xs' escaped
288 else if x' == '+' then
289 replace' ((_chunkPS str lst):acc) xs' escaped
290 else if x' == '`' then
291 replace' ((_chunkPS str (0,b_end)):acc) xs' escaped
292 else if x' == '\'' then
293 replace' ((_chunkPS str after):acc) xs' escaped
295 replace' acc xs escaped
297 replace' ((single x):acc) xs False
302 replace' ((single '\n'):acc)
304 replace' ((single '\f'):acc)
305 'r'# -> -- carriage return
306 replace' ((single '\r'):acc)
307 't'# -> -- (horiz) tab
308 replace' ((single '\t'):acc)
309 'v'# -> -- vertical tab
310 replace' ((single '\v'):acc)
311 'a'# -> -- alarm bell
312 replace' ((single '\a'):acc)
314 replace' ((single '\033'):acc)
316 replace' ((single x):acc)) xs False
318 replace' ((single x):acc) xs False
321 getNumber :: Int -> _PackedString -> (Int,_PackedString)
331 getNumber (acc*10+(ord x - ord '0')) xs
337 Just like substPS, but no prefix and suffix.
341 replacePS :: _PackedString -- reg. exp
342 -> _PackedString -- replacement
344 -> _PackedString -- string
352 global = 'g' `elem` flags
353 case_insensitive = 'i' `elem` flags
354 mode = 's' `elem` flags -- single-line mode
355 pat = unsafePerformPrimIO (
356 re_compile_pattern rexp mode case_insensitive)
361 = unsafePerformPrimIO (re_search pat str 0 (_lengthPS str) True)
365 Just matcher@(REmatch arr before match after lst) ->
366 replace matcher repl str
370 Picking matched groups out of string
374 getMatchesNo :: REmatch
376 getMatchesNo (REmatch arr _ _ _ _)
379 getMatchedGroup :: REmatch
383 getMatchedGroup (REmatch arr bef mtch after lst) nth str
385 (1,grps) = bounds arr
387 if (nth >= 1) && (nth <= grps) then
388 _chunkPS str (arr!nth)
390 error "getMatchedGroup: group out of range"
392 getWholeMatch :: REmatch
395 getWholeMatch (REmatch _ _ mtch _ _) str
398 getLastMatch :: REmatch
401 getLastMatch (REmatch _ _ _ _ lst) str
404 getAfterMatch :: REmatch
407 getAfterMatch (REmatch _ _ _ aft _) str
413 More or less straight translation of a brute-force string matching
414 function written in C. (Sedgewick ch. 18)
416 This is intended to provide much the same facilities as index/rindex in perl.
421 findPS :: _PackedString
430 | j>=m || i>=n = if j==m then (Just (i-m)) else Nothing
435 = if j<m && i<n && (_indexPS str i /= _indexPS substr j) then
442 rfindPS :: _PackedString
447 m = _lengthPS substr - 1
448 n = _lengthPS str - 1
451 | j<0 || i<0 = if j<0 then (Just (i+1)) else Nothing
456 = if j>=0 && i>=0 && (_indexPS str i /= _indexPS substr j) then
457 inner_loop (i+(m-j)-1) m
468 chopPS :: _PackedString -> _PackedString
469 chopPS str = if _nullPS str then
472 _chunkPS str (0,_lengthPS str-1)
476 Tries to match as much as possible of strA starting from the beginning of strB
477 (handy when matching fancy literals in parsers)
480 matchPrefixPS :: _PackedString
483 matchPrefixPS pref str
484 = matchPrefixPS' pref str 0
486 matchPrefixPS' pref str n
487 = if (_nullPS pref) || (_nullPS str) then
489 else if (_headPS pref) == (_headPS str) then
490 matchPrefixPS' (_tailPS pref) (_tailPS str) (n+1)