1 \section[match]{PackedString functions for matching}
3 This module provides regular expression matching and substitution
4 at the PackedString level. It is built on top of the GNU Regex
5 library modified to handle perl regular expression syntax.
6 For a complete description of the perl syntax, do `man perlre`
7 or have a gander in (Programming|Learning) Perl. Here's
10 ^ matches the beginning of line
12 \b matches word boundary
13 \B matches non-word boundary
14 \w matches a word (alpha-numeric) character
15 \W matches a non-word character
17 \D matches a non-digit
19 \S matches non-whitespace
20 \A matches beginning of buffer
21 \Z matches end-of-buffer
22 . matches any (bar newline in single-line mode)
23 + matches 1 or more times
24 * matches 0 or more times
26 {n,m} matches >=n and <=m atoms
27 {n,} matches at least n times
29 [..] matches any character member of char class.
30 (..) if the pattern inside the parens matches, then the ith group is bound
32 \digit matches whatever the ith group matched.
75 import Core -- alas ...
79 _tailPS and _dropPS in PS.lhs are not to my liking, use
-- Local replacement for _dropPS: skips the first x characters of str by
-- taking the substring that starts at index x, with _lengthPS str passed
-- as the end bound to _substrPS.
84 _dropPS' x str = _substrPS str x (_lengthPS str)
90 _substrPS x 1 (_lengthPS x)
95 \subsection[ps-matching]{PackedString matching}
97 Posix matching, returning an array of the intervals that
98 the individual groups matched within the string.
102 matchPS :: _PackedString -- reg. exp
103 -> _PackedString -- string to match
106 matchPS reg str flags
108 insensitive = 'i' `elem` flags
109 mode = 's' `elem` flags
111 unsafePerformPrimIO (
112 re_compile_pattern reg mode insensitive `thenPrimIO` \ pat ->
113 re_match pat str 0 True)
116 match2PS :: _PackedString -- reg. exp
117 -> _PackedString -- string1 to match
118 -> _PackedString -- string2 to match
121 match2PS reg str1 str2 flags
123 insensitive = 'i' `elem` flags
124 mode = 's' `elem` flags
125 len1 = _lengthPS str1
126 len2 = _lengthPS str2
128 unsafePerformPrimIO (
129 re_compile_pattern reg mode insensitive `thenPrimIO` \ pat ->
130 re_match2 pat str1 str2 0 (len1+len2) True)
134 PackedString front-end to searching with GNU Regex
138 searchPS :: _PackedString -- reg. exp
139 -> _PackedString -- string to match
142 searchPS reg str flags
144 insensitive = 'i' `elem` flags
145 mode = 's' `elem` flags
147 unsafePerformPrimIO (
148 re_compile_pattern reg mode insensitive `thenPrimIO` \ pat ->
156 search2PS :: _PackedString -- reg. exp
157 -> _PackedString -- string to match
158 -> _PackedString -- string to match
161 search2PS reg str1 str2 flags
163 insensitive = 'i' `elem` flags
164 mode = 's' `elem` flags
165 len1 = _lengthPS str1
166 len2 = _lengthPS str2
169 unsafePerformPrimIO (
170 re_compile_pattern reg mode insensitive `thenPrimIO` \ pat ->
183 @_substrPS s st end@ cuts out the chunk in \tr{s} between \tr{st} and \tr{end}, inclusive.
184 The \tr{Regex} registers represent substrings by storing the start and the end point plus
185 one (st==end => empty string), so we use @_chunkPS@ instead.
190 _chunkPS :: _PackedString
193 _chunkPS str (st,end)
197 _substrPS str st (max 0 (end-1))
201 Perl-like match and substitute
205 substPS :: _PackedString -- reg. exp
206 -> _PackedString -- replacement
208 -> _PackedString -- string
216 global = 'g' `elem` flags
217 case_insensitive = 'i' `elem` flags
218 mode = 's' `elem` flags -- single-line mode
219 pat = unsafePerformPrimIO (
220 re_compile_pattern rexp mode case_insensitive)
225 = unsafePerformPrimIO (re_search pat str 0 (_lengthPS str) True)
229 Just matcher@(REmatch arr before match after lst) ->
232 prefix = _chunkPS str before
234 = if global && (st /= en) then
235 search (_dropPS' en str)
240 replace matcher repl str,
248 replace (REmatch arr before@(_,b_end) match after lst)
251 = _concatPS (reverse acc) -- ToDo: write a `reversed' version of concatPS
255 acc = replace' [] replacement False
-- Wrap a single character as a packed string by consing it onto the
-- empty packed string.
257 single :: Char -> _PackedString
258 single x = _consPS x _nilPS
260 replace' :: [_PackedString]
264 replace' acc repl escaped
265 = if (_nullPS repl) then
270 x# = case x of { C# c -> c }
278 replace' ((single x):acc) xs (not escaped)
280 if (not escaped) then
285 (num,xs_num) = getNumber ((ord x') - ord '0') xs'
287 if (isDigit x') && (num<=b) then
288 replace' ((_chunkPS str ith_ival):acc) xs_num escaped
289 else if x' == '&' then
290 replace' ((_chunkPS str match):acc) xs' escaped
291 else if x' == '+' then
292 replace' ((_chunkPS str lst):acc) xs' escaped
293 else if x' == '`' then
294 replace' ((_chunkPS str (0,b_end)):acc) xs' escaped
295 else if x' == '\'' then
296 replace' ((_chunkPS str after):acc) xs' escaped
298 replace' acc xs escaped
300 replace' ((single x):acc) xs False
305 replace' ((single '\n'):acc)
307 replace' ((single '\f'):acc)
308 'r'# -> -- carriage return
309 replace' ((single '\r'):acc)
310 't'# -> -- (horiz) tab
311 replace' ((single '\t'):acc)
312 'v'# -> -- vertical tab
313 replace' ((single '\v'):acc)
314 'a'# -> -- alarm bell
315 replace' ((single '\a'):acc)
317 replace' ((single '\033'):acc)
319 replace' ((single x):acc)) xs False
321 replace' ((single x):acc) xs False
324 getNumber :: Int -> _PackedString -> (Int,_PackedString)
334 getNumber (acc*10+(ord x - ord '0')) xs
340 Just like substPS, but no prefix and suffix.
344 replacePS :: _PackedString -- reg. exp
345 -> _PackedString -- replacement
347 -> _PackedString -- string
355 global = 'g' `elem` flags
356 case_insensitive = 'i' `elem` flags
357 mode = 's' `elem` flags -- single-line mode
358 pat = unsafePerformPrimIO (
359 re_compile_pattern rexp mode case_insensitive)
364 = unsafePerformPrimIO (re_search pat str 0 (_lengthPS str) True)
368 Just matcher@(REmatch arr before match after lst) ->
369 replace matcher repl str
373 Picking matched groups out of string
377 getMatchesNo :: REmatch
379 getMatchesNo (REmatch arr _ _ _ _)
382 getMatchedGroup :: REmatch
386 getMatchedGroup (REmatch arr bef mtch after lst) nth str
388 (1,grps) = bounds arr
390 if (nth >= 1) && (nth <= grps) then
391 _chunkPS str (arr!nth)
393 error "getMatchedGroup: group out of range"
395 getWholeMatch :: REmatch
398 getWholeMatch (REmatch _ _ mtch _ _) str
401 getLastMatch :: REmatch
404 getLastMatch (REmatch _ _ _ _ lst) str
407 getAfterMatch :: REmatch
410 getAfterMatch (REmatch _ _ _ aft _) str
416 More or less straight translation of a brute-force string matching
417 function written in C. (Sedgewick ch. 18)
419 This is intended to provide much the same facilities as index/rindex in perl.
424 findPS :: _PackedString
433 | j>=m || i>=n = if j==m then (Just (i-m)) else Nothing
438 = if j<m && i<n && (_indexPS str i /= _indexPS substr j) then
445 rfindPS :: _PackedString
450 m = _lengthPS substr - 1
451 n = _lengthPS str - 1
454 | j<0 || i<0 = if j<0 then (Just (i+1)) else Nothing
459 = if j>=0 && i>=0 && (_indexPS str i /= _indexPS substr j) then
460 inner_loop (i+(m-j)-1) m
471 chopPS :: _PackedString -> _PackedString
472 chopPS str = if _nullPS str then
475 _chunkPS str (0,_lengthPS str-1)
479 Tries to match as much as possible of \tr{pref} starting from the beginning of \tr{str}
480 (handy when matching fancy literals in parsers)
483 matchPrefixPS :: _PackedString
486 matchPrefixPS pref str
487 = matchPrefixPS' pref str 0
489 matchPrefixPS' pref str n
490 = if (_nullPS pref) || (_nullPS str) then
492 else if (_headPS pref) == (_headPS str) then
493 matchPrefixPS' (_tailPS pref) (_tailPS str) (n+1)