--- /dev/null
+\section[match]{PackedString functions for matching}
+
+This module provides regular expression matching and substitution
+at the PackedString level. It is built on top of the GNU Regex
+library modified to handle perl regular expression syntax.
+For a complete description of the perl syntax, do `man perlre`
+or have a gander in (Programming|Learning) Perl. Here's
+a short summary:
+
+^ matches the beginning of line
+$ matches end of line
+\b matches word boundary
+\B matches non-word boundary
+\w matches a word(alpha-numeric) character
+\W matches a non-word character
+\d matches a digit
+\D matches a non-digit
+\s matches whitespace
+\S matches non-whitespace
+\A matches beginning of buffer
+\Z matches end-of-buffer
+. matches any (bar newline in single-line mode)
++ matches 1 or more times
+* matches 0 or more times
+? matches 0 or 1
+{n,m} matches >=n and <=m atoms
+{n,} matches at least n times
+{n} matches n times
+[..] matches any character member of char class.
+(..) if pattern inside parens match, then the ith group is bound
+ to the matched string
+\digit matches whatever the ith group matched.
+
+Backslashed letters
+\n newline
+\r carriage return
+\t tab
+\f formfeed
+\v vertical tab
+\a alarm bell
+\e escape
+
+
+\begin{code}
+module MatchPS
+
+ (
+ matchPS,
+ searchPS,
+ substPS,
+ replacePS,
+
+ match2PS,
+ search2PS,
+
+ getMatchesNo,
+ getMatchedGroup,
+ getWholeMatch,
+ getLastMatch,
+ getAfterMatch,
+
+ findPS,
+ rfindPS,
+ chopPS,
+
+ matchPrefixPS,
+
+ REmatch(..)
+ ) where
+
+import PreludeGlaST
+
+import Regex
+
+\end{code}
+
+_tailPS and _dropPS in PS.lhs are not to my liking, use
+these instead.
+
+\begin{code}
+
+_dropPS' x str = _substrPS str x (_lengthPS str)
+
+_tailPS' x
+ = if _nullPS x then
+ error "_tailPS []"
+ else
+ _substrPS x 1 (_lengthPS x)
+
+
+\end{code}
+
+\subsection[ps-matching]{PackedString matching}
+
+Posix matching, returning an array of the the intervals that
+the individual groups matched within the string.
+
+\begin{code}
+
+matchPS :: _PackedString -- reg. exp
+ -> _PackedString -- string to match
+ -> [Char] -- flags
+ -> Maybe REmatch
+matchPS reg str flags
+ = let
+ insensitive = 'i' `elem` flags
+ mode = 's' `elem` flags
+ in
+ unsafePerformPrimIO (
+ re_compile_pattern reg mode insensitive `thenPrimIO` \ pat ->
+ re_match pat str 0 True)
+
+
+match2PS :: _PackedString -- reg. exp
+ -> _PackedString -- string1 to match
+ -> _PackedString -- string2 to match
+ -> [Char] -- flags
+ -> Maybe REmatch
+match2PS reg str1 str2 flags
+ = let
+ insensitive = 'i' `elem` flags
+ mode = 's' `elem` flags
+ len1 = _lengthPS str1
+ len2 = _lengthPS str2
+ in
+ unsafePerformPrimIO (
+ re_compile_pattern reg mode insensitive `thenPrimIO` \ pat ->
+ re_match2 pat str1 str2 0 (len1+len2) True)
+
+\end{code}
+
+PackedString front-end to searching with GNU Regex
+
+\begin{code}
+
+searchPS :: _PackedString -- reg. exp
+ -> _PackedString -- string to match
+ -> [Char] -- flags
+ -> Maybe REmatch
+searchPS reg str flags
+ = let
+ insensitive = 'i' `elem` flags
+ mode = 's' `elem` flags
+ in
+ unsafePerformPrimIO (
+ re_compile_pattern reg mode insensitive `thenPrimIO` \ pat ->
+ re_search pat str
+ 0
+ (_lengthPS str)
+ True)
+
+
+
+search2PS :: _PackedString -- reg. exp
+ -> _PackedString -- string to match
+ -> _PackedString -- string to match
+ -> [Char] -- flags
+ -> Maybe REmatch
+search2PS reg str1 str2 flags
+ = let
+ insensitive = 'i' `elem` flags
+ mode = 's' `elem` flags
+ len1 = _lengthPS str1
+ len2 = _lengthPS str2
+ len = len1+len2
+ in
+ unsafePerformPrimIO (
+ re_compile_pattern reg mode insensitive `thenPrimIO` \ pat ->
+ re_search2 pat
+ str1
+ str2
+ 0
+ len
+ len
+ True)
+
+
+
+\end{code}
+
+@_substrPS s st end@ cuts out the chunk in \tr{s} between \tr{st} and \tr{end}, inclusive.
+The \tr{Regex} registers represent substrings by storing the start and the end point plus
+one( st==end => empty string) , so we use @chunkPS@ instead.
+
+
+\begin{code}
+
+_chunkPS :: _PackedString
+ -> (Int,Int)
+ -> _PackedString
+_chunkPS str (st,end)
+ = if st==end then
+ _nilPS
+ else
+ _substrPS str st (max 0 (end-1))
+
+\end{code}
+
+Perl-like match and substitute
+
+\begin{code}
+
+substPS :: _PackedString -- reg. exp
+ -> _PackedString -- replacement
+ -> [Char] -- flags
+ -> _PackedString -- string
+ -> _PackedString
+substPS rexp
+ repl
+ flags
+ str
+ = search str
+ where
+ global = 'g' `elem` flags
+ case_insensitive = 'i' `elem` flags
+ mode = 's' `elem` flags -- single-line mode
+ pat = unsafePerformPrimIO (
+ re_compile_pattern rexp mode case_insensitive)
+
+ search str
+ = let
+ search_res
+ = unsafePerformPrimIO (re_search pat str 0 (_lengthPS str) True)
+ in
+ case search_res of
+ Nothing -> str
+ Just matcher@(REmatch arr before match after lst) ->
+ let
+ (st,en) = match
+ prefix = _chunkPS str before
+ suffix
+ = if global && (st /= en) then
+ search (_dropPS' en str)
+ else
+ _chunkPS str after
+ in
+ _concatPS [prefix,
+ replace matcher repl str,
+ suffix]
+
+
+replace :: REmatch
+ -> _PackedString
+ -> _PackedString
+ -> _PackedString
+replace (REmatch arr before@(_,b_end) match after lst)
+ replacement
+ str
+ = _concatPS (reverse acc) -- ToDo: write a `reversed' version of concatPS
+ where
+ (_,b) = bounds arr
+
+ acc = replace' [] replacement False
+
+ single :: Char -> _PackedString
+ single x = _consPS x _nilPS
+
+ replace' :: [_PackedString]
+ -> _PackedString
+ -> Bool
+ -> [_PackedString]
+ replace' acc repl escaped
+ = if (_nullPS repl) then
+ acc
+ else
+ let
+ x@(C# x#) = _headPS repl
+ xs = _tailPS' repl
+ in
+ case x# of
+ '\\'# ->
+ if escaped then
+ replace' acc xs True
+ else
+ replace' ((single x):acc) xs (not escaped)
+ '$'# ->
+ if (not escaped) then
+ let
+ x' = _headPS xs
+ xs' = _tailPS' xs
+ ith_ival = arr!num
+ (num,xs_num) = getNumber ((ord x') - ord '0') xs'
+ in
+ if (isDigit x') && (num<=b) then
+ replace' ((_chunkPS str ith_ival):acc) xs_num escaped
+ else if x' == '&' then
+ replace' ((_chunkPS str match):acc) xs' escaped
+ else if x' == '+' then
+ replace' ((_chunkPS str lst):acc) xs' escaped
+ else if x' == '`' then
+ replace' ((_chunkPS str (0,b_end)):acc) xs' escaped
+ else if x' == '\'' then
+ replace' ((_chunkPS str after):acc) xs' escaped
+ else -- ignore
+ replace' acc xs escaped
+ else
+ replace' ((single x):acc) xs False
+
+ _ -> if escaped then
+ (case x# of
+ 'n'# -> -- newline
+ replace' ((single '\n'):acc)
+ 'f'# -> -- formfeed
+ replace' ((single '\f'):acc)
+ 'r'# -> -- carriage return
+ replace' ((single '\r'):acc)
+ 't'# -> -- (horiz) tab
+ replace' ((single '\t'):acc)
+ 'v'# -> -- vertical tab
+ replace' ((single '\v'):acc)
+ 'a'# -> -- alarm bell
+ replace' ((single '\a'):acc)
+ 'e'# -> -- escape
+ replace' ((single '\033'):acc)
+ _ ->
+ replace' ((single x):acc)) xs False
+ else
+ replace' ((single x):acc) xs False
+
+
+getNumber :: Int -> _PackedString -> (Int,_PackedString)
+getNumber acc ps
+ = if _nullPS ps then
+ (acc,ps)
+ else
+ let
+ x = _headPS ps
+ xs = _tailPS ps
+ in
+ if (isDigit x) then
+ getNumber (acc*10+(ord x - ord '0')) xs
+ else
+ (acc,ps)
+
+\end{code}
+
+Just like substPS, but no prefix and suffix.
+
+\begin{code}
+
+replacePS :: _PackedString -- reg. exp
+ -> _PackedString -- replacement
+ -> [Char] -- flags
+ -> _PackedString -- string
+ -> _PackedString
+replacePS rexp
+ repl
+ flags
+ str
+ = search str
+ where
+ global = 'g' `elem` flags
+ case_insensitive = 'i' `elem` flags
+ mode = 's' `elem` flags -- single-line mode
+ pat = unsafePerformPrimIO (
+ re_compile_pattern rexp mode case_insensitive)
+
+ search str
+ = let
+ search_res
+ = unsafePerformPrimIO (re_search pat str 0 (_lengthPS str) True)
+ in
+ case search_res of
+ Nothing -> str
+ Just matcher@(REmatch arr before match after lst) ->
+ replace matcher repl str
+
+\end{code}
+
+Picking matched groups out of string
+
+\begin{code}
+
+getMatchesNo :: REmatch
+ -> Int
+getMatchesNo (REmatch arr _ _ _ _)
+ = snd (bounds arr)
+
+getMatchedGroup :: REmatch
+ -> Int
+ -> _PackedString
+ -> _PackedString
+getMatchedGroup (REmatch arr bef mtch after lst) nth str
+ = let
+ (1,grps) = bounds arr
+ in
+ if (nth >= 1) && (nth <= grps) then
+ _chunkPS str (arr!nth)
+ else
+ error "getMatchedGroup: group out of range"
+
+getWholeMatch :: REmatch
+ -> _PackedString
+ -> _PackedString
+getWholeMatch (REmatch _ _ mtch _ _) str
+ = _chunkPS str mtch
+
+getLastMatch :: REmatch
+ -> _PackedString
+ -> _PackedString
+getLastMatch (REmatch _ _ _ _ lst) str
+ = _chunkPS str lst
+
+getAfterMatch :: REmatch
+ -> _PackedString
+ -> _PackedString
+getAfterMatch (REmatch _ _ _ aft _) str
+ = _chunkPS str aft
+
+\end{code}
+
+
+More or less straight translation of a brute-force string matching
+function written in C. (Sedgewick ch. 18)
+
+This is intended to provide much the same facilities as index/rindex in perl.
+
+\begin{code}
+
+
+findPS :: _PackedString
+ -> _PackedString
+ -> Maybe Int
+findPS str substr
+ = let
+ m = _lengthPS substr
+ n = _lengthPS str
+
+ loop i j
+ | j>=m || i>=n = if j==m then (Just (i-m)) else Nothing
+ | otherwise
+ = inner_loop i j
+
+ inner_loop i j
+ = if j<m && i<n && (_indexPS str i /= _indexPS substr j) then
+ inner_loop (i-j+1) 0
+ else
+ loop (i+1) (j+1)
+ in
+ loop 0 0
+
+rfindPS :: _PackedString
+ -> _PackedString
+ -> Maybe Int
+rfindPS str substr
+ = let
+ m = _lengthPS substr - 1
+ n = _lengthPS str - 1
+
+ loop i j
+ | j<0 || i<0 = if j<0 then (Just (i+1)) else Nothing
+ | otherwise
+ = inner_loop i j
+
+ inner_loop i j
+ = if j>=0 && i>=0 && (_indexPS str i /= _indexPS substr j) then
+ inner_loop (i+(m-j)-1) m
+ else
+ loop (i-1) (j-1)
+ in
+ loop n m
+
+
+\end{code}
+
+\begin{code}
+
+chopPS :: _PackedString -> _PackedString
+chopPS str = if _nullPS str then
+ _nilPS
+ else
+ _chunkPS str (0,_lengthPS str-1)
+
+\end{code}
+
+Tries to match as much as possible of strA starting from the beginning of strB
+(handy when matching fancy literals in parsers)
+
+\begin{code}
+matchPrefixPS :: _PackedString
+ -> _PackedString
+ -> Int
+matchPrefixPS pref str
+ = matchPrefixPS' pref str 0
+ where
+ matchPrefixPS' pref str n
+ = if (_nullPS pref) || (_nullPS str) then
+ n
+ else if (_headPS pref) == (_headPS str) then
+ matchPrefixPS' (_tailPS pref) (_tailPS str) (n+1)
+ else
+ n
+
+\end{code}