------------------------------------------------------------------------------
--- |
--- Module : Text.ParserCombinators.Parsec.Token
--- Copyright : (c) Daan Leijen 1999-2001
--- License : BSD-style (see the file libraries/base/LICENSE)
---
--- Maintainer : daan@cs.uu.nl
--- Stability : provisional
--- Portability : non-portable (uses existentially quantified data constructors)
---
--- A helper module to parse lexical elements (tokens).
---
------------------------------------------------------------------------------
-
-module Text.ParserCombinators.Parsec.Token
- ( LanguageDef (..)
- , TokenParser (..)
- , makeTokenParser
- ) where
-
-import Data.Char (isAlpha,toLower,toUpper,isSpace,digitToInt)
-import Data.List (nub,sort)
-import Text.ParserCombinators.Parsec
-
-
------------------------------------------------------------
--- Language Definition
------------------------------------------------------------
-data LanguageDef st
- = LanguageDef
- { commentStart :: String
- , commentEnd :: String
- , commentLine :: String
- , nestedComments :: Bool
- , identStart :: CharParser st Char
- , identLetter :: CharParser st Char
- , opStart :: CharParser st Char
- , opLetter :: CharParser st Char
- , reservedNames :: [String]
- , reservedOpNames:: [String]
- , caseSensitive :: Bool
- }
-
------------------------------------------------------------
--- A first class module: TokenParser
------------------------------------------------------------
-data TokenParser st
- = TokenParser{ identifier :: CharParser st String
- , reserved :: String -> CharParser st ()
- , operator :: CharParser st String
- , reservedOp :: String -> CharParser st ()
-
- , charLiteral :: CharParser st Char
- , stringLiteral :: CharParser st String
- , natural :: CharParser st Integer
- , integer :: CharParser st Integer
- , float :: CharParser st Double
- , naturalOrFloat :: CharParser st (Either Integer Double)
- , decimal :: CharParser st Integer
- , hexadecimal :: CharParser st Integer
- , octal :: CharParser st Integer
-
- , symbol :: String -> CharParser st String
- , lexeme :: forall a. CharParser st a -> CharParser st a
- , whiteSpace :: CharParser st ()
-
- , parens :: forall a. CharParser st a -> CharParser st a
- , braces :: forall a. CharParser st a -> CharParser st a
- , angles :: forall a. CharParser st a -> CharParser st a
- , brackets :: forall a. CharParser st a -> CharParser st a
- -- "squares" is deprecated
- , squares :: forall a. CharParser st a -> CharParser st a
-
- , semi :: CharParser st String
- , comma :: CharParser st String
- , colon :: CharParser st String
- , dot :: CharParser st String
- , semiSep :: forall a . CharParser st a -> CharParser st [a]
- , semiSep1 :: forall a . CharParser st a -> CharParser st [a]
- , commaSep :: forall a . CharParser st a -> CharParser st [a]
- , commaSep1 :: forall a . CharParser st a -> CharParser st [a]
- }
-
------------------------------------------------------------
--- Given a LanguageDef, create a token parser.
------------------------------------------------------------
-makeTokenParser :: LanguageDef st -> TokenParser st
-makeTokenParser languageDef
- = TokenParser{ identifier = identifier
- , reserved = reserved
- , operator = operator
- , reservedOp = reservedOp
-
- , charLiteral = charLiteral
- , stringLiteral = stringLiteral
- , natural = natural
- , integer = integer
- , float = float
- , naturalOrFloat = naturalOrFloat
- , decimal = decimal
- , hexadecimal = hexadecimal
- , octal = octal
-
- , symbol = symbol
- , lexeme = lexeme
- , whiteSpace = whiteSpace
-
- , parens = parens
- , braces = braces
- , angles = angles
- , brackets = brackets
- , squares = brackets
- , semi = semi
- , comma = comma
- , colon = colon
- , dot = dot
- , semiSep = semiSep
- , semiSep1 = semiSep1
- , commaSep = commaSep
- , commaSep1 = commaSep1
- }
- where
-
- -----------------------------------------------------------
- -- Bracketing
- -----------------------------------------------------------
- parens p = between (symbol "(") (symbol ")") p
- braces p = between (symbol "{") (symbol "}") p
- angles p = between (symbol "<") (symbol ">") p
- brackets p = between (symbol "[") (symbol "]") p
-
- semi = symbol ";"
- comma = symbol ","
- dot = symbol "."
- colon = symbol ":"
-
- commaSep p = sepBy p comma
- semiSep p = sepBy p semi
-
- commaSep1 p = sepBy1 p comma
- semiSep1 p = sepBy1 p semi
-
-
- -----------------------------------------------------------
- -- Chars & Strings
- -----------------------------------------------------------
- -- charLiteral :: CharParser st Char
- charLiteral = lexeme (between (char '\'')
- (char '\'' <?> "end of character")
- characterChar )
- <?> "character"
-
- characterChar = charLetter <|> charEscape
- <?> "literal character"
-
- charEscape = do{ char '\\'; escapeCode }
- charLetter = satisfy (\c -> (c /= '\'') && (c /= '\\') && (c > '\026'))
-
-
-
- -- stringLiteral :: CharParser st String
- stringLiteral = lexeme (
- do{ str <- between (char '"')
- (char '"' <?> "end of string")
- (many stringChar)
- ; return (foldr (maybe id (:)) "" str)
- }
- <?> "literal string")
-
- -- stringChar :: CharParser st (Maybe Char)
- stringChar = do{ c <- stringLetter; return (Just c) }
- <|> stringEscape
- <?> "string character"
-
- stringLetter = satisfy (\c -> (c /= '"') && (c /= '\\') && (c > '\026'))
-
- stringEscape = do{ char '\\'
- ; do{ escapeGap ; return Nothing }
- <|> do{ escapeEmpty; return Nothing }
- <|> do{ esc <- escapeCode; return (Just esc) }
- }
-
- escapeEmpty = char '&'
- escapeGap = do{ many1 space
- ; char '\\' <?> "end of string gap"
- }
-
-
-
- -- escape codes
- escapeCode = charEsc <|> charNum <|> charAscii <|> charControl
- <?> "escape code"
-
- -- charControl :: CharParser st Char
- charControl = do{ char '^'
- ; code <- upper
- ; return (toEnum (fromEnum code - fromEnum 'A'))
- }
-
- -- charNum :: CharParser st Char
- charNum = do{ code <- decimal
- <|> do{ char 'o'; number 8 octDigit }
- <|> do{ char 'x'; number 16 hexDigit }
- ; return (toEnum (fromInteger code))
- }
-
- charEsc = choice (map parseEsc escMap)
- where
- parseEsc (c,code) = do{ char c; return code }
-
- charAscii = choice (map parseAscii asciiMap)
- where
- parseAscii (asc,code) = try (do{ string asc; return code })
-
-
- -- escape code tables
- escMap = zip ("abfnrtv\\\"\'") ("\a\b\f\n\r\t\v\\\"\'")
- asciiMap = zip (ascii3codes ++ ascii2codes) (ascii3 ++ ascii2)
-
- ascii2codes = ["BS","HT","LF","VT","FF","CR","SO","SI","EM",
- "FS","GS","RS","US","SP"]
- ascii3codes = ["NUL","SOH","STX","ETX","EOT","ENQ","ACK","BEL",
- "DLE","DC1","DC2","DC3","DC4","NAK","SYN","ETB",
- "CAN","SUB","ESC","DEL"]
-
- ascii2 = ['\BS','\HT','\LF','\VT','\FF','\CR','\SO','\SI',
- '\EM','\FS','\GS','\RS','\US','\SP']
- ascii3 = ['\NUL','\SOH','\STX','\ETX','\EOT','\ENQ','\ACK',
- '\BEL','\DLE','\DC1','\DC2','\DC3','\DC4','\NAK',
- '\SYN','\ETB','\CAN','\SUB','\ESC','\DEL']
-
-
- -----------------------------------------------------------
- -- Numbers
- -----------------------------------------------------------
- -- naturalOrFloat :: CharParser st (Either Integer Double)
- naturalOrFloat = lexeme (natFloat) <?> "number"
-
- float = lexeme floating <?> "float"
- integer = lexeme int <?> "integer"
- natural = lexeme nat <?> "natural"
-
-
- -- floats
- floating = do{ n <- decimal
- ; fractExponent n
- }
-
-
- natFloat = do{ char '0'
- ; zeroNumFloat
- }
- <|> decimalFloat
-
- zeroNumFloat = do{ n <- hexadecimal <|> octal
- ; return (Left n)
- }
- <|> decimalFloat
- <|> fractFloat 0
- <|> return (Left 0)
-
- decimalFloat = do{ n <- decimal
- ; option (Left n)
- (fractFloat n)
- }
-
- fractFloat n = do{ f <- fractExponent n
- ; return (Right f)
- }
-
- fractExponent n = do{ fract <- fraction
- ; expo <- option 1.0 exponent'
- ; return ((fromInteger n + fract)*expo)
- }
- <|>
- do{ expo <- exponent'
- ; return ((fromInteger n)*expo)
- }
-
- fraction = do{ char '.'
- ; digits <- many1 digit <?> "fraction"
- ; return (foldr op 0.0 digits)
- }
- <?> "fraction"
- where
- op d f = (f + fromIntegral (digitToInt d))/10.0
-
- exponent' = do{ oneOf "eE"
- ; f <- sign
- ; e <- decimal <?> "exponent"
- ; return (power (f e))
- }
- <?> "exponent"
- where
- power e | e < 0 = 1.0/power(-e)
- | otherwise = fromInteger (10^e)
-
-
- -- integers and naturals
- int = do{ f <- lexeme sign
- ; n <- nat
- ; return (f n)
- }
-
- -- sign :: CharParser st (Integer -> Integer)
- sign = (char '-' >> return negate)
- <|> (char '+' >> return id)
- <|> return id
-
- nat = zeroNumber <|> decimal
-
- zeroNumber = do{ char '0'
- ; hexadecimal <|> octal <|> decimal <|> return 0
- }
- <?> ""
-
- decimal = number 10 digit
- hexadecimal = do{ oneOf "xX"; number 16 hexDigit }
- octal = do{ oneOf "oO"; number 8 octDigit }
-
- -- number :: Integer -> CharParser st Char -> CharParser st Integer
- number base baseDigit
- = do{ digits <- many1 baseDigit
- ; let n = foldl (\x d -> base*x + toInteger (digitToInt d)) 0 digits
- ; seq n (return n)
- }
-
- -----------------------------------------------------------
- -- Operators & reserved ops
- -----------------------------------------------------------
- reservedOp name =
- lexeme $ try $
- do{ string name
- ; notFollowedBy (opLetter languageDef) <?> ("end of " ++ show name)
- }
-
- operator =
- lexeme $ try $
- do{ name <- oper
- ; if (isReservedOp name)
- then unexpected ("reserved operator " ++ show name)
- else return name
- }
-
- oper =
- do{ c <- (opStart languageDef)
- ; cs <- many (opLetter languageDef)
- ; return (c:cs)
- }
- <?> "operator"
-
- isReservedOp name =
- isReserved (sort (reservedOpNames languageDef)) name
-
-
- -----------------------------------------------------------
- -- Identifiers & Reserved words
- -----------------------------------------------------------
- reserved name =
- lexeme $ try $
- do{ caseString name
- ; notFollowedBy (identLetter languageDef) <?> ("end of " ++ show name)
- }
-
- caseString name
- | caseSensitive languageDef = string name
- | otherwise = do{ walk name; return name }
- where
- walk [] = return ()
- walk (c:cs) = do{ caseChar c <?> msg; walk cs }
-
- caseChar c | isAlpha c = char (toLower c) <|> char (toUpper c)
- | otherwise = char c
-
- msg = show name
-
-
- identifier =
- lexeme $ try $
- do{ name <- ident
- ; if (isReservedName name)
- then unexpected ("reserved word " ++ show name)
- else return name
- }
-
-
- ident
- = do{ c <- identStart languageDef
- ; cs <- many (identLetter languageDef)
- ; return (c:cs)
- }
- <?> "identifier"
-
- isReservedName name
- = isReserved theReservedNames caseName
- where
- caseName | caseSensitive languageDef = name
- | otherwise = map toLower name
-
-
- isReserved names name
- = scan names
- where
- scan [] = False
- scan (r:rs) = case (compare r name) of
- LT -> scan rs
- EQ -> True
- GT -> False
-
- theReservedNames
- | caseSensitive languageDef = sortedNames
- | otherwise = map (map toLower) sortedNames
- where
- sortedNames = sort (reservedNames languageDef)
-
-
-
- -----------------------------------------------------------
- -- White space & symbols
- -----------------------------------------------------------
- symbol name
- = lexeme (string name)
-
- lexeme p
- = do{ x <- p; whiteSpace; return x }
-
-
- --whiteSpace
- whiteSpace
- | noLine && noMulti = skipMany (simpleSpace <?> "")
- | noLine = skipMany (simpleSpace <|> multiLineComment <?> "")
- | noMulti = skipMany (simpleSpace <|> oneLineComment <?> "")
- | otherwise = skipMany (simpleSpace <|> oneLineComment <|> multiLineComment <?> "")
- where
- noLine = null (commentLine languageDef)
- noMulti = null (commentStart languageDef)
-
-
- simpleSpace =
- skipMany1 (satisfy isSpace)
-
- oneLineComment =
- do{ try (string (commentLine languageDef))
- ; skipMany (satisfy (/= '\n'))
- ; return ()
- }
-
- multiLineComment =
- do { try (string (commentStart languageDef))
- ; inComment
- }
-
- inComment
- | nestedComments languageDef = inCommentMulti
- | otherwise = inCommentSingle
-
- inCommentMulti
- = do{ try (string (commentEnd languageDef)) ; return () }
- <|> do{ multiLineComment ; inCommentMulti }
- <|> do{ skipMany1 (noneOf startEnd) ; inCommentMulti }
- <|> do{ oneOf startEnd ; inCommentMulti }
- <?> "end of comment"
- where
- startEnd = nub (commentEnd languageDef ++ commentStart languageDef)
-
- inCommentSingle
- = do{ try (string (commentEnd languageDef)); return () }
- <|> do{ skipMany1 (noneOf startEnd) ; inCommentSingle }
- <|> do{ oneOf startEnd ; inCommentSingle }
- <?> "end of comment"
- where
- startEnd = nub (commentEnd languageDef ++ commentStart languageDef)
-