X-Git-Url: http://git.megacz.com/?a=blobdiff_plain;f=Data%2FChar.hs;h=40052a760761b9e263efe5e030e900d251c2bb59;hb=HEAD;hp=e7d7b729e827768df4acb5ff71a764d289fa1503;hpb=746ef6a7fd71bb1e9ebe3cd107c5f9f79f3b7a68;p=ghc-base.git diff --git a/Data/Char.hs b/Data/Char.hs index e7d7b72..40052a7 100644 --- a/Data/Char.hs +++ b/Data/Char.hs @@ -1,50 +1,210 @@ -{-# OPTIONS -fno-implicit-prelude #-} +{-# LANGUAGE CPP, NoImplicitPrelude #-} + ----------------------------------------------------------------------------- -- | -- Module : Data.Char -- Copyright : (c) The University of Glasgow 2001 --- License : BSD-style (see the file libraries/core/LICENSE) +-- License : BSD-style (see the file libraries/base/LICENSE) -- -- Maintainer : libraries@haskell.org --- Stability : provisional +-- Stability : stable -- Portability : portable -- -- The Char type and associated operations. -- ----------------------------------------------------------------------------- -module Data.Char +module Data.Char ( Char - , isAscii, isLatin1, isControl - , isPrint, isSpace, isUpper - , isLower, isAlpha, isDigit - , isOctDigit, isHexDigit, isAlphaNum -- :: Char -> Bool + -- * Character classification + -- | Unicode characters are divided into letters, numbers, marks, + -- punctuation, symbols, separators (including spaces) and others + -- (including control characters). + , isControl, isSpace + , isLower, isUpper, isAlpha, isAlphaNum, isPrint + , isDigit, isOctDigit, isHexDigit + , isLetter, isMark, isNumber, isPunctuation, isSymbol, isSeparator + + -- ** Subranges + , isAscii, isLatin1 + , isAsciiUpper, isAsciiLower - , toUpper, toLower -- :: Char -> Char + -- ** Unicode general categories + , GeneralCategory(..), generalCategory + -- * Case conversion + , toUpper, toLower, toTitle -- :: Char -> Char + + -- * Single digit characters , digitToInt -- :: Char -> Int , intToDigit -- :: Int -> Char + -- * Numeric representations , ord -- :: Char -> Int , chr -- :: Int -> Char - , readLitChar -- :: ReadS Char - , showLitChar -- :: Char -> ShowS - , lexLitChar -- :: ReadS String - , String + -- * String representations + , showLitChar -- :: Char -> ShowS + , lexLitChar -- :: ReadS String + , readLitChar -- :: ReadS Char -- Implementation checked wrt. Haskell 98 lib report, 1/99. ) where #ifdef __GLASGOW_HASKELL__ import GHC.Base +import GHC.Arr (Ix) +import GHC.Real (fromIntegral) import GHC.Show -import GHC.Read (readLitChar, lexLitChar) +import GHC.Read (Read, readLitChar, lexLitChar) +import GHC.Unicode +import GHC.Num +import GHC.Enum +#endif + +#ifdef __HUGS__ +import Hugs.Prelude (Ix) +import Hugs.Char +#endif + +#ifdef __NHC__ +import Prelude +import Prelude(Char,String) +import Char +import Ix +import NHC.FFI (CInt) +foreign import ccall unsafe "WCsubst.h u_gencat" wgencat :: CInt -> CInt +#endif + +-- | Convert a single digit 'Char' to the corresponding 'Int'. +-- This function fails unless its argument satisfies 'isHexDigit', +-- but recognises both upper and lower-case hexadecimal digits +-- (i.e. @\'0\'@..@\'9\'@, @\'a\'@..@\'f\'@, @\'A\'@..@\'F\'@). +digitToInt :: Char -> Int +digitToInt c + | isDigit c = ord c - ord '0' + | c >= 'a' && c <= 'f' = ord c - ord 'a' + 10 + | c >= 'A' && c <= 'F' = ord c - ord 'A' + 10 + | otherwise = error ("Char.digitToInt: not a digit " ++ show c) -- sigh + +#ifndef __GLASGOW_HASKELL__ +isAsciiUpper, isAsciiLower :: Char -> Bool +isAsciiLower c = c >= 'a' && c <= 'z' +isAsciiUpper c = c >= 'A' && c <= 'Z' #endif +-- | Unicode General Categories (column 2 of the UnicodeData table) +-- in the order they are listed in the Unicode standard. + +data GeneralCategory + = UppercaseLetter -- ^ Lu: Letter, Uppercase + | LowercaseLetter -- ^ Ll: Letter, Lowercase + | TitlecaseLetter -- ^ Lt: Letter, Titlecase + | ModifierLetter -- ^ Lm: Letter, Modifier + | OtherLetter -- ^ Lo: Letter, Other + | NonSpacingMark -- ^ Mn: Mark, Non-Spacing + | SpacingCombiningMark -- ^ Mc: Mark, Spacing Combining + | EnclosingMark -- ^ Me: Mark, Enclosing + | DecimalNumber -- ^ Nd: Number, Decimal + | LetterNumber -- ^ Nl: Number, Letter + | OtherNumber -- ^ No: Number, Other + | ConnectorPunctuation -- ^ Pc: Punctuation, Connector + | DashPunctuation -- ^ Pd: Punctuation, Dash + | OpenPunctuation -- ^ Ps: Punctuation, Open + | ClosePunctuation -- ^ Pe: Punctuation, Close + | InitialQuote -- ^ Pi: Punctuation, Initial quote + | FinalQuote -- ^ Pf: Punctuation, Final quote + | OtherPunctuation -- ^ Po: Punctuation, Other + | MathSymbol -- ^ Sm: Symbol, Math + | CurrencySymbol -- ^ Sc: Symbol, Currency + | ModifierSymbol -- ^ Sk: Symbol, Modifier + | OtherSymbol -- ^ So: Symbol, Other + | Space -- ^ Zs: Separator, Space + | LineSeparator -- ^ Zl: Separator, Line + | ParagraphSeparator -- ^ Zp: Separator, Paragraph + | Control -- ^ Cc: Other, Control + | Format -- ^ Cf: Other, Format + | Surrogate -- ^ Cs: Other, Surrogate + | PrivateUse -- ^ Co: Other, Private Use + | NotAssigned -- ^ Cn: Other, Not Assigned + deriving (Eq, Ord, Enum, Read, Show, Bounded, Ix) + +-- | The Unicode general category of the character. +generalCategory :: Char -> GeneralCategory +#if defined(__GLASGOW_HASKELL__) || defined(__NHC__) +generalCategory c = toEnum $ fromIntegral $ wgencat $ fromIntegral $ ord c +#endif #ifdef __HUGS__ -isLatin1 c = True +generalCategory c = toEnum (primUniGenCat c) #endif +-- derived character classifiers + +-- | Selects alphabetic Unicode characters (lower-case, upper-case and +-- title-case letters, plus letters of caseless scripts and modifiers letters). +-- This function is equivalent to 'Data.Char.isAlpha'. +isLetter :: Char -> Bool +isLetter c = case generalCategory c of + UppercaseLetter -> True + LowercaseLetter -> True + TitlecaseLetter -> True + ModifierLetter -> True + OtherLetter -> True + _ -> False + +-- | Selects Unicode mark characters, e.g. accents and the like, which +-- combine with preceding letters. +isMark :: Char -> Bool +isMark c = case generalCategory c of + NonSpacingMark -> True + SpacingCombiningMark -> True + EnclosingMark -> True + _ -> False + +-- | Selects Unicode numeric characters, including digits from various +-- scripts, Roman numerals, etc. +isNumber :: Char -> Bool +isNumber c = case generalCategory c of + DecimalNumber -> True + LetterNumber -> True + OtherNumber -> True + _ -> False + +-- | Selects Unicode punctuation characters, including various kinds +-- of connectors, brackets and quotes. +isPunctuation :: Char -> Bool +isPunctuation c = case generalCategory c of + ConnectorPunctuation -> True + DashPunctuation -> True + OpenPunctuation -> True + ClosePunctuation -> True + InitialQuote -> True + FinalQuote -> True + OtherPunctuation -> True + _ -> False + +-- | Selects Unicode symbol characters, including mathematical and +-- currency symbols. +isSymbol :: Char -> Bool +isSymbol c = case generalCategory c of + MathSymbol -> True + CurrencySymbol -> True + ModifierSymbol -> True + OtherSymbol -> True + _ -> False + +-- | Selects Unicode space and separator characters. +isSeparator :: Char -> Bool +isSeparator c = case generalCategory c of + Space -> True + LineSeparator -> True + ParagraphSeparator -> True + _ -> False + +#ifdef __NHC__ +-- dummy implementation +toTitle :: Char -> Char +toTitle = toUpper +#endif