From: ross
Date: Mon, 14 Mar 2005 18:02:49 +0000 (+0000)
Subject: [project @ 2005-03-14 18:02:48 by ross]
X-Git-Tag: arity-anal-branch-point~26
X-Git-Url: http://git.megacz.com/?p=ghc-base.git;a=commitdiff_plain;h=62de77e9a5d1f19ab333f1796437e1b16f78d23e

[project @ 2005-03-14 18:02:48 by ross]
move general categories and derived predicates to Data.Char
---

diff --git a/Data/Char.hs b/Data/Char.hs
index 2ec1e66..fa0e899 100644
--- a/Data/Char.hs
+++ b/Data/Char.hs
@@ -27,9 +27,11 @@ module Data.Char
     , isLower, isUpper, isAlpha, isAlphaNum, isPrint
     , isDigit, isOctDigit, isHexDigit
     , isAsciiUpper, isAsciiLower
+#ifndef __NHC__
     , isLetter, isMark, isNumber, isPunctuation, isSymbol, isSeparator
 
     , GeneralCategory(..), generalCategory
+#endif
 
     -- * Case conversion
     , toUpper, toLower, toTitle -- :: Char -> Char
@@ -52,10 +54,12 @@ module Data.Char
 
 #ifdef __GLASGOW_HASKELL__
 import GHC.Base
+import GHC.Real (fromIntegral)
 import GHC.Show
-import GHC.Read (readLitChar, lexLitChar)
+import GHC.Read (Read, readLitChar, lexLitChar)
 import GHC.Unicode
 import GHC.Num
+import GHC.Enum
 #endif
 
 #ifdef __HUGS__
@@ -78,3 +82,113 @@ digitToInt c
  | c >= 'a' && c <= 'f' = ord c - ord 'a' + 10
  | c >= 'A' && c <= 'F' = ord c - ord 'A' + 10
  | otherwise = error ("Char.digitToInt: not a digit " ++ show c) -- sigh
+
+#ifndef __GLASGOW_HASKELL__
+isAsciiUpper, isAsciiLower :: Char -> Bool
+isAsciiLower c = c >= 'a' && c <= 'z'
+isAsciiUpper c = c >= 'A' && c <= 'Z'
+#endif
+
+#ifndef __NHC__
+-- | Unicode General Categories (column 2 of the UnicodeData table)
+-- in the order they are listed in the Unicode standard.
+
+data GeneralCategory
+  = UppercaseLetter      -- ^ Lu: Letter, Uppercase
+  | LowercaseLetter      -- ^ Ll: Letter, Lowercase
+  | TitlecaseLetter      -- ^ Lt: Letter, Titlecase
+  | ModifierLetter       -- ^ Lm: Letter, Modifier
+  | OtherLetter          -- ^ Lo: Letter, Other
+  | NonSpacingMark       -- ^ Mn: Mark, Non-Spacing
+  | SpacingCombiningMark -- ^ Mc: Mark, Spacing Combining
+  | EnclosingMark        -- ^ Me: Mark, Enclosing
+  | DecimalNumber        -- ^ Nd: Number, Decimal
+  | LetterNumber         -- ^ Nl: Number, Letter
+  | OtherNumber          -- ^ No: Number, Other
+  | ConnectorPunctuation -- ^ Pc: Punctuation, Connector
+  | DashPunctuation      -- ^ Pd: Punctuation, Dash
+  | OpenPunctuation      -- ^ Ps: Punctuation, Open
+  | ClosePunctuation     -- ^ Pe: Punctuation, Close
+  | InitialQuote         -- ^ Pi: Punctuation, Initial quote
+  | FinalQuote           -- ^ Pf: Punctuation, Final quote
+  | OtherPunctuation     -- ^ Po: Punctuation, Other
+  | MathSymbol           -- ^ Sm: Symbol, Math
+  | CurrencySymbol       -- ^ Sc: Symbol, Currency
+  | ModifierSymbol       -- ^ Sk: Symbol, Modifier
+  | OtherSymbol          -- ^ So: Symbol, Other
+  | Space                -- ^ Zs: Separator, Space
+  | LineSeparator        -- ^ Zl: Separator, Line
+  | ParagraphSeparator   -- ^ Zp: Separator, Paragraph
+  | Control              -- ^ Cc: Other, Control
+  | Format               -- ^ Cf: Other, Format
+  | Surrogate            -- ^ Cs: Other, Surrogate
+  | PrivateUse           -- ^ Co: Other, Private Use
+  | NotAssigned          -- ^ Cn: Other, Not Assigned
+  deriving (Eq, Ord, Enum, Read, Show, Bounded)
+
+-- | Retrieves the general Unicode category of the character.
+generalCategory :: Char -> GeneralCategory
+#ifdef __GLASGOW_HASKELL__
+generalCategory c = toEnum (wgencat (fromIntegral (ord c)))
+#endif
+#ifdef __HUGS__
+generalCategory c = toEnum (primUniGenCat c)
+#endif
+
+-- derived character classifiers
+
+isLetter :: Char -> Bool
+isLetter c = case generalCategory c of
+    UppercaseLetter -> True
+    LowercaseLetter -> True
+    TitlecaseLetter -> True
+    ModifierLetter -> True
+    OtherLetter -> True
+    _ -> False
+
+isMark :: Char -> Bool
+isMark c = case generalCategory c of
+    NonSpacingMark -> True
+    SpacingCombiningMark -> True
+    EnclosingMark -> True
+    _ -> False
+
+isNumber :: Char -> Bool
+isNumber c = case generalCategory c of
+    DecimalNumber -> True
+    LetterNumber -> True
+    OtherNumber -> True
+    _ -> False
+
+isPunctuation :: Char -> Bool
+isPunctuation c = case generalCategory c of
+    ConnectorPunctuation -> True
+    DashPunctuation -> True
+    OpenPunctuation -> True
+    ClosePunctuation -> True
+    InitialQuote -> True
+    FinalQuote -> True
+    OtherPunctuation -> True
+    _ -> False
+
+isSymbol :: Char -> Bool
+isSymbol c = case generalCategory c of
+    MathSymbol -> True
+    CurrencySymbol -> True
+    ModifierSymbol -> True
+    OtherSymbol -> True
+    _ -> False
+
+isSeparator :: Char -> Bool
+isSeparator c = case generalCategory c of
+    Space -> True
+    LineSeparator -> True
+    ParagraphSeparator -> True
+    _ -> False
+#endif /* !__NHC__ */
+
+#ifdef __NHC__
+-- dummy implementation
+toTitle :: Char -> Char
+toTitle = toUpper
+#endif
diff --git a/GHC/Unicode.hs b/GHC/Unicode.hs
index 6a4732b..da74685 100644
--- a/GHC/Unicode.hs
+++ b/GHC/Unicode.hs
@@ -2,7 +2,7 @@
 {-# OPTIONS -#include "WCsubst.h" #-}
 -----------------------------------------------------------------------------
 -- |
--- Module : GHC.Unicde
+-- Module : GHC.Unicode
 -- Copyright : (c) The University of Glasgow, 2003
 -- License : see libraries/base/LICENSE
 --
@@ -17,20 +17,13 @@
 -----------------------------------------------------------------------------
 
 module GHC.Unicode (
-    GeneralCategory (..),
-    generalCategory,
     isAscii, isLatin1, isControl,
     isAsciiUpper, isAsciiLower,
     isPrint, isSpace, isUpper,
     isLower, isAlpha, isDigit,
     isOctDigit, isHexDigit, isAlphaNum,
     toUpper, toLower, toTitle,
-    isLetter, -- :: Char -> Bool
-    isMark, -- :: Char -> Bool
-    isNumber, -- :: Char -> Bool
-    isPunctuation, -- :: Char -> Bool
-    isSymbol, -- :: Char -> Bool
-    isSeparator, -- :: Char -> Bool
+    wgencat,
   ) where
 
 import GHC.Base
@@ -38,106 +31,9 @@ import GHC.Real (fromIntegral)
 import GHC.Int
 import GHC.Word
 import GHC.Num (fromInteger)
-import GHC.Read
-import GHC.Show
-import GHC.Enum
 
 #include "HsBaseConfig.h"
 
--- | Unicode General Categories (column 2 of the UnicodeData table)
--- in the order they are listed in the Unicode standard.
-
-data GeneralCategory
-  = UppercaseLetter      -- Lu Letter, Uppercase
-  | LowercaseLetter      -- Ll Letter, Lowercase
-  | TitlecaseLetter      -- Lt Letter, Titlecase
-  | ModifierLetter       -- Lm Letter, Modifier
-  | OtherLetter          -- Lo Letter, Other
-  | NonSpacingMark       -- Mn Mark, Non-Spacing
-  | SpacingCombiningMark -- Mc Mark, Spacing Combining
-  | EnclosingMark        -- Me Mark, Enclosing
-  | DecimalNumber        -- Nd Number, Decimal
-  | LetterNumber         -- Nl Number, Letter
-  | OtherNumber          -- No Number, Other
-  | ConnectorPunctuation -- Pc Punctuation, Connector
-  | DashPunctuation      -- Pd Punctuation, Dash
-  | OpenPunctuation      -- Ps Punctuation, Open
-  | ClosePunctuation     -- Pe Punctuation, Close
-  | InitialQuote         -- Pi Punctuation, Initial quote
-  | FinalQuote           -- Pf Punctuation, Final quote
-  | OtherPunctuation     -- Po Punctuation, Other
-  | MathSymbol           -- Sm Symbol, Math
-  | CurrencySymbol       -- Sc Symbol, Currency
-  | ModifierSymbol       -- Sk Symbol, Modifier
-  | OtherSymbol          -- So Symbol, Other
-  | Space                -- Zs Separator, Space
-  | LineSeparator        -- Zl Separator, Line
-  | ParagraphSeparator   -- Zp Separator, Paragraph
-  | Control              -- Cc Other, Control
-  | Format               -- Cf Other, Format
-  | Surrogate            -- Cs Other, Surrogate
-  | PrivateUse           -- Co Other, Private Use
-  | NotAssigned          -- Cn Other, Not Assigned
-  deriving (Eq, Ord, Enum, Read, Show, Bounded)
-
--- | Retrieves the general Unicode category of the character.
-generalCategory :: Char -> GeneralCategory
-generalCategory c = toEnum (wgencat (fromIntegral (ord c)))
-
--- ------------------------------------------------------------------------
--- These are copied from Hugs Unicode.hs
-
--- derived character classifiers
-
-isLetter :: Char -> Bool
-isLetter c = case generalCategory c of
-    UppercaseLetter -> True
-    LowercaseLetter -> True
-    TitlecaseLetter -> True
-    ModifierLetter -> True
-    OtherLetter -> True
-    _ -> False
-
-isMark :: Char -> Bool
-isMark c = case generalCategory c of
-    NonSpacingMark -> True
-    SpacingCombiningMark -> True
-    EnclosingMark -> True
-    _ -> False
-
-isNumber :: Char -> Bool
-isNumber c = case generalCategory c of
-    DecimalNumber -> True
-    LetterNumber -> True
-    OtherNumber -> True
-    _ -> False
-
-isPunctuation :: Char -> Bool
-isPunctuation c = case generalCategory c of
-    ConnectorPunctuation -> True
-    DashPunctuation -> True
-    OpenPunctuation -> True
-    ClosePunctuation -> True
-    InitialQuote -> True
-    FinalQuote -> True
-    OtherPunctuation -> True
-    _ -> False
-
-isSymbol :: Char -> Bool
-isSymbol c = case generalCategory c of
-    MathSymbol -> True
-    CurrencySymbol -> True
-    ModifierSymbol -> True
-    OtherSymbol -> True
-    _ -> False
-
-isSeparator :: Char -> Bool
-isSeparator c = case generalCategory c of
-    Space -> True
-    LineSeparator -> True
-    ParagraphSeparator -> True
-    _ -> False
-
 -- | Selects the first 128 characters of the Unicode character set,
 -- corresponding to the ASCII character set.
 isAscii :: Char -> Bool