X-Git-Url: http://git.megacz.com/?a=blobdiff_plain;f=GHC%2FUnicode.hs;h=b34f677069d1f12fb6d6514192b0b33886747122;hb=41e8fba828acbae1751628af50849f5352b27873;hp=8aefad5f2eb1aaae32bced502c09f09aeba66bfa;hpb=1e076454862442966918e6b1b9095e43eab7c770;p=ghc-base.git diff --git a/GHC/Unicode.hs b/GHC/Unicode.hs index 8aefad5..b34f677 100644 --- a/GHC/Unicode.hs +++ b/GHC/Unicode.hs @@ -1,8 +1,10 @@ -{-# OPTIONS -fno-implicit-prelude #-} +{-# LANGUAGE CPP, NoImplicitPrelude, ForeignFunctionInterface #-} {-# OPTIONS -#include "WCsubst.h" #-} +{-# OPTIONS_HADDOCK hide #-} + ----------------------------------------------------------------------------- -- | --- Module : GHC.Unicde +-- Module : GHC.Unicode -- Copyright : (c) The University of Glasgow, 2003 -- License : see libraries/base/LICENSE -- @@ -16,140 +18,41 @@ -- ----------------------------------------------------------------------------- +-- #hide module GHC.Unicode ( - GeneralCategory (..), - generalCategory, isAscii, isLatin1, isControl, isAsciiUpper, isAsciiLower, isPrint, isSpace, isUpper, isLower, isAlpha, isDigit, isOctDigit, isHexDigit, isAlphaNum, toUpper, toLower, toTitle, - isLetter, -- :: Char -> Bool - isMark, -- :: Char -> Bool - isNumber, -- :: Char -> Bool - isPunctuation, -- :: Char -> Bool - isSymbol, -- :: Char -> Bool - isSeparator, -- :: Char -> Bool + wgencat, ) where import GHC.Base -import GHC.Real (fromIntegral) -import GHC.Int -import GHC.Word -import GHC.Num (fromInteger) -import GHC.Read -import GHC.Show -import GHC.Enum +import GHC.Real (fromIntegral) +import Foreign.C.Types (CInt) #include "HsBaseConfig.h" --- | Unicode General Categories (column 2 of the UnicodeData table) --- in the order they are listed in the Unicode standard. - -data GeneralCategory - = UppercaseLetter -- Lu Letter, Uppercase - | LowercaseLetter -- Ll Letter, Lowercase - | TitlecaseLetter -- Lt Letter, Titlecase - | ModifierLetter -- Lm Letter, Modifier - | OtherLetter -- Lo Letter, Other - | NonSpacingMark -- Mn Mark, Non-Spacing - | SpacingCombiningMark -- Mc Mark, Spacing Combining - | EnclosingMark -- Me Mark, Enclosing - | DecimalNumber -- Nd Number, Decimal - | LetterNumber -- Nl Number, Letter - | OtherNumber -- No Number, Other - | ConnectorPunctuation -- Pc Punctuation, Connector - | DashPunctuation -- Pd Punctuation, Dash - | OpenPunctuation -- Ps Punctuation, Open - | ClosePunctuation -- Pe Punctuation, Close - | InitialQuote -- Pi Punctuation, Initial quote - | FinalQuote -- Pf Punctuation, Final quote - | OtherPunctuation -- Po Punctuation, Other - | MathSymbol -- Sm Symbol, Math - | CurrencySymbol -- Sc Symbol, Currency - | ModifierSymbol -- Sk Symbol, Modifier - | OtherSymbol -- So Symbol, Other - | Space -- Zs Separator, Space - | LineSeparator -- Zl Separator, Line - | ParagraphSeparator -- Zp Separator, Paragraph - | Control -- Cc Other, Control - | Format -- Cf Other, Format - | Surrogate -- Cs Other, Surrogate - | PrivateUse -- Co Other, Private Use - | NotAssigned -- Cn Other, Not Assigned - deriving (Eq, Ord, Enum, Read, Show, Bounded) - --- | Retrieves the general Unicode category of the character. -generalCategory :: Char -> GeneralCategory -generalCategory c = toEnum (wgencat (fromIntegral (ord c))) - --- ------------------------------------------------------------------------ --- These are copied from Hugs Unicode.hs - --- derived character classifiers - -isLetter :: Char -> Bool -isLetter c = case generalCategory c of - UppercaseLetter -> True - LowercaseLetter -> True - TitlecaseLetter -> True - ModifierLetter -> True - OtherLetter -> True - _ -> False - -isMark :: Char -> Bool -isMark c = case generalCategory c of - NonSpacingMark -> True - SpacingCombiningMark -> True - EnclosingMark -> True - _ -> False - -isNumber :: Char -> Bool -isNumber c = case generalCategory c of - DecimalNumber -> True - LetterNumber -> True - OtherNumber -> True - _ -> False - -isPunctuation :: Char -> Bool -isPunctuation c = case generalCategory c of - ConnectorPunctuation -> True - DashPunctuation -> True - OpenPunctuation -> True - ClosePunctuation -> True - InitialQuote -> True - FinalQuote -> True - OtherPunctuation -> True - _ -> False - -isSymbol :: Char -> Bool -isSymbol c = case generalCategory c of - MathSymbol -> True - CurrencySymbol -> True - ModifierSymbol -> True - OtherSymbol -> True - _ -> False - -isSeparator :: Char -> Bool -isSeparator c = case generalCategory c of - Space -> True - LineSeparator -> True - ParagraphSeparator -> True - _ -> False - -- | Selects the first 128 characters of the Unicode character set, -- corresponding to the ASCII character set. isAscii :: Char -> Bool -isAscii c = c < '\x80' +isAscii c = c < '\x80' -- | Selects the first 256 characters of the Unicode character set, -- corresponding to the ISO 8859-1 (Latin-1) character set. isLatin1 :: Char -> Bool isLatin1 c = c <= '\xff' -isAsciiUpper, isAsciiLower :: Char -> Bool +-- | Selects ASCII lower-case letters, +-- i.e. characters satisfying both 'isAscii' and 'isLower'. +isAsciiLower :: Char -> Bool isAsciiLower c = c >= 'a' && c <= 'z' + +-- | Selects ASCII upper-case letters, +-- i.e. characters satisfying both 'isAscii' and 'isUpper'. +isAsciiUpper :: Char -> Bool isAsciiUpper c = c >= 'A' && c <= 'Z' -- | Selects control characters, which are the non-printing characters of @@ -160,30 +63,32 @@ isControl :: Char -> Bool -- (letters, numbers, marks, punctuation, symbols and spaces). isPrint :: Char -> Bool --- | Selects white-space characters in the Latin-1 range. --- (In Unicode terms, this includes spaces and some control characters.) +-- | Returns 'True' for any Unicode space character, and the control +-- characters @\\t@, @\\n@, @\\r@, @\\f@, @\\v@. isSpace :: Char -> Bool -- isSpace includes non-breaking space -- Done with explicit equalities both for efficiency, and to avoid a tiresome -- recursion with GHC.List elem -isSpace c = c == ' ' || - c == '\t' || - c == '\n' || - c == '\r' || - c == '\f' || - c == '\v' || - c == '\xa0' || - iswspace (fromIntegral (ord c)) /= 0 - --- | Selects alphabetic Unicode characters (letters) that are not lower-case. --- (In Unicode terms, this includes letters in upper and title cases, --- as well as modifier letters and other letters.) +isSpace c = c == ' ' || + c == '\t' || + c == '\n' || + c == '\r' || + c == '\f' || + c == '\v' || + c == '\xa0' || + iswspace (fromIntegral (ord c)) /= 0 + +-- | Selects upper-case or title-case alphabetic Unicode characters (letters). +-- Title case is used by a small number of letter ligatures like the +-- single-character form of /Lj/. isUpper :: Char -> Bool -- | Selects lower-case alphabetic Unicode characters (letters). isLower :: Char -> Bool --- | Selects alphabetic Unicode characters (letters). +-- | Selects alphabetic Unicode characters (lower-case, upper-case and +-- title-case letters, plus letters of caseless scripts and modifiers letters). +-- This function is equivalent to 'Data.Char.isLetter'. isAlpha :: Char -> Bool -- | Selects alphabetic or numeric digit Unicode characters. @@ -195,27 +100,32 @@ isAlphaNum :: Char -> Bool -- | Selects ASCII digits, i.e. @\'0\'@..@\'9\'@. isDigit :: Char -> Bool +isDigit c = c >= '0' && c <= '9' -- | Selects ASCII octal digits, i.e. @\'0\'@..@\'7\'@. isOctDigit :: Char -> Bool -isOctDigit c = c >= '0' && c <= '7' +isOctDigit c = c >= '0' && c <= '7' -- | Selects ASCII hexadecimal digits, -- i.e. @\'0\'@..@\'9\'@, @\'a\'@..@\'f\'@, @\'A\'@..@\'F\'@. isHexDigit :: Char -> Bool -isHexDigit c = isDigit c || c >= 'A' && c <= 'F' || +isHexDigit c = isDigit c || c >= 'A' && c <= 'F' || c >= 'a' && c <= 'f' --- | Convert a letter to the corresponding upper-case letter, leaving any --- other character unchanged. Any Unicode letter which has an upper-case --- equivalent is transformed. +-- | Convert a letter to the corresponding upper-case letter, if any. +-- Any other character is returned unchanged. toUpper :: Char -> Char --- | Convert a letter to the corresponding lower-case letter, leaving any --- other character unchanged. Any Unicode letter which has a lower-case --- equivalent is transformed. +-- | Convert a letter to the corresponding lower-case letter, if any. +-- Any other character is returned unchanged. toLower :: Char -> Char +-- | Convert a letter to the corresponding title-case or upper-case +-- letter, if any. (Title case differs from upper case only for a small +-- number of ligature letters.) +-- Any other character is returned unchanged. +toTitle :: Char -> Char + -- ----------------------------------------------------------------------------- -- Implementation with the supplied auto-generated Unicode character properties -- table (default) @@ -224,9 +134,6 @@ toLower :: Char -> Char -- Regardless of the O/S and Library, use the functions contained in WCsubst.c -type WInt = HTYPE_WINT_T - -isDigit c = iswdigit (fromIntegral (ord c)) /= 0 isAlpha c = iswalpha (fromIntegral (ord c)) /= 0 isAlphaNum c = iswalnum (fromIntegral (ord c)) /= 0 --isSpace c = iswspace (fromIntegral (ord c)) /= 0 @@ -239,9 +146,6 @@ toLower c = chr (fromIntegral (towlower (fromIntegral (ord c)))) toUpper c = chr (fromIntegral (towupper (fromIntegral (ord c)))) toTitle c = chr (fromIntegral (towtitle (fromIntegral (ord c)))) -foreign import ccall unsafe "u_iswdigit" - iswdigit :: CInt -> CInt - foreign import ccall unsafe "u_iswalpha" iswalpha :: CInt -> CInt @@ -273,30 +177,29 @@ foreign import ccall unsafe "u_towtitle" towtitle :: CInt -> CInt foreign import ccall unsafe "u_gencat" - wgencat :: CInt -> Int + wgencat :: CInt -> CInt -- ----------------------------------------------------------------------------- -- No libunicode, so fall back to the ASCII-only implementation (never used, indeed) #else -isControl c = c < ' ' || c >= '\DEL' && c <= '\x9f' -isPrint c = not (isControl c) +isControl c = c < ' ' || c >= '\DEL' && c <= '\x9f' +isPrint c = not (isControl c) -- The upper case ISO characters have the multiplication sign dumped -- randomly in the middle of the range. Go figure. -isUpper c = c >= 'A' && c <= 'Z' || +isUpper c = c >= 'A' && c <= 'Z' || c >= '\xC0' && c <= '\xD6' || c >= '\xD8' && c <= '\xDE' -- The lower case ISO characters have the division sign dumped -- randomly in the middle of the range. Go figure. -isLower c = c >= 'a' && c <= 'z' || +isLower c = c >= 'a' && c <= 'z' || c >= '\xDF' && c <= '\xF6' || c >= '\xF8' && c <= '\xFF' -isAlpha c = isLower c || isUpper c -isDigit c = c >= '0' && c <= '9' -isAlphaNum c = isAlpha c || isDigit c +isAlpha c = isLower c || isUpper c +isAlphaNum c = isAlpha c || isDigit c -- Case-changing operations @@ -304,7 +207,7 @@ toUpper c@(C# c#) | isAsciiLower c = C# (chr# (ord# c# -# 32#)) | isAscii c = c -- fall-through to the slower stuff. - | isLower c && c /= '\xDF' && c /= '\xFF' + | isLower c && c /= '\xDF' && c /= '\xFF' = unsafeChr (ord c `minusInt` ord 'a' `plusInt` ord 'A') | otherwise = c @@ -313,8 +216,8 @@ toUpper c@(C# c#) toLower c@(C# c#) | isAsciiUpper c = C# (chr# (ord# c# +# 32#)) | isAscii c = c - | isUpper c = unsafeChr (ord c `minusInt` ord 'A' `plusInt` ord 'a') - | otherwise = c + | isUpper c = unsafeChr (ord c `minusInt` ord 'A' `plusInt` ord 'a') + | otherwise = c #endif