X-Git-Url: http://git.megacz.com/?a=blobdiff_plain;f=GHC%2FUnicode.hs;h=8aefad5f2eb1aaae32bced502c09f09aeba66bfa;hb=1e076454862442966918e6b1b9095e43eab7c770;hp=52d14f37c8964eea9a20bd29ab2472cf7b62c924;hpb=69b2983d1ef10fefe155db9f777b9201e4c5b447;p=ghc-base.git diff --git a/GHC/Unicode.hs b/GHC/Unicode.hs index 52d14f3..8aefad5 100644 --- a/GHC/Unicode.hs +++ b/GHC/Unicode.hs @@ -1,4 +1,5 @@ -{-# OPTIONS_GHC -fno-implicit-prelude #-} +{-# OPTIONS -fno-implicit-prelude #-} +{-# OPTIONS -#include "WCsubst.h" #-} ----------------------------------------------------------------------------- -- | -- Module : GHC.Unicde @@ -15,14 +16,21 @@ -- ----------------------------------------------------------------------------- --- #hide module GHC.Unicode ( + GeneralCategory (..), + generalCategory, isAscii, isLatin1, isControl, isAsciiUpper, isAsciiLower, isPrint, isSpace, isUpper, isLower, isAlpha, isDigit, isOctDigit, isHexDigit, isAlphaNum, - toUpper, toLower, + toUpper, toLower, toTitle, + isLetter, -- :: Char -> Bool + isMark, -- :: Char -> Bool + isNumber, -- :: Char -> Bool + isPunctuation, -- :: Char -> Bool + isSymbol, -- :: Char -> Bool + isSeparator, -- :: Char -> Bool ) where import GHC.Base @@ -30,9 +38,106 @@ import GHC.Real (fromIntegral) import GHC.Int import GHC.Word import GHC.Num (fromInteger) +import GHC.Read +import GHC.Show +import GHC.Enum #include "HsBaseConfig.h" +-- | Unicode General Categories (column 2 of the UnicodeData table) +-- in the order they are listed in the Unicode standard. + +data GeneralCategory + = UppercaseLetter -- Lu Letter, Uppercase + | LowercaseLetter -- Ll Letter, Lowercase + | TitlecaseLetter -- Lt Letter, Titlecase + | ModifierLetter -- Lm Letter, Modifier + | OtherLetter -- Lo Letter, Other + | NonSpacingMark -- Mn Mark, Non-Spacing + | SpacingCombiningMark -- Mc Mark, Spacing Combining + | EnclosingMark -- Me Mark, Enclosing + | DecimalNumber -- Nd Number, Decimal + | LetterNumber -- Nl Number, Letter + | OtherNumber -- No Number, Other + | ConnectorPunctuation -- Pc Punctuation, Connector + | DashPunctuation -- Pd Punctuation, Dash + | OpenPunctuation -- Ps Punctuation, Open + | ClosePunctuation -- Pe Punctuation, Close + | InitialQuote -- Pi Punctuation, Initial quote + | FinalQuote -- Pf Punctuation, Final quote + | OtherPunctuation -- Po Punctuation, Other + | MathSymbol -- Sm Symbol, Math + | CurrencySymbol -- Sc Symbol, Currency + | ModifierSymbol -- Sk Symbol, Modifier + | OtherSymbol -- So Symbol, Other + | Space -- Zs Separator, Space + | LineSeparator -- Zl Separator, Line + | ParagraphSeparator -- Zp Separator, Paragraph + | Control -- Cc Other, Control + | Format -- Cf Other, Format + | Surrogate -- Cs Other, Surrogate + | PrivateUse -- Co Other, Private Use + | NotAssigned -- Cn Other, Not Assigned + deriving (Eq, Ord, Enum, Read, Show, Bounded) + +-- | Retrieves the general Unicode category of the character. +generalCategory :: Char -> GeneralCategory +generalCategory c = toEnum (wgencat (fromIntegral (ord c))) + +-- ------------------------------------------------------------------------ +-- These are copied from Hugs Unicode.hs + +-- derived character classifiers + +isLetter :: Char -> Bool +isLetter c = case generalCategory c of + UppercaseLetter -> True + LowercaseLetter -> True + TitlecaseLetter -> True + ModifierLetter -> True + OtherLetter -> True + _ -> False + +isMark :: Char -> Bool +isMark c = case generalCategory c of + NonSpacingMark -> True + SpacingCombiningMark -> True + EnclosingMark -> True + _ -> False + +isNumber :: Char -> Bool +isNumber c = case generalCategory c of + DecimalNumber -> True + LetterNumber -> True + OtherNumber -> True + _ -> False + +isPunctuation :: Char -> Bool +isPunctuation c = case generalCategory c of + ConnectorPunctuation -> True + DashPunctuation -> True + OpenPunctuation -> True + ClosePunctuation -> True + InitialQuote -> True + FinalQuote -> True + OtherPunctuation -> True + _ -> False + +isSymbol :: Char -> Bool +isSymbol c = case generalCategory c of + MathSymbol -> True + CurrencySymbol -> True + ModifierSymbol -> True + OtherSymbol -> True + _ -> False + +isSeparator :: Char -> Bool +isSeparator c = case generalCategory c of + Space -> True + LineSeparator -> True + ParagraphSeparator -> True + _ -> False + -- | Selects the first 128 characters of the Unicode character set, -- corresponding to the ASCII character set. isAscii :: Char -> Bool @@ -67,7 +172,8 @@ isSpace c = c == ' ' || c == '\r' || c == '\f' || c == '\v' || - c == '\xa0' + c == '\xa0' || + iswspace (fromIntegral (ord c)) /= 0 -- | Selects alphabetic Unicode characters (letters) that are not lower-case. -- (In Unicode terms, this includes letters in upper and title cases, @@ -78,15 +184,6 @@ isUpper :: Char -> Bool isLower :: Char -> Bool -- | Selects alphabetic Unicode characters (letters). --- --- Note: the Haskell 98 definition of 'isAlpha' is: --- --- > isAlpha c = isUpper c || isLower c --- --- the implementation here diverges from the Haskell 98 --- definition in the sense that Unicode alphabetic characters which --- are neither upper nor lower case will still be identified as --- alphabetic by 'isAlpha'. isAlpha :: Char -> Bool -- | Selects alphabetic or numeric digit Unicode characters. @@ -120,21 +217,14 @@ toUpper :: Char -> Char toLower :: Char -> Char -- ----------------------------------------------------------------------------- --- Win32 implementation +-- Implementation with the supplied auto-generated Unicode character properties +-- table (default) -#if (defined(HAVE_WCTYPE_H) && HAVE_ISWSPACE && defined(HTYPE_WINT_T)) || mingw32_HOST_OS +#if 1 --- Use the wide-char classification functions if available. Glibc --- seems to implement these properly, even for chars > 0xffff, as long --- as you call setlocale() to set the locale to something other than --- "C". Therefore, we call setlocale() in hs_init(). - --- Win32 uses UTF-16, so presumably the system-supplied iswlower() and --- friends won't work properly with characters > 0xffff. These --- characters are represented as surrogate pairs in UTF-16. +-- Regardless of the O/S and Library, use the functions contained in WCsubst.c type WInt = HTYPE_WINT_T -type CInt = HTYPE_INT isDigit c = iswdigit (fromIntegral (ord c)) /= 0 isAlpha c = iswalpha (fromIntegral (ord c)) /= 0 @@ -147,39 +237,46 @@ isLower c = iswlower (fromIntegral (ord c)) /= 0 toLower c = chr (fromIntegral (towlower (fromIntegral (ord c)))) toUpper c = chr (fromIntegral (towupper (fromIntegral (ord c)))) +toTitle c = chr (fromIntegral (towtitle (fromIntegral (ord c)))) + +foreign import ccall unsafe "u_iswdigit" + iswdigit :: CInt -> CInt + +foreign import ccall unsafe "u_iswalpha" + iswalpha :: CInt -> CInt -foreign import ccall unsafe "iswdigit" - iswdigit :: WInt -> CInt +foreign import ccall unsafe "u_iswalnum" + iswalnum :: CInt -> CInt -foreign import ccall unsafe "iswalpha" - iswalpha :: WInt -> CInt +foreign import ccall unsafe "u_iswcntrl" + iswcntrl :: CInt -> CInt -foreign import ccall unsafe "iswalnum" - iswalnum :: WInt -> CInt +foreign import ccall unsafe "u_iswspace" + iswspace :: CInt -> CInt -foreign import ccall unsafe "iswcntrl" - iswcntrl :: WInt -> CInt +foreign import ccall unsafe "u_iswprint" + iswprint :: CInt -> CInt -foreign import ccall unsafe "iswspace" - iswspace :: WInt -> CInt +foreign import ccall unsafe "u_iswlower" + iswlower :: CInt -> CInt -foreign import ccall unsafe "iswprint" - iswprint :: WInt -> CInt +foreign import ccall unsafe "u_iswupper" + iswupper :: CInt -> CInt -foreign import ccall unsafe "iswlower" - iswlower :: WInt -> CInt +foreign import ccall unsafe "u_towlower" + towlower :: CInt -> CInt -foreign import ccall unsafe "iswupper" - iswupper :: WInt -> CInt +foreign import ccall unsafe "u_towupper" + towupper :: CInt -> CInt -foreign import ccall unsafe "towlower" - towlower :: WInt -> WInt +foreign import ccall unsafe "u_towtitle" + towtitle :: CInt -> CInt -foreign import ccall unsafe "towupper" - towupper :: WInt -> WInt +foreign import ccall unsafe "u_gencat" + wgencat :: CInt -> Int -- ----------------------------------------------------------------------------- --- No libunicode, so fall back to the ASCII-only implementation +-- No libunicode, so fall back to the ASCII-only implementation (never used, indeed) #else