-{-# OPTIONS_GHC -fno-implicit-prelude #-}
+{-# OPTIONS -fno-implicit-prelude #-}
+{-# OPTIONS -#include "WCsubst.h" #-}
-----------------------------------------------------------------------------
-- |
-- Module : GHC.Unicde
-----------------------------------------------------------------------------
module GHC.Unicode (
+ GeneralCategory (..),
+ generalCategory,
isAscii, isLatin1, isControl,
isAsciiUpper, isAsciiLower,
isPrint, isSpace, isUpper,
isLower, isAlpha, isDigit,
isOctDigit, isHexDigit, isAlphaNum,
- toUpper, toLower,
+ toUpper, toLower, toTitle,
+ isLetter, -- :: Char -> Bool
+ isMark, -- :: Char -> Bool
+ isNumber, -- :: Char -> Bool
+ isPunctuation, -- :: Char -> Bool
+ isSymbol, -- :: Char -> Bool
+ isSeparator, -- :: Char -> Bool
) where
import GHC.Base
import GHC.Int
import GHC.Word
import GHC.Num (fromInteger)
+import GHC.Read
+import GHC.Show
+import GHC.Enum
-#include "ghcconfig.h"
#include "HsBaseConfig.h"
+-- | Unicode General Categories (column 2 of the UnicodeData table)
+-- in the order they are listed in the Unicode standard.
+
+data GeneralCategory
+ = UppercaseLetter -- Lu Letter, Uppercase
+ | LowercaseLetter -- Ll Letter, Lowercase
+ | TitlecaseLetter -- Lt Letter, Titlecase
+ | ModifierLetter -- Lm Letter, Modifier
+ | OtherLetter -- Lo Letter, Other
+ | NonSpacingMark -- Mn Mark, Non-Spacing
+ | SpacingCombiningMark -- Mc Mark, Spacing Combining
+ | EnclosingMark -- Me Mark, Enclosing
+ | DecimalNumber -- Nd Number, Decimal
+ | LetterNumber -- Nl Number, Letter
+ | OtherNumber -- No Number, Other
+ | ConnectorPunctuation -- Pc Punctuation, Connector
+ | DashPunctuation -- Pd Punctuation, Dash
+ | OpenPunctuation -- Ps Punctuation, Open
+ | ClosePunctuation -- Pe Punctuation, Close
+ | InitialQuote -- Pi Punctuation, Initial quote
+ | FinalQuote -- Pf Punctuation, Final quote
+ | OtherPunctuation -- Po Punctuation, Other
+ | MathSymbol -- Sm Symbol, Math
+ | CurrencySymbol -- Sc Symbol, Currency
+ | ModifierSymbol -- Sk Symbol, Modifier
+ | OtherSymbol -- So Symbol, Other
+ | Space -- Zs Separator, Space
+ | LineSeparator -- Zl Separator, Line
+ | ParagraphSeparator -- Zp Separator, Paragraph
+ | Control -- Cc Other, Control
+ | Format -- Cf Other, Format
+ | Surrogate -- Cs Other, Surrogate
+ | PrivateUse -- Co Other, Private Use
+ | NotAssigned -- Cn Other, Not Assigned
+ deriving (Eq, Ord, Enum, Read, Show, Bounded)
+
+-- | Retrieves the general Unicode category of the character.
+generalCategory :: Char -> GeneralCategory
+generalCategory c = toEnum (wgencat (fromIntegral (ord c)))
+
+-- ------------------------------------------------------------------------
+-- These are copied from Hugs Unicode.hs
+
+-- derived character classifiers
+
+isLetter :: Char -> Bool
+isLetter c = case generalCategory c of
+ UppercaseLetter -> True
+ LowercaseLetter -> True
+ TitlecaseLetter -> True
+ ModifierLetter -> True
+ OtherLetter -> True
+ _ -> False
+
+isMark :: Char -> Bool
+isMark c = case generalCategory c of
+ NonSpacingMark -> True
+ SpacingCombiningMark -> True
+ EnclosingMark -> True
+ _ -> False
+
+isNumber :: Char -> Bool
+isNumber c = case generalCategory c of
+ DecimalNumber -> True
+ LetterNumber -> True
+ OtherNumber -> True
+ _ -> False
+
+isPunctuation :: Char -> Bool
+isPunctuation c = case generalCategory c of
+ ConnectorPunctuation -> True
+ DashPunctuation -> True
+ OpenPunctuation -> True
+ ClosePunctuation -> True
+ InitialQuote -> True
+ FinalQuote -> True
+ OtherPunctuation -> True
+ _ -> False
+
+isSymbol :: Char -> Bool
+isSymbol c = case generalCategory c of
+ MathSymbol -> True
+ CurrencySymbol -> True
+ ModifierSymbol -> True
+ OtherSymbol -> True
+ _ -> False
+
+isSeparator :: Char -> Bool
+isSeparator c = case generalCategory c of
+ Space -> True
+ LineSeparator -> True
+ ParagraphSeparator -> True
+ _ -> False
+
-- | Selects the first 128 characters of the Unicode character set,
-- corresponding to the ASCII character set.
isAscii :: Char -> Bool
c == '\r' ||
c == '\f' ||
c == '\v' ||
- c == '\xa0'
+ c == '\xa0' ||
+ iswspace (fromIntegral (ord c)) /= 0
-- | Selects alphabetic Unicode characters (letters) that are not lower-case.
-- (In Unicode terms, this includes letters in upper and title cases,
isLower :: Char -> Bool
-- | Selects alphabetic Unicode characters (letters).
---
--- Note: the Haskell 98 definition of 'isAlpha' is:
---
--- > isAlpha c = isUpper c || isLower c
---
--- the implementation here diverges from the Haskell 98
--- definition in the sense that Unicode alphabetic characters which
--- are neither upper nor lower case will still be identified as
--- alphabetic by 'isAlpha'.
isAlpha :: Char -> Bool
-- | Selects alphabetic or numeric digit Unicode characters.
-- | Selects ASCII digits, i.e. @\'0\'@..@\'9\'@.
isDigit :: Char -> Bool
+isDigit c = c >= '0' && c <= '9'
-- | Selects ASCII octal digits, i.e. @\'0\'@..@\'7\'@.
isOctDigit :: Char -> Bool
-- equivalent is transformed.
toLower :: Char -> Char
--- -----------------------------------------------------------------------------
--- Win32 implementation
+-- | Convert a letter to the corresponding title-case letter, leaving any
+-- other character unchanged. Any Unicode letter which has a lower-case
+-- equivalent is transformed.
+toTitle :: Char -> Char
-#if (defined(HAVE_WCTYPE_H) && HAVE_ISWSPACE && defined(HTYPE_WINT_T)) || mingw32_TARGET_OS
+-- -----------------------------------------------------------------------------
+-- Implementation with the supplied auto-generated Unicode character properties
+-- table (default)
--- Use the wide-char classification functions if available. Glibc
--- seems to implement these properly, even for chars > 0xffff, as long
--- as you call setlocale() to set the locale to something other than
--- "C". Therefore, we call setlocale() in hs_init().
+#if 1
--- Win32 uses UTF-16, so presumably the system-supplied iswlower() and
--- friends won't work properly with characters > 0xffff. These
--- characters are represented as surrogate pairs in UTF-16.
+-- Regardless of the O/S and Library, use the functions contained in WCsubst.c
type WInt = HTYPE_WINT_T
type CInt = HTYPE_INT
-isDigit c = iswdigit (fromIntegral (ord c)) /= 0
isAlpha c = iswalpha (fromIntegral (ord c)) /= 0
isAlphaNum c = iswalnum (fromIntegral (ord c)) /= 0
--isSpace c = iswspace (fromIntegral (ord c)) /= 0
toLower c = chr (fromIntegral (towlower (fromIntegral (ord c))))
toUpper c = chr (fromIntegral (towupper (fromIntegral (ord c))))
+toTitle c = chr (fromIntegral (towtitle (fromIntegral (ord c))))
-foreign import ccall unsafe "iswdigit"
- iswdigit :: WInt -> CInt
+foreign import ccall unsafe "u_iswdigit"
+ iswdigit :: CInt -> CInt
-foreign import ccall unsafe "iswalpha"
- iswalpha :: WInt -> CInt
+foreign import ccall unsafe "u_iswalpha"
+ iswalpha :: CInt -> CInt
-foreign import ccall unsafe "iswalnum"
- iswalnum :: WInt -> CInt
+foreign import ccall unsafe "u_iswalnum"
+ iswalnum :: CInt -> CInt
-foreign import ccall unsafe "iswcntrl"
- iswcntrl :: WInt -> CInt
+foreign import ccall unsafe "u_iswcntrl"
+ iswcntrl :: CInt -> CInt
-foreign import ccall unsafe "iswspace"
- iswspace :: WInt -> CInt
+foreign import ccall unsafe "u_iswspace"
+ iswspace :: CInt -> CInt
-foreign import ccall unsafe "iswprint"
- iswprint :: WInt -> CInt
+foreign import ccall unsafe "u_iswprint"
+ iswprint :: CInt -> CInt
-foreign import ccall unsafe "iswlower"
- iswlower :: WInt -> CInt
+foreign import ccall unsafe "u_iswlower"
+ iswlower :: CInt -> CInt
-foreign import ccall unsafe "iswupper"
- iswupper :: WInt -> CInt
+foreign import ccall unsafe "u_iswupper"
+ iswupper :: CInt -> CInt
-foreign import ccall unsafe "towlower"
- towlower :: WInt -> WInt
+foreign import ccall unsafe "u_towlower"
+ towlower :: CInt -> CInt
-foreign import ccall unsafe "towupper"
- towupper :: WInt -> WInt
+foreign import ccall unsafe "u_towupper"
+ towupper :: CInt -> CInt
+
+foreign import ccall unsafe "u_towtitle"
+ towtitle :: CInt -> CInt
+
+foreign import ccall unsafe "u_gencat"
+ wgencat :: CInt -> Int
-- -----------------------------------------------------------------------------
--- No libunicode, so fall back to the ASCII-only implementation
+-- No libunicode, so fall back to the ASCII-only implementation (never used, indeed)
#else
c >= '\xF8' && c <= '\xFF'
isAlpha c = isLower c || isUpper c
-isDigit c = c >= '0' && c <= '9'
isAlphaNum c = isAlpha c || isDigit c
-- Case-changing operations