-- | Unicode characters are divided into letters, numbers, marks,
-- punctuation, symbols, separators (including spaces) and others
-- (including control characters).
- , isAscii, isLatin1, isControl, isSpace
- , isLower, isUpper, isAlpha, isAlphaNum, isPrint
+ , isControl, isSpace
+ , isLower, isUpper, isAlpha, isAlphaNum, isPrint
, isDigit, isOctDigit, isHexDigit
- , isAsciiUpper, isAsciiLower
, isLetter, isMark, isNumber, isPunctuation, isSymbol, isSeparator
+ -- ** Subranges
+ , isAscii, isLatin1
+ , isAsciiUpper, isAsciiLower
+
+ -- ** Unicode general categories
, GeneralCategory(..), generalCategory
-- * Case conversion
| NotAssigned -- ^ Cn: Other, Not Assigned
deriving (Eq, Ord, Enum, Read, Show, Bounded)
--- | Retrieves the general Unicode category of the character.
+-- | The Unicode general category of the character.
generalCategory :: Char -> GeneralCategory
#if defined(__GLASGOW_HASKELL__) || defined(__NHC__)
generalCategory c = toEnum (wgencat (fromIntegral (ord c)))
-- derived character classifiers
+-- | Selects alphabetic Unicode characters (lower-case, upper-case and
+-- title-case letters, plus letters of caseless scripts and modifier letters).
+-- This function is equivalent to 'Data.Char.isAlpha'.
isLetter :: Char -> Bool
isLetter c = case generalCategory c of
UppercaseLetter -> True
OtherLetter -> True
_ -> False
+-- | Selects Unicode mark characters, e.g. accents and the like, which
+-- combine with preceding letters.
isMark :: Char -> Bool
isMark c = case generalCategory c of
NonSpacingMark -> True
EnclosingMark -> True
_ -> False
+-- | Selects Unicode numeric characters, including digits from various
+-- scripts, Roman numerals, etc.
isNumber :: Char -> Bool
isNumber c = case generalCategory c of
DecimalNumber -> True
OtherNumber -> True
_ -> False
+-- | Selects Unicode punctuation characters, including various kinds
+-- of connectors, brackets and quotes.
isPunctuation :: Char -> Bool
isPunctuation c = case generalCategory c of
ConnectorPunctuation -> True
OtherPunctuation -> True
_ -> False
+-- | Selects Unicode symbol characters, including mathematical and
+-- currency symbols.
isSymbol :: Char -> Bool
isSymbol c = case generalCategory c of
MathSymbol -> True
OtherSymbol -> True
_ -> False
+-- | Selects Unicode space and separator characters.
isSeparator :: Char -> Bool
isSeparator c = case generalCategory c of
Space -> True
type String = [Char]
{-| The character type 'Char' is an enumeration whose values represent
-Unicode (or equivalently ISO 10646) characters.
+Unicode (or equivalently ISO\/IEC 10646) characters
+(see <http://www.unicode.org/> for details).
This set extends the ISO 8859-1 (Latin-1) character set
(the first 256 characters), which is itself an extension of the ASCII
character set (the first 128 characters).
--
-----------------------------------------------------------------------------
+-- #hide
module GHC.Unicode (
isAscii, isLatin1, isControl,
isAsciiUpper, isAsciiLower,
isLatin1 :: Char -> Bool
isLatin1 c = c <= '\xff'
-isAsciiUpper, isAsciiLower :: Char -> Bool
+-- | Selects ASCII lower-case letters,
+-- i.e. characters satisfying both 'isAscii' and 'isLower'.
+isAsciiLower :: Char -> Bool
isAsciiLower c = c >= 'a' && c <= 'z'
+
+-- | Selects ASCII upper-case letters,
+-- i.e. characters satisfying both 'isAscii' and 'isUpper'.
+isAsciiUpper :: Char -> Bool
isAsciiUpper c = c >= 'A' && c <= 'Z'
-- | Selects control characters, which are the non-printing characters of
c == '\xa0' ||
iswspace (fromIntegral (ord c)) /= 0
--- | Selects alphabetic Unicode characters (letters) that are not lower-case.
--- (In Unicode terms, this includes letters in upper and title cases,
--- as well as modifier letters and other letters.)
+-- | Selects upper-case or title-case alphabetic Unicode characters (letters).
+-- Title case is used by a small number of letter ligatures like the
+-- single-character form of /Lj/.
isUpper :: Char -> Bool
-- | Selects lower-case alphabetic Unicode characters (letters).
isLower :: Char -> Bool
--- | Selects alphabetic Unicode characters (letters).
+-- | Selects alphabetic Unicode characters (lower-case, upper-case and
+-- title-case letters, plus letters of caseless scripts and modifier letters).
+-- This function is equivalent to 'Data.Char.isLetter'.
isAlpha :: Char -> Bool
-- | Selects alphabetic or numeric digit Unicode characters.
isHexDigit c = isDigit c || c >= 'A' && c <= 'F' ||
c >= 'a' && c <= 'f'
--- | Convert a letter to the corresponding upper-case letter, leaving any
--- other character unchanged. Any Unicode letter which has an upper-case
--- equivalent is transformed.
+-- | Convert a letter to the corresponding upper-case letter, if any.
+-- Any other character is returned unchanged.
toUpper :: Char -> Char
--- | Convert a letter to the corresponding lower-case letter, leaving any
--- other character unchanged. Any Unicode letter which has a lower-case
--- equivalent is transformed.
+-- | Convert a letter to the corresponding lower-case letter, if any.
+-- Any other character is returned unchanged.
toLower :: Char -> Char
--- | Convert a letter to the corresponding title-case letter, leaving any
--- other character unchanged. Any Unicode letter which has a lower-case
--- equivalent is transformed.
+-- | Convert a letter to the corresponding title-case or upper-case
+-- letter, if any. (Title case differs from upper case only for a small
+-- number of ligature letters.)
+-- Any other character is returned unchanged.
toTitle :: Char -> Char
-- -----------------------------------------------------------------------------