GHC/Unicode.hs

   1 {-# OPTIONS_GHC -XNoImplicitPrelude #-}
   2 {-# OPTIONS -#include "WCsubst.h" #-}
   3 {-# OPTIONS_HADDOCK hide #-}
   4 -----------------------------------------------------------------------------
   5 -- |
   6 -- Module      :  GHC.Unicode
   7 -- Copyright   :  (c) The University of Glasgow, 2003
   8 -- License     :  see libraries/base/LICENSE
   9 --
  10 -- Maintainer  :  cvs-ghc@haskell.org
  11 -- Stability   :  internal
  12 -- Portability :  non-portable (GHC extensions)
  13 --
  14 -- Implementations for the character predicates (isLower, isUpper, etc.)
  15 -- and the conversions (toUpper, toLower).  On every platform they call the
  16 -- u_* C routines declared in WCsubst.h; no system libunicode is consulted.
  17 --
  18 -----------------------------------------------------------------------------
  19
  20 -- #hide
  21 module GHC.Unicode (
  22     isAscii, isLatin1, isControl,
  23     isAsciiUpper, isAsciiLower,
  24     isPrint, isSpace,  isUpper,
  25     isLower, isAlpha,  isDigit,
  26     isOctDigit, isHexDigit, isAlphaNum,
  27     toUpper, toLower, toTitle,
  28     wgencat,
  29   ) where
  30
  31 import GHC.Base
  32 import GHC.Real        (fromIntegral)
  33 import Foreign.C.Types (CInt)
  34
  35 #include "HsBaseConfig.h"
  36
  37 -- | 'True' exactly for the ASCII subset of Unicode, i.e. the 128
  38 -- characters from U+0000 up to and including U+007F.
  39 isAscii :: Char -> Bool
  40 isAscii c = c <= '\x7f'
  41
  42 -- | 'True' exactly for the ISO 8859-1 (Latin-1) subset of Unicode, i.e.
  43 -- the 256 characters below U+0100.
  44 isLatin1 :: Char -> Bool
  45 isLatin1 c = c < '\x100'
  46
  47 -- | 'True' for the ASCII small letters @\'a\'@..@\'z\'@,
  48 -- i.e. the characters accepted by both 'isAscii' and 'isLower'.
  49 isAsciiLower :: Char -> Bool
  50 isAsciiLower c = 'a' <= c && c <= 'z'
  51
  52 -- | 'True' for the ASCII capital letters @\'A\'@..@\'Z\'@,
  53 -- i.e. the characters accepted by both 'isAscii' and 'isUpper'.
  54 isAsciiUpper :: Char -> Bool
  55 isAsciiUpper c = 'A' <= c && c <= 'Z'
  56
  57 -- | Selects control characters, which are the non-printing characters of
  58 -- the Latin-1 subset of Unicode.
  59 isControl               :: Char -> Bool  -- active equation tests u_iswcntrl
  60
  61 -- | Selects printable Unicode characters
  62 -- (letters, numbers, marks, punctuation, symbols and spaces).
  63 isPrint                 :: Char -> Bool  -- active equation tests u_iswprint
  64
  65 -- | 'True' for every Unicode space character and for the control
  66 -- characters @\\t@, @\\n@, @\\v@, @\\f@ and @\\r@.
  67 isSpace :: Char -> Bool
  68 -- The no-break space U+00A0 is accepted explicitly.  The common cases are
  69 -- decided by direct comparisons, both for speed and so that this module
  70 -- does not get into a tiresome recursion with 'elem' from GHC.List.
  71 isSpace c
  72   | c == ' '               = True
  73   | c >= '\t' && c <= '\r' = True   -- \t \n \v \f \r are U+0009..U+000D
  74   | c == '\xa0'            = True
  75   | otherwise              = iswspace codePoint /= 0
  76   where
  77     -- Code point of @c@, converted to the C integer type 'iswspace' takes.
  78     codePoint = fromIntegral (ord c)
  79
  80 -- | Selects upper-case or title-case alphabetic Unicode characters (letters).
  81 -- Title case is used by a small number of letter ligatures like the
  82 -- single-character form of /Lj/.
  83 isUpper                 :: Char -> Bool  -- active equation tests u_iswupper
  84
  85 -- | Selects lower-case alphabetic Unicode characters (letters).
  86 isLower                 :: Char -> Bool  -- active equation tests u_iswlower
  87
  88 -- | Selects alphabetic Unicode characters (lower-case, upper-case and
  89 -- title-case letters, plus letters of caseless scripts and modifier letters).
  90 -- This function is equivalent to 'Data.Char.isLetter'.
  91 isAlpha                 :: Char -> Bool  -- active equation tests u_iswalpha
  92
  93 -- | Selects alphabetic or numeric digit Unicode characters.
  94 --
  95 -- Note that numeric digits outside the ASCII range are selected by this
  96 -- function but not by 'isDigit'.  Such digits may be part of identifiers
  97 -- but are not used by the printer and reader to represent numbers.
  98 isAlphaNum              :: Char -> Bool  -- active equation tests u_iswalnum
  99
 100 -- | 'True' exactly for the ASCII decimal digits @\'0\'@..@\'9\'@.
 101 isDigit :: Char -> Bool
 102 isDigit c = '0' <= c && c <= '9'
 103
 104 -- | 'True' exactly for the ASCII octal digits @\'0\'@..@\'7\'@.
 105 isOctDigit :: Char -> Bool
 106 isOctDigit c = '0' <= c && c <= '7'
 107
 108 -- | 'True' for the ASCII hexadecimal digits: @\'0\'@..@\'9\'@,
 109 -- @\'A\'@..@\'F\'@ and @\'a\'@..@\'f\'@.
 110 isHexDigit :: Char -> Bool
 111 isHexDigit c = isDigit c || inRange 'A' 'F' || inRange 'a' 'f'
 112   where inRange lo hi = lo <= c && c <= hi
 113
 114 -- | Convert a letter to the corresponding upper-case letter, if any.
 115 -- Any other character is returned unchanged.
 116 toUpper                 :: Char -> Char  -- active equation calls u_towupper
 117
 118 -- | Convert a letter to the corresponding lower-case letter, if any.
 119 -- Any other character is returned unchanged.
 120 toLower                 :: Char -> Char  -- active equation calls u_towlower
 121
 122 -- | Convert a letter to the corresponding title-case or upper-case
 123 -- letter, if any.  (Title case differs from upper case only for a small
 124 -- number of ligature letters.)
 125 -- Any other character is returned unchanged.
 126 toTitle                 :: Char -> Char  -- u_towtitle; no '#else' equation exists
 127
 128 -- -----------------------------------------------------------------------------
 129 -- Implementation with the supplied auto-generated Unicode character properties
 130 -- table (default)
 131
 132 #if 1
 133
 134 -- Whatever the O/S and C library, use the routines from WCsubst.c that are
 135 -- imported below; a non-zero result means the character has the property.
 136 isAlpha    = (/= 0) . iswalpha . fromIntegral . ord
 137 isAlphaNum = (/= 0) . iswalnum . fromIntegral . ord
 138 -- (isSpace has its own equation further up, so it is not repeated here.)
 139 isControl  = (/= 0) . iswcntrl . fromIntegral . ord
 140 isPrint    = (/= 0) . iswprint . fromIntegral . ord
 141 isUpper    = (/= 0) . iswupper . fromIntegral . ord
 142 isLower    = (/= 0) . iswlower . fromIntegral . ord
 143
 144 toLower = chr . fromIntegral . towlower . fromIntegral . ord
 145 toUpper = chr . fromIntegral . towupper . fromIntegral . ord
 146 toTitle = chr . fromIntegral . towtitle . fromIntegral . ord
 147
 148 -- Classification routine called by 'isAlpha'.
 149 foreign import ccall unsafe "u_iswalpha" iswalpha :: CInt -> CInt
 150
 151 -- Classification routine called by 'isAlphaNum'.
 152 foreign import ccall unsafe "u_iswalnum" iswalnum :: CInt -> CInt
 153
 154 -- Classification routine called by 'isControl'.
 155 foreign import ccall unsafe "u_iswcntrl" iswcntrl :: CInt -> CInt
 156
 157 -- Classification routine called by 'isSpace' once its fast checks fail.
 158 foreign import ccall unsafe "u_iswspace" iswspace :: CInt -> CInt
 159
 160 -- Classification routine called by 'isPrint'.
 161 foreign import ccall unsafe "u_iswprint" iswprint :: CInt -> CInt
 162
 163 -- Classification routine called by 'isLower'.
 164 foreign import ccall unsafe "u_iswlower" iswlower :: CInt -> CInt
 165
 166 -- Classification routine called by 'isUpper'.
 167 foreign import ccall unsafe "u_iswupper" iswupper :: CInt -> CInt
 168
 169 -- Conversion routine called by 'toLower'.
 170 foreign import ccall unsafe "u_towlower" towlower :: CInt -> CInt
 171
 172 -- Conversion routine called by 'toUpper'.
 173 foreign import ccall unsafe "u_towupper" towupper :: CInt -> CInt
 174
 175 -- Conversion routine called by 'toTitle'.
 176 foreign import ccall unsafe "u_towtitle" towtitle :: CInt -> CInt
 177
 178 -- Exported directly, unused here; presumably the general category (confirm).
 179 foreign import ccall unsafe "u_gencat" wgencat :: CInt -> CInt
 180
 181 -- -----------------------------------------------------------------------------
 182 -- No libunicode: Latin-1-only fallback (dead code; the '#if 1' above always wins)
 183
 184 #else
 185
 186 isControl c = c <= '\x1f' || '\x7f' <= c && c <= '\x9f'  -- below ' ', or DEL..U+009F
 187 isPrint     = not . isControl                            -- everything else is printable
 188
 189 -- ASCII capitals plus the ISO capitals U+00C0..U+00DE; the multiplication
 190 -- sign U+00D7 sits in the middle of that range and has to be skipped.
 191 isUpper c
 192   | c >= 'A' && c <= 'Z' = True
 193   | otherwise            = c >= '\xC0' && c <= '\xDE' && c /= '\xD7'
 194 -- ASCII small letters plus the ISO small letters U+00DF..U+00FF; the
 195 -- division sign U+00F7 sits in the middle of that range and is skipped.
 196 isLower c
 197   | c >= 'a' && c <= 'z' = True
 198   | otherwise            = c >= '\xDF' && c <= '\xFF' && c /= '\xF7'
 199
 200 isAlpha c    = isUpper c || isLower c   -- a letter is anything with a case
 201 isAlphaNum c = isDigit c || isAlpha c
 202
 203 -- Case-changing operations
 204
 205 toUpper c@(C# c#)
 206   -- ASCII fast path: 'a'..'z' lie exactly 32 above 'A'..'Z'.
 207   | isAsciiLower c = C# (chr# (ord# c# -# 32#))
 208   | isAscii c      = c
 209   -- Remaining small letters move down by the same 32 (ord 'a' - ord 'A');
 210   -- U+00DF and U+00FF are excluded and so fall through unchanged.
 211   | isLower c && not (c == '\xDF' || c == '\xFF') = unsafeChr (ord c `minusInt` 32)
 212   | otherwise      = c
 213
 214
 215 toLower c@(C# c#)
 216   | isAsciiUpper c               = C# (chr# (ord# c# +# 32#))
 217   | isAscii c || not (isUpper c) = c
 218   -- A non-ASCII capital: its small letter is 32 (ord 'a' - ord 'A') higher.
 219   | otherwise                    = unsafeChr (ord c `plusInt` 32)
 220
 221 #endif
 222