From 14c3d7f368a6ec5e760b2fd57218171eeaf53a29 Mon Sep 17 00:00:00 2001 From: simonmar Date: Tue, 19 Aug 2003 16:39:14 +0000 Subject: [PATCH] [project @ 2003-08-19 16:39:13 by simonmar] Use the wide-char classifications from the C library if available. This gives us Unicode-aware isLower, isUpper, isAlpha etc. On Unix, you have to set your locale to something. This is usually done by setting the environment variable LANG, eg. export LANG=en This stuff *should* also work on Windows, except that Windows uses a 16-bit wchar_t so will get it wrong for characters > '\xffff'. However, I figured it was better to use the system-supplied functionality rather than trying to implement this stuff ourselves. --- Data/Char.hs | 10 +++ Data/List.hs | 48 +++++++++++- GHC/Read.lhs | 5 +- GHC/Show.lhs | 157 ++++----------------------------------- Text/ParserCombinators/ReadP.hs | 2 +- Text/Read/Lex.hs | 3 +- include/HsBase.h | 5 +- 7 files changed, 81 insertions(+), 149 deletions(-) diff --git a/Data/Char.hs b/Data/Char.hs index 2800684..001c83f 100644 --- a/Data/Char.hs +++ b/Data/Char.hs @@ -42,6 +42,8 @@ module Data.Char import GHC.Base import GHC.Show import GHC.Read (readLitChar, lexLitChar) +import GHC.Unicode +import GHC.Num #endif #ifdef __HUGS__ @@ -53,3 +55,11 @@ import Prelude import Prelude(Char,String) import Char #endif + + +digitToInt :: Char -> Int +digitToInt c + | isDigit c = ord c - ord '0' + | c >= 'a' && c <= 'f' = ord c - ord 'a' + 10 + | c >= 'A' && c <= 'F' = ord c - ord 'A' + 10 + | otherwise = error ("Char.digitToInt: not a digit " ++ show c) -- sigh diff --git a/Data/List.hs b/Data/List.hs index 3e2bead..0a4bb20 100644 --- a/Data/List.hs +++ b/Data/List.hs @@ -141,12 +141,12 @@ import Prelude hiding (Maybe(..)) #endif import Data.Maybe +import Data.Char ( isSpace ) #ifdef __GLASGOW_HASKELL__ import GHC.Num import GHC.Real import GHC.List -import GHC.Show ( lines, words, unlines, unwords ) import GHC.Base #endif @@ -723,3 +723,49 @@ product l = prod l 1 prod (x:xs) a = prod xs (a*x) #endif #endif /* __GLASGOW_HASKELL__ */ + +-- ----------------------------------------------------------------------------- +-- Functions on strings + +-- lines breaks a string up into a list of strings at newline characters. +-- The resulting strings do not contain newlines. Similary, words +-- breaks a string up into a list of words, which were delimited by +-- white space. unlines and unwords are the inverse operations. +-- unlines joins lines with terminating newlines, and unwords joins +-- words with separating spaces. + +lines :: String -> [String] +lines "" = [] +lines s = let (l, s') = break (== '\n') s + in l : case s' of + [] -> [] + (_:s'') -> lines s'' + +unlines :: [String] -> String +#ifdef USE_REPORT_PRELUDE +unlines = concatMap (++ "\n") +#else +-- HBC version (stolen) +-- here's a more efficient version +unlines [] = [] +unlines (l:ls) = l ++ '\n' : unlines ls +#endif + +words :: String -> [String] +words s = case dropWhile {-partain:Char.-}isSpace s of + "" -> [] + s' -> w : words s'' + where (w, s'') = + break {-partain:Char.-}isSpace s' + +unwords :: [String] -> String +#ifdef USE_REPORT_PRELUDE +unwords [] = "" +unwords ws = foldr1 (\w s -> w ++ ' ':s) ws +#else +-- HBC version (stolen) +-- here's a more efficient version +unwords [] = "" +unwords [w] = w +unwords (w:ws) = w ++ ' ' : unwords ws +#endif diff --git a/GHC/Read.lhs b/GHC/Read.lhs index b67e83b..aba24ff 100644 --- a/GHC/Read.lhs +++ b/GHC/Read.lhs @@ -3,7 +3,7 @@ ----------------------------------------------------------------------------- -- | -- Module : GHC.Read --- Copyright : (c) The FFI Task Force, 1994-2002 +-- Copyright : (c) The University of Glasgow, 1994-2002 -- License : see libraries/base/LICENSE -- -- Maintainer : cvs-ghc@haskell.org @@ -65,11 +65,12 @@ import Data.Maybe import Data.Either import {-# SOURCE #-} GHC.Err ( error ) +import {-# SOURCE #-} GHC.Unicode ( isDigit ) import GHC.Num import GHC.Real import GHC.Float import GHC.List -import GHC.Show -- isAlpha etc +import GHC.Show import GHC.Base import GHC.Arr \end{code} diff --git a/GHC/Show.lhs b/GHC/Show.lhs index 1401241..5c0382a 100644 --- a/GHC/Show.lhs +++ b/GHC/Show.lhs @@ -23,17 +23,11 @@ module GHC.Show -- Show support code shows, showChar, showString, showParen, showList__, showSpace, showLitChar, protectEsc, - intToDigit, digitToInt, showSignedInt, + intToDigit, showSignedInt, appPrec, appPrec1, -- Character operations - isAscii, isLatin1, isControl, isPrint, isSpace, isUpper, - isLower, isAlpha, isDigit, isOctDigit, isHexDigit, isAlphaNum, - toUpper, toLower, asciiTab, - - -- String operations - lines, unlines, words, unwords ) where @@ -42,7 +36,7 @@ import GHC.Base import GHC.Enum import Data.Maybe import Data.Either -import GHC.List ( (!!), break, dropWhile +import GHC.List ( (!!), #ifdef USE_REPORT_PRELUDE , concatMap, foldr1 #endif @@ -207,7 +201,7 @@ Code specific for characters \begin{code} showLitChar :: Char -> ShowS -showLitChar c s | c > '\DEL' = showChar '\\' (protectEsc isDigit (shows (ord c)) s) +showLitChar c s | c > '\DEL' = showChar '\\' (protectEsc isDec (shows (ord c)) s) showLitChar '\DEL' s = showString "\\DEL" s showLitChar '\\' s = showString "\\\\" s showLitChar c s | c >= ' ' = showChar c s @@ -223,10 +217,21 @@ showLitChar c s = showString ('\\' : asciiTab!!ord c) s -- I've done manual eta-expansion here, becuase otherwise it's -- impossible to stop (asciiTab!!ord) getting floated out as an MFE +isDec c = c >= '0' && c <= '9' + protectEsc :: (Char -> Bool) -> ShowS -> ShowS protectEsc p f = f . cont where cont s@(c:_) | p c = "\\&" ++ s cont s = s + + +asciiTab :: [String] +asciiTab = -- Using an array drags in the array module. listArray ('\NUL', ' ') + ["NUL", "SOH", "STX", "ETX", "EOT", "ENQ", "ACK", "BEL", + "BS", "HT", "LF", "VT", "FF", "CR", "SO", "SI", + "DLE", "DC1", "DC2", "DC3", "DC4", "NAK", "SYN", "ETB", + "CAN", "EM", "SUB", "ESC", "FS", "GS", "RS", "US", + "SP"] \end{code} Code specific for Ints. @@ -238,13 +243,6 @@ intToDigit (I# i) | i >=# 10# && i <=# 15# = unsafeChr (ord 'a' `minusInt` ten `plusInt` I# i) | otherwise = error ("Char.intToDigit: not a digit " ++ show (I# i)) -digitToInt :: Char -> Int -digitToInt c - | isDigit c = ord c `minusInt` ord '0' - | c >= 'a' && c <= 'f' = ord c `minusInt` ord 'a' `plusInt` ten - | c >= 'A' && c <= 'F' = ord c `minusInt` ord 'A' `plusInt` ten - | otherwise = error ("Char.digitToInt: not a digit " ++ show c) -- sigh - ten = I# 10# showSignedInt :: Int -> Int -> ShowS @@ -268,130 +266,3 @@ itos n# cs | otherwise = case chr# (ord# '0'# +# (n# `remInt#` 10#)) of { c# -> itos' (n# `quotInt#` 10#) (C# c# : cs) } \end{code} - - -%********************************************************* -%* * -\subsection{Character stuff} -%* * -%********************************************************* - -\begin{code} -isAscii, isLatin1, isControl, isPrint, isSpace, isUpper, - isLower, isAlpha, isDigit, isOctDigit, isHexDigit, isAlphaNum, - isAsciiUpper, isAsciiLower :: Char -> Bool -isAscii c = c < '\x80' -isLatin1 c = c <= '\xff' -isControl c = c < ' ' || c >= '\DEL' && c <= '\x9f' -isPrint c = not (isControl c) - --- isSpace includes non-breaking space --- Done with explicit equalities both for efficiency, and to avoid a tiresome --- recursion with GHC.List elem -isSpace c = c == ' ' || - c == '\t' || - c == '\n' || - c == '\r' || - c == '\f' || - c == '\v' || - c == '\xa0' - --- The upper case ISO characters have the multiplication sign dumped --- randomly in the middle of the range. Go figure. -isUpper c = c >= 'A' && c <= 'Z' || - c >= '\xC0' && c <= '\xD6' || - c >= '\xD8' && c <= '\xDE' --- The lower case ISO characters have the division sign dumped --- randomly in the middle of the range. Go figure. -isLower c = c >= 'a' && c <= 'z' || - c >= '\xDF' && c <= '\xF6' || - c >= '\xF8' && c <= '\xFF' -isAsciiLower c = c >= 'a' && c <= 'z' -isAsciiUpper c = c >= 'A' && c <= 'Z' - -isAlpha c = isLower c || isUpper c -isDigit c = c >= '0' && c <= '9' -isOctDigit c = c >= '0' && c <= '7' -isHexDigit c = isDigit c || c >= 'A' && c <= 'F' || - c >= 'a' && c <= 'f' -isAlphaNum c = isAlpha c || isDigit c - --- Case-changing operations - -toUpper, toLower :: Char -> Char -toUpper c@(C# c#) - | isAsciiLower c = C# (chr# (ord# c# -# 32#)) - | isAscii c = c - -- fall-through to the slower stuff. - | isLower c && c /= '\xDF' && c /= '\xFF' - = unsafeChr (ord c `minusInt` ord 'a' `plusInt` ord 'A') - | otherwise - = c - - -toLower c@(C# c#) - | isAsciiUpper c = C# (chr# (ord# c# +# 32#)) - | isAscii c = c - | isUpper c = unsafeChr (ord c `minusInt` ord 'A' `plusInt` ord 'a') - | otherwise = c - -asciiTab :: [String] -asciiTab = -- Using an array drags in the array module. listArray ('\NUL', ' ') - ["NUL", "SOH", "STX", "ETX", "EOT", "ENQ", "ACK", "BEL", - "BS", "HT", "LF", "VT", "FF", "CR", "SO", "SI", - "DLE", "DC1", "DC2", "DC3", "DC4", "NAK", "SYN", "ETB", - "CAN", "EM", "SUB", "ESC", "FS", "GS", "RS", "US", - "SP"] -\end{code} - -%********************************************************* -%* * -\subsection{Functions on strings} -%* * -%********************************************************* - -lines breaks a string up into a list of strings at newline characters. -The resulting strings do not contain newlines. Similary, words -breaks a string up into a list of words, which were delimited by -white space. unlines and unwords are the inverse operations. -unlines joins lines with terminating newlines, and unwords joins -words with separating spaces. - -\begin{code} -lines :: String -> [String] -lines "" = [] -lines s = let (l, s') = break (== '\n') s - in l : case s' of - [] -> [] - (_:s'') -> lines s'' - -words :: String -> [String] -words s = case dropWhile {-partain:Char.-}isSpace s of - "" -> [] - s' -> w : words s'' - where (w, s'') = - break {-partain:Char.-}isSpace s' - -unlines :: [String] -> String -#ifdef USE_REPORT_PRELUDE -unlines = concatMap (++ "\n") -#else --- HBC version (stolen) --- here's a more efficient version -unlines [] = [] -unlines (l:ls) = l ++ '\n' : unlines ls -#endif - -unwords :: [String] -> String -#ifdef USE_REPORT_PRELUDE -unwords [] = "" -unwords ws = foldr1 (\w s -> w ++ ' ':s) ws -#else --- HBC version (stolen) --- here's a more efficient version -unwords [] = "" -unwords [w] = w -unwords (w:ws) = w ++ ' ' : unwords ws -#endif - -\end{code} diff --git a/Text/ParserCombinators/ReadP.hs b/Text/ParserCombinators/ReadP.hs index 8a9cc80..ab84419 100644 --- a/Text/ParserCombinators/ReadP.hs +++ b/Text/ParserCombinators/ReadP.hs @@ -51,7 +51,7 @@ module Text.ParserCombinators.ReadP import Control.Monad( MonadPlus(..) ) #ifdef __GLASGOW_HASKELL__ -import GHC.Show( isSpace ) +import {-# SOURCE #-} GHC.Unicode ( isSpace ) import GHC.Base #else import Data.Char( isSpace ) diff --git a/Text/Read/Lex.hs b/Text/Read/Lex.hs index 9be4220..ff63aa1 100644 --- a/Text/Read/Lex.hs +++ b/Text/Read/Lex.hs @@ -34,7 +34,8 @@ import Text.ParserCombinators.ReadP #ifdef __GLASGOW_HASKELL__ import GHC.Base import GHC.Num( Num(..), Integer ) -import GHC.Show( Show(.. ), isSpace, isAlpha, isAlphaNum ) +import GHC.Show( Show(..) ) +import {-# SOURCE #-} GHC.Unicode ( isSpace, isAlpha, isAlphaNum ) import GHC.Real( Ratio(..), Integral, Rational, (%), fromIntegral, toInteger, (^), (^^), infinity, notANumber ) import GHC.List diff --git a/include/HsBase.h b/include/HsBase.h index d124107..51f2e60 100644 --- a/include/HsBase.h +++ b/include/HsBase.h @@ -1,5 +1,5 @@ /* ----------------------------------------------------------------------------- - * $Id: HsBase.h,v 1.22 2003/06/12 16:06:07 simonmar Exp $ + * $Id: HsBase.h,v 1.23 2003/08/19 16:39:14 simonmar Exp $ * * (c) The University of Glasgow 2001-2002 * @@ -78,6 +78,9 @@ #ifdef HAVE_LIMITS_H #include #endif +#ifdef HAVE_WCTYPE_H +#include +#endif #if !defined(mingw32_TARGET_OS) && !defined(irix_TARGET_OS) # if defined(HAVE_SYS_RESOURCE_H) -- 1.7.10.4