From 8e59ba46e26979cc11fa71e3f67aebbe6da4e8d6 Mon Sep 17 00:00:00 2001 From: Simon Marlow Date: Wed, 1 Mar 2006 11:35:36 +0000 Subject: [PATCH] Add support for Data.Char.generalCategory to libcompat this is so that the stage1 compiler has proper support for Unicode. Should fix these errors: lexical error in string/character literal at character '\8759' when building the stage2 compiler. --- ghc/compiler/parser/Lexer.x | 10 ++++--- ghc/lib/compat/Compat/Unicode.hs | 57 ++++++++++++++++++++++++++++++++++++++ ghc/lib/compat/Makefile | 3 ++ ghc/lib/compat/cbits/unicode.c | 3 ++ 4 files changed, 69 insertions(+), 4 deletions(-) create mode 100644 ghc/lib/compat/Compat/Unicode.hs create mode 100644 ghc/lib/compat/cbits/unicode.c diff --git a/ghc/compiler/parser/Lexer.x b/ghc/compiler/parser/Lexer.x index 90fbf7a..6193c76 100644 --- a/ghc/compiler/parser/Lexer.x +++ b/ghc/compiler/parser/Lexer.x @@ -47,6 +47,12 @@ import DATA_BITS import Data.Char import Ratio --import TRACE + +#if __GLASGOW_HASKELL__ >= 605 +import Data.Char ( GeneralCategory(..), generalCategory ) +#else +import Compat.Unicode ( GeneralCategory(..), generalCategory ) +#endif } $unispace = \x05 @@ -1182,9 +1188,6 @@ alexGetChar (AI loc ofs s) other_graphic = '\x6' adj_c -#if __GLASGOW_HASKELL__ < 605 - = c -- no Unicode support -#else | c <= '\x06' = non_graphic | c <= '\xff' = c | otherwise = @@ -1213,7 +1216,6 @@ alexGetChar (AI loc ofs s) OtherSymbol -> symbol Space -> space _other -> non_graphic -#endif -- This version does not squash unicode characters, it is used when -- lexing strings. 
diff --git a/ghc/lib/compat/Compat/Unicode.hs b/ghc/lib/compat/Compat/Unicode.hs
new file mode 100644
index 0000000..4765511
--- /dev/null
+++ b/ghc/lib/compat/Compat/Unicode.hs
@@ -0,0 +1,57 @@
+{-# OPTIONS -cpp #-}
+module Compat.Unicode (
+    GeneralCategory(..), generalCategory,
+  ) where
+
+#if __GLASGOW_HASKELL__ > 604
+
+import Data.Char (GeneralCategory(..), generalCategory)
+
+#else
+
+import Foreign.C ( CInt )
+import Data.Char ( ord )
+
+-- | Unicode General Categories (column 2 of the UnicodeData table)
+-- in the order they are listed in the Unicode standard.
+-- generalCategory decodes an integer via the derived Enum instance, so keep this order.
+data GeneralCategory
+  = UppercaseLetter       -- ^ Lu: Letter, Uppercase
+  | LowercaseLetter       -- ^ Ll: Letter, Lowercase
+  | TitlecaseLetter       -- ^ Lt: Letter, Titlecase
+  | ModifierLetter        -- ^ Lm: Letter, Modifier
+  | OtherLetter           -- ^ Lo: Letter, Other
+  | NonSpacingMark        -- ^ Mn: Mark, Non-Spacing
+  | SpacingCombiningMark  -- ^ Mc: Mark, Spacing Combining
+  | EnclosingMark         -- ^ Me: Mark, Enclosing
+  | DecimalNumber         -- ^ Nd: Number, Decimal
+  | LetterNumber          -- ^ Nl: Number, Letter
+  | OtherNumber           -- ^ No: Number, Other
+  | ConnectorPunctuation  -- ^ Pc: Punctuation, Connector
+  | DashPunctuation       -- ^ Pd: Punctuation, Dash
+  | OpenPunctuation       -- ^ Ps: Punctuation, Open
+  | ClosePunctuation      -- ^ Pe: Punctuation, Close
+  | InitialQuote          -- ^ Pi: Punctuation, Initial quote
+  | FinalQuote            -- ^ Pf: Punctuation, Final quote
+  | OtherPunctuation      -- ^ Po: Punctuation, Other
+  | MathSymbol            -- ^ Sm: Symbol, Math
+  | CurrencySymbol        -- ^ Sc: Symbol, Currency
+  | ModifierSymbol        -- ^ Sk: Symbol, Modifier
+  | OtherSymbol           -- ^ So: Symbol, Other
+  | Space                 -- ^ Zs: Separator, Space
+  | LineSeparator         -- ^ Zl: Separator, Line
+  | ParagraphSeparator    -- ^ Zp: Separator, Paragraph
+  | Control               -- ^ Cc: Other, Control
+  | Format                -- ^ Cf: Other, Format
+  | Surrogate             -- ^ Cs: Other, Surrogate
+  | PrivateUse            -- ^ Co: Other, Private Use
+  | NotAssigned           -- ^ Cn: Other, Not Assigned
+  deriving (Eq, Ord, Enum, Read, Show, Bounded)
+
+-- | Retrieves the general Unicode category of the character.
+generalCategory :: Char -> GeneralCategory
+generalCategory c = toEnum (fromIntegral (wgencat (fromIntegral (ord c))))
+-- NOTE(review): assumes u_gencat returns a C int (see WCsubst.h), hence CInt and not Int.
+foreign import ccall unsafe "u_gencat"
+  wgencat :: CInt -> CInt
+#endif
diff --git a/ghc/lib/compat/Makefile b/ghc/lib/compat/Makefile
index 06c6103..ae2f4ed 100644
--- a/ghc/lib/compat/Makefile
+++ b/ghc/lib/compat/Makefile
@@ -68,6 +68,9 @@ Distribution/ParseUtils.$(way_)o : $(FPTOOLS_TOP)/libraries/Cabal/Distribution/
 Distribution/Compiler.$(way_)o : $(FPTOOLS_TOP)/libraries/Cabal/Distribution/Compiler.hs
 Distribution/Version.$(way_)o : $(FPTOOLS_TOP)/libraries/Cabal/Distribution/Version.hs
 Language/Haskell/Extension.$(way_)o : $(FPTOOLS_TOP)/libraries/Cabal/Language/Haskell/Extension.hs
+cbits/unicode.o : $(FPTOOLS_TOP)/libraries/base/cbits/WCsubst.c $(FPTOOLS_TOP)/libraries/base/include/WCsubst.h
+
+SRC_CC_OPTS += -I$(FPTOOLS_TOP)/libraries/base/cbits -I$(FPTOOLS_TOP)/libraries/base/include
 
 # Make the #includes in the stubs independent of the current location
 SRC_HC_OPTS += -I$(FPTOOLS_TOP)/libraries
diff --git a/ghc/lib/compat/cbits/unicode.c b/ghc/lib/compat/cbits/unicode.c
new file mode 100644
index 0000000..0e0d1c5
--- /dev/null
+++ b/ghc/lib/compat/cbits/unicode.c
@@ -0,0 +1,3 @@
+#if __GLASGOW_HASKELL__ < 604
+#include "WCsubst.c"
+#endif
-- 
1.7.10.4