Add support for Data.Char.generalCategory to libcompat

author Simon Marlow <simonmar@microsoft.com>

Wed, 1 Mar 2006 11:35:36 +0000 (11:35 +0000)

committer Simon Marlow <simonmar@microsoft.com>

Wed, 1 Mar 2006 11:35:36 +0000 (11:35 +0000)
author Simon Marlow <simonmar@microsoft.com>
Wed, 1 Mar 2006 11:35:36 +0000 (11:35 +0000)
committer Simon Marlow <simonmar@microsoft.com>
Wed, 1 Mar 2006 11:35:36 +0000 (11:35 +0000)
diff --git a/ghc/compiler/parser/Lexer.x b/ghc/compiler/parser/Lexer.x

index 90fbf7a..6193c76 100644 (file)
--- a/ghc/compiler/parser/Lexer.x
+++ b/ghc/compiler/parser/Lexer.x
@@ -47,6 +47,12 @@ import DATA_BITS
  import Data.Char
  import Ratio
  --import TRACE
+
+#if __GLASGOW_HASKELL__ >= 605
+import Data.Char       ( GeneralCategory(..), generalCategory )
+#else
+import Compat.Unicode  ( GeneralCategory(..), generalCategory )
+#endif
  }
  
  $unispace    = \x05
@@ -1182,9 +1188,6 @@ alexGetChar (AI loc ofs s)
         other_graphic   = '\x6'
  
         adj_c 
-#if __GLASGOW_HASKELL__ < 605
-         = c  -- no Unicode support
-#else
           | c <= '\x06' = non_graphic
           | c <= '\xff' = c
           | otherwise = 
@@ -1213,7 +1216,6 @@ alexGetChar (AI loc ofs s)
                   OtherSymbol           -> symbol
                   Space                 -> space
                   _other                -> non_graphic
-#endif
  
  -- This version does not squash unicode characters, it is used when
  -- lexing strings.
diff --git a/ghc/lib/compat/Compat/Unicode.hs b/ghc/lib/compat/Compat/Unicode.hs

new file mode 100644 (file)

index 0000000..4765511
--- /dev/null
+++ b/ghc/lib/compat/Compat/Unicode.hs
@@ -0,0 +1,57 @@
+{-# OPTIONS -cpp #-}
+module Compat.Unicode (
+    GeneralCategory(..), generalCategory,
+  ) where
+
+#if __GLASGOW_HASKELL__ > 604
+
+import Data.Char (GeneralCategory(..), generalCategory)
+
+#else
+
+import Foreign.C       ( CInt )
+import Data.Char       ( ord )
+
+-- | Unicode General Categories (column 2 of the UnicodeData table)
+-- in the order they are listed in the Unicode standard.
+-- Constructor order is significant: generalCategory below builds values with toEnum.
+data GeneralCategory
+        = UppercaseLetter       -- ^ Lu: Letter, Uppercase
+        | LowercaseLetter       -- ^ Ll: Letter, Lowercase
+        | TitlecaseLetter       -- ^ Lt: Letter, Titlecase
+        | ModifierLetter        -- ^ Lm: Letter, Modifier
+        | OtherLetter           -- ^ Lo: Letter, Other
+        | NonSpacingMark        -- ^ Mn: Mark, Non-Spacing
+        | SpacingCombiningMark  -- ^ Mc: Mark, Spacing Combining
+        | EnclosingMark         -- ^ Me: Mark, Enclosing
+        | DecimalNumber         -- ^ Nd: Number, Decimal
+        | LetterNumber          -- ^ Nl: Number, Letter
+        | OtherNumber           -- ^ No: Number, Other
+        | ConnectorPunctuation  -- ^ Pc: Punctuation, Connector
+        | DashPunctuation       -- ^ Pd: Punctuation, Dash
+        | OpenPunctuation       -- ^ Ps: Punctuation, Open
+        | ClosePunctuation      -- ^ Pe: Punctuation, Close
+        | InitialQuote          -- ^ Pi: Punctuation, Initial quote
+        | FinalQuote            -- ^ Pf: Punctuation, Final quote
+        | OtherPunctuation      -- ^ Po: Punctuation, Other
+        | MathSymbol            -- ^ Sm: Symbol, Math
+        | CurrencySymbol        -- ^ Sc: Symbol, Currency
+        | ModifierSymbol        -- ^ Sk: Symbol, Modifier
+        | OtherSymbol           -- ^ So: Symbol, Other
+        | Space                 -- ^ Zs: Separator, Space
+        | LineSeparator         -- ^ Zl: Separator, Line
+        | ParagraphSeparator    -- ^ Zp: Separator, Paragraph
+        | Control               -- ^ Cc: Other, Control
+        | Format                -- ^ Cf: Other, Format
+        | Surrogate             -- ^ Cs: Other, Surrogate
+        | PrivateUse            -- ^ Co: Other, Private Use
+        | NotAssigned           -- ^ Cn: Other, Not Assigned
+        deriving (Eq, Ord, Enum, Read, Show, Bounded)
+
+-- | Retrieves the general Unicode category of the character, as reported by the C routine @u_gencat@.
+generalCategory :: Char -> GeneralCategory
+generalCategory c = toEnum (wgencat (fromIntegral (ord c)))  -- code point -> CInt -> C lookup -> constructor index via toEnum
+
+foreign import ccall unsafe "u_gencat"  -- C symbol behind generalCategory; presumably defined in WCsubst.c (TODO confirm)
+  wgencat :: CInt -> Int                -- imported as a pure function; NOTE(review): result is Int, not CInt (confirm C return type)
+#endif
diff --git a/ghc/lib/compat/Makefile b/ghc/lib/compat/Makefile

index 06c6103..ae2f4ed 100644 (file)
--- a/ghc/lib/compat/Makefile
+++ b/ghc/lib/compat/Makefile
@@ -68,6 +68,9 @@ Distribution/ParseUtils.$(way_)o :  $(FPTOOLS_TOP)/libraries/Cabal/Distribution/
  Distribution/Compiler.$(way_)o :  $(FPTOOLS_TOP)/libraries/Cabal/Distribution/Compiler.hs
  Distribution/Version.$(way_)o :  $(FPTOOLS_TOP)/libraries/Cabal/Distribution/Version.hs
  Language/Haskell/Extension.$(way_)o :  $(FPTOOLS_TOP)/libraries/Cabal/Language/Haskell/Extension.hs
+cbits/unicode.o : $(FPTOOLS_TOP)/libraries/base/cbits/WCsubst.c $(FPTOOLS_TOP)/libraries/base/include/WCsubst.h
+# cbits/unicode.c #includes WCsubst.c from base, so rebuild on changes there and put base's cbits/ and include/ on the C include path.
+SRC_CC_OPTS += -I$(FPTOOLS_TOP)/libraries/base/cbits -I$(FPTOOLS_TOP)/libraries/base/include
  
  # Make the #includes in the stubs independent of the current location
  SRC_HC_OPTS += -I$(FPTOOLS_TOP)/libraries
diff --git a/ghc/lib/compat/cbits/unicode.c b/ghc/lib/compat/cbits/unicode.c

new file mode 100644 (file)

index 0000000..0e0d1c5
--- /dev/null
+++ b/ghc/lib/compat/cbits/unicode.c
@@ -0,0 +1,3 @@
+#if __GLASGOW_HASKELL__ < 604  /* NOTE(review): Compat/Unicode.hs foreign-imports u_gencat for <= 604; confirm 604 gets the symbol elsewhere */
+#include "WCsubst.c"
+#endif /* __GLASGOW_HASKELL__ < 604 */
author	Simon Marlow <simonmar@microsoft.com>
	Wed, 1 Mar 2006 11:35:36 +0000 (11:35 +0000)
committer	Simon Marlow <simonmar@microsoft.com>
	Wed, 1 Mar 2006 11:35:36 +0000 (11:35 +0000)
ghc/compiler/parser/Lexer.x		patch \| blob \| history
ghc/lib/compat/Compat/Unicode.hs	[new file with mode: 0644]	patch \| blob
ghc/lib/compat/Makefile		patch \| blob \| history
ghc/lib/compat/cbits/unicode.c	[new file with mode: 0644]	patch \| blob