Use explicit language extensions & remove extension fields from base.cabal

[ghc-base.git] / GHC / IO / Encoding.hs
diff --git a/GHC/IO/Encoding.hs b/GHC/IO/Encoding.hs

index 78aad98..9f56966 100644 (file)
--- a/GHC/IO/Encoding.hs
+++ b/GHC/IO/Encoding.hs
@@ -1,4 +1,6 @@
-{-# OPTIONS_GHC -fno-implicit-prelude -funbox-strict-fields #-}
+{-# LANGUAGE CPP, NoImplicitPrelude #-}
+{-# OPTIONS_GHC -funbox-strict-fields #-}
+
  -----------------------------------------------------------------------------
  -- |
  -- Module      :  GHC.IO.Encoding
@@ -16,7 +18,7 @@
  module GHC.IO.Encoding (
    BufferCodec(..), TextEncoding(..), TextEncoder, TextDecoder,
    latin1, latin1_encode, latin1_decode,
-  utf8, 
+  utf8, utf8_bom,
    utf16, utf16le, utf16be,
    utf32, utf32le, utf32be, 
    localeEncoding,
@@ -30,6 +32,9 @@ import GHC.IO.Encoding.Types
  import GHC.Word
  #if !defined(mingw32_HOST_OS)
  import qualified GHC.IO.Encoding.Iconv  as Iconv
+#else
+import qualified GHC.IO.Encoding.CodePage as CodePage
+import Text.Read (reads)
  #endif
  import qualified GHC.IO.Encoding.Latin1 as Latin1
  import qualified GHC.IO.Encoding.UTF8   as UTF8
@@ -54,6 +59,17 @@ latin1 = Latin1.latin1_checked
  utf8  :: TextEncoding
  utf8 = UTF8.utf8
  
+-- | The UTF-8 Unicode encoding, with a byte-order-mark (BOM; the byte
+-- sequence 0xEF 0xBB 0xBF).  This encoding behaves like 'utf8',
+-- except that on input, the BOM sequence is ignored at the beginning
+-- of the stream, and on output, the BOM sequence is prepended.
+--
+-- The byte-order-mark is strictly unnecessary in UTF-8, but is
+-- sometimes used to identify the encoding of a file.
+--
+utf8_bom  :: TextEncoding
+utf8_bom = UTF8.utf8_bom
+
  -- | The UTF-16 Unicode encoding (a byte-order-mark should be used to
  -- indicate endianness).
  utf16  :: TextEncoding
@@ -85,14 +101,34 @@ localeEncoding  :: TextEncoding
  #if !defined(mingw32_HOST_OS)
  localeEncoding = Iconv.localeEncoding
  #else
-localeEncoding = Latin1.latin1
+localeEncoding = CodePage.localeEncoding
  #endif
  
  -- | Look up the named Unicode encoding.  May fail with 
  --
  --  * 'isDoesNotExistError' if the encoding is unknown
  --
--- The set of known encodings is system-dependent.
+-- The set of known encodings is system-dependent, but includes at least:
+--
+--  * @UTF-8@
+--
+--  * @UTF-16@, @UTF-16BE@, @UTF-16LE@
+--
+--  * @UTF-32@, @UTF-32BE@, @UTF-32LE@
+--
+-- On systems using GNU iconv (e.g. Linux), there is additional
+-- notation for specifying how illegal characters are handled:
+--
+--  * a suffix of @\/\/IGNORE@, e.g. @UTF-8\/\/IGNORE@, will cause 
+--    all illegal sequences on input to be ignored, and on output
+--    will drop all code points that have no representation in the
+--    target encoding.
+--
+--  * a suffix of @\/\/TRANSLIT@ will choose a replacement character
+--    for illegal sequences or unrepresentable code points.
+--
+-- On Windows, you can access supported code pages with the prefix
+-- @CP@; for example, @\"CP1250\"@.
  --
  mkTextEncoding :: String -> IO TextEncoding
  #if !defined(mingw32_HOST_OS)
@@ -105,6 +141,8 @@ mkTextEncoding "UTF-16BE" = return utf16be
  mkTextEncoding "UTF-32"   = return utf32
  mkTextEncoding "UTF-32LE" = return utf32le
  mkTextEncoding "UTF-32BE" = return utf32be
+mkTextEncoding ('C':'P':n)
+    | [(cp,"")] <- reads n = return $ CodePage.codePageEncoding cp
  mkTextEncoding e = ioException
       (IOError Nothing NoSuchThing "mkTextEncoding"
            ("unknown encoding:" ++ e)  Nothing Nothing)