codepages/MakeTable.hs

   1 {--
   2 This is a script to generate the necessary tables to support Windows code page
   3 encoding/decoding.
   4
   5 License: see libraries/base/LICENSE
   6
   7 The code page tables are available from :
   8 http://www.unicode.org/Public/MAPPINGS/
   9
  10 To run this script, use e.g.
  11 runghc MakeTable.hs <module-name> <output-file> <codepage-dir>/*.TXT
  12
  13 Currently, this script only supports single-byte encodings, since the lookup
  14 tables required for the CJK double-byte codepages are too large to be
  15 statically linked into every executable.  We plan to add support for them once
  16 GHC is able to produce Windows DLLs.
  17 --}
  18
  19 module Main where
  20
  21 import System.FilePath
  22 import qualified Data.Map as Map
  23 import System.IO
  24 import Data.Maybe (mapMaybe)
  25 import Data.List (intersperse)
  26 import Data.Word
  27 import Numeric
  28 import Control.Monad.State
  29 import System.Environment
  30 import Control.Exception(evaluate)
  31
  32 main :: IO ()
  33 main = do
  34     moduleName:outFile:files <- getArgs
  35     let badFiles = -- These fail with an error like
  36                    --     MakeTable: Enum.toEnum{Word8}: tag (33088) is outside of bounds (0,255)
  37                    -- I have no idea what's going on, so for now we just
  38                    -- skip them.
  39                    ["CPs/CP932.TXT",
  40                     "CPs/CP936.TXT",
  41                     "CPs/CP949.TXT",
  42                     "CPs/CP950.TXT"]
  43     let files' = filter (`notElem` badFiles) files
  44     sbes <- mapM readMapAndIx files'
  45     putStrLn "Writing output"
  46     withBinaryFile outFile WriteMode $ flip hPutStr
  47         $ unlines $ makeTableFile moduleName files' sbes
  48   where
  49     readMapAndIx f = do
  50         putStrLn ("Reading " ++ f)
  51         m <- readMap f
  52         return (codePageNum f, m)
  53
  54 -- filenames are assumed to be of the form "CP1250.TXT"
  55 codePageNum :: FilePath -> Int
  56 codePageNum = read . drop 2 . takeBaseName
  57
  58 readMap :: (Ord a, Enum a) => FilePath -> IO (Map.Map a Char)
  59 readMap f  = withBinaryFile f ReadMode $ \h -> do
  60     contents <- hGetContents h
  61     let ms = Map.fromList $ mapMaybe parseLine $ lines contents
  62     evaluate $ Map.size ms
  63     return ms
  64
  65 parseLine :: Enum a => String -> Maybe (a,Char)
  66 parseLine s = case words s of
  67     ('#':_):_           -> Nothing
  68     bs:"#DBCS":_        -> Just (readHex' bs, toEnum 0xDC00)
  69     bs:"#UNDEFINED":_   -> Just (readHex' bs, toEnum 0)
  70     bs:cs:('#':_):_     -> Just (readHex' bs, readCharHex cs)
  71     _                   -> Nothing
  72
  73 readHex' :: Enum a => String -> a
  74 readHex' ('0':'x':s) = case readHex s of
  75     [(n,"")] -> toEnum n -- explicitly call toEnum to catch overflow errors.
  76     _ -> error $ "Can't read hex: " ++ show s
  77 readHex' s = error $ "Can't read hex: " ++ show s
  78
  79 readCharHex :: String -> Char
  80 readCharHex s = if c > fromEnum (maxBound :: Word16)
  81                     then error "Can't handle non-BMP character."
  82                     else toEnum c
  83     where c = readHex' s
  84
  85
  86 -------------------------------------------
  87 -- Writing out the main data values.
  88
  89 makeTableFile :: String -> [FilePath] -> [(Int,Map.Map Word8 Char)] -> [String]
  90 makeTableFile moduleName files maps = concat
  91     [ languageDirectives, firstComment files, header,
  92         theImports, theTypes, blockSizeText, tablePart]
  93   where
  94     header = [ "module " ++ moduleName ++ " where"
  95              , ""
  96              ]
  97     tablePart = [ "codePageMap :: [(Word32, CodePageArrays)]"
  98                 , "codePageMap = ["
  99                 ] ++ (intersperse "\n    ," $ map mkTableEntry maps)
 100                 ++ ["    ]"]
 101     mkTableEntry (i,m) = "    (" ++ show i ++ ", " ++ makeSBE m ++ "    )"
 102     blockSizeText = ["blockBitSize :: Int", "blockBitSize = " ++ show blockBitSize]
 103
 104
 105 makeSBE :: Map.Map Word8 Char -> String
 106 makeSBE m = unlines
 107                 [ "SingleByteCP {"
 108                 , "     decoderArray = " ++ mkConvArray es
 109                 , "     , encoderArray = " ++ mkCompactArray (swapMap m)
 110                 , "   }"
 111                 ]
 112   where
 113     es = [Map.findWithDefault '\0' x m | x <- [minBound..maxBound]]
 114
 115 swapMap :: (Ord a, Ord b, Enum a, Enum b) => Map.Map a b -> Map.Map b a
 116 swapMap = Map.insert (toEnum 0) (toEnum 0) . Map.fromList . map swap . Map.toList
 117   where
 118     swap (x,y) = (y,x)
 119
 120
 121 mkConvArray :: Embed a => [a] -> String
 122 mkConvArray xs = "ConvArray \"" ++ concatMap mkHex xs ++ "\"#"
 123
 124
 125 -------------------------------------------
 126 -- Compact arrays
 127 --
 128 -- The decoding map (from Word8 to Char) can be implemented with a simple array
 129 -- of 256 Word16's.  Bytes which do not belong to the code page are mapped to
 130 -- '\0'.
 131 --
 132 -- However, a naive table mapping Char to Word8 would require 2^16 Word8's.  We
 133 -- can use much less space with the right data structure, since at most 256 of
 134 -- those entries are nonzero.
 135 --
 136 -- We use "compact arrays", as described in "Unicode Demystified" by Richard
 137 -- Gillam.
 138 --
 139 -- Fix a block size S which is a power of two.  We compress an array of N
 140 -- entries (where N>>S) as follows.  First, split the array into blocks of size
 141 -- S, then remove all repeate blocks to form the "value" array.  Then construct
 142 -- a separate "index" array which maps the position of blocks in the old array
 143 -- to a position in the value array.
 144 --
 145 -- For example, assume that S=32 we have six blocks ABABCA, each with 32
 146 -- elements.
 147 --
 148 -- Then the compressed table consists of two arrays:
 149 -- 1) An array "values", concatenating the unique blocks ABC
 150 -- 2) An array "indices" which equals [0,1,0,1,2,0].
 151 --
 152 -- To look up '\100', first calculate divMod 100 32 = (3,4).  Since
 153 -- indices[3]=1, we look at the second unique block B; thus the encoded byte is
 154 -- B[4].
 155 --
 156 -- The upshot of this representation is that the lookup is very quick as it only
 157 -- requires two array accesses plus some bit masking/shifting.
 158
 159 -- From testing, this is an optimal size.
 160 blockBitSize :: Int
 161 blockBitSize = 6
 162
 163 mkCompactArray :: (Embed a, Embed b) => Map.Map a b -> String
 164 mkCompactArray m = unlines [
 165             ""
 166             , " CompactArray {"
 167             , "        encoderIndices = " ++ mkConvArray is'
 168             , "        , encoderValues = "
 169                     ++ mkConvArray (concat $ Map.elems vs)
 170             , "        , encoderMax = " ++ show (fst $ Map.findMax m)
 171             , "        }"
 172             ]
 173   where
 174     blockSize = 2 ^ blockBitSize
 175     (is,(vs,_)) = compress blockSize $ m
 176     is' = map (* blockSize) is
 177
 178 type CompressState b = (Map.Map Int [b], Map.Map [b] Int)
 179 -- each entry in the list corresponds to a block of size n.
 180 compress :: (Bounded a, Enum a, Ord a, Bounded b, Ord b) => Int -> Map.Map a b
 181         -> ([Int], CompressState b)
 182 compress n ms = runState (mapM lookupOrAdd chunks) (Map.empty, Map.empty)
 183     where
 184         chunks = mkChunks $ map (\i -> Map.findWithDefault minBound i ms)
 185                     $ [minBound..fst (Map.findMax ms)]
 186         mkChunks [] = []
 187         mkChunks xs = take n xs : mkChunks (drop n xs)
 188         lookupOrAdd xs = do
 189             (m,rm) <- get
 190             case Map.lookup xs rm of
 191                 Just i -> return i
 192                 Nothing -> do
 193                     let i = if Map.null m
 194                                 then 0
 195                                 else 1 + fst (Map.findMax m)
 196                     put (Map.insert i xs m, Map.insert xs i rm)
 197                     return i
 198
 199 -------------------------------------------
 200 -- Static parts of the generated module.
 201
 202 languageDirectives :: [String]
 203 languageDirectives = ["{-# LANGUAGE MagicHash #-}"]
 204
 205
 206 firstComment :: [FilePath] -> [String]
 207 firstComment files = map ("-- " ++) $
 208     [ "Do not edit this file directly!"
 209     , "It was generated by the MakeTable.hs script using the files below."
 210     , "To regenerate it, run \"make\" in ../../../../codepages/"
 211     , ""
 212     , "Files:"
 213     ] ++ map takeFileName files
 214
 215 theImports :: [String]
 216 theImports = map ("import " ++ )
 217     ["GHC.Prim", "GHC.Base", "GHC.Word"]
 218
 219 theTypes :: [String]
 220 theTypes = [ "data ConvArray a = ConvArray Addr#"
 221            , "data CompactArray a b = CompactArray {"
 222            , "    encoderMax :: !a,"
 223            , "    encoderIndices :: !(ConvArray Int),"
 224            , "    encoderValues :: !(ConvArray b)"
 225            , "  }"
 226            , ""
 227            , "data CodePageArrays = SingleByteCP {"
 228            , "    decoderArray :: !(ConvArray Char),"
 229            , "    encoderArray :: !(CompactArray Char Word8)"
 230            , "  }"
 231            , ""
 232            ]
 233
 234 -------------------------------------------
 235 -- Embed class and associated functions
 236
 237 class (Ord a, Enum a, Bounded a, Show a) => Embed a where
 238     mkHex :: a -> String
 239
 240 instance Embed Word8 where
 241     mkHex = showHex'
 242
 243 instance Embed Word16 where
 244     mkHex = repDualByte
 245
 246 instance Embed Char where
 247     mkHex = repDualByte
 248
 249 -- this is used for the indices of the compressed array.
 250 instance Embed Int where
 251     mkHex = repDualByte
 252
 253 showHex' :: Integral a => a -> String
 254 showHex' s = "\\x" ++ showHex s ""
 255
 256 repDualByte :: Enum c => c -> String
 257 repDualByte c
 258     | n >= 2^(16::Int) = error "value is too high!"
 259     -- NOTE : this assumes little-endian architecture.  But we're only using this on Windows,
 260     -- so it's probably OK.
 261     | otherwise = showHex' (n `mod` 256) ++ showHex' (n `div` 256)
 262   where
 263     n = fromEnum c
 264
 265