codepages/MakeTable.hs

   1 {--
   2 This is a script to generate the necessary tables to support Windows code page
   3 encoding/decoding.
   4
   5 License: see libraries/base/LICENSE
   6
   7 The code page tables are available from :
   8 http://www.unicode.org/Public/MAPPINGS/
   9
  10 To run this script, use e.g.
  11 runghc MakeTable.hs <module-name> <output-file> <codepage-dir>/*.TXT
  12
  13 Currently, this script only supports single-byte encodings, since the lookup
  14 tables required for the CJK double-byte codepages are too large to be
  15 statically linked into every executable.  We plan to add support for them once
  16 GHC is able to produce Windows DLLs.
  17
  18 --}
  19
  20 module Main where
  21
  22 import System.FilePath
  23 import qualified Data.Map as Map
  24 import System.IO
  25 import Data.Maybe (mapMaybe)
  26 import Data.List (intersperse)
  27 import Data.Word
  28 import Numeric
  29 import Control.Monad.State
  30 import System.Environment
  31 import Control.Exception(evaluate)
  32
  33 main :: IO ()
  34 main = do
  35     moduleName:outFile:files <- getArgs
  36     sbes <- mapM readMapAndIx files
  37     withBinaryFile outFile WriteMode $ flip hPutStr
  38         $ unlines $ makeTableFile moduleName files sbes
  39   where
  40     readMapAndIx f = do
  41         m <- readMap f
  42         return (codePageNum f, m)
  43
  44 -- filenames are assumed to be of the form "CP1250.TXT"
  45 codePageNum :: FilePath -> Int
  46 codePageNum = read . drop 2 . takeBaseName
  47
  48 readMap :: (Ord a, Enum a) => FilePath -> IO (Map.Map a Char)
  49 readMap f  = withBinaryFile f ReadMode $ \h -> do
  50     contents <- hGetContents h
  51     let ms = Map.fromList $ mapMaybe parseLine $ lines contents
  52     evaluate $ Map.size ms
  53     return ms
  54
  55 parseLine :: Enum a => String -> Maybe (a,Char)
  56 parseLine s = case words s of
  57     ('#':_):_           -> Nothing
  58     bs:"#DBCS":_        -> Just (readHex' bs, toEnum 0xDC00)
  59     bs:"#UNDEFINED":_   -> Just (readHex' bs, toEnum 0)
  60     bs:cs:('#':_):_     -> Just (readHex' bs, readCharHex cs)
  61     _                   -> Nothing
  62
  63 readHex' :: Enum a => String -> a
  64 readHex' ('0':'x':s) = case readHex s of
  65     [(n,"")] -> toEnum n -- explicitly call toEnum to catch overflow errors.
  66     _ -> error $ "Can't read hex: " ++ show s
  67 readHex' s = error $ "Can't read hex: " ++ show s
  68
  69 readCharHex :: String -> Char
  70 readCharHex s = if c > fromEnum (maxBound :: Word16)
  71                     then error "Can't handle non-BMP character."
  72                     else toEnum c
  73     where c = readHex' s
  74
  75
  76 -------------------------------------------
  77 -- Writing out the main data values.
  78
  79 makeTableFile :: String -> [FilePath] -> [(Int,Map.Map Word8 Char)] -> [String]
  80 makeTableFile moduleName files maps = concat
  81     [ languageDirectives, firstComment files, header,
  82         theImports, theTypes, blockSizeText, tablePart]
  83   where
  84     header = [ "module " ++ moduleName ++ " where"
  85              , ""
  86              ]
  87     tablePart = [ "codePageMap :: [(Word32, CodePageArrays)]"
  88                 , "codePageMap = ["
  89                 ] ++ (intersperse "\n    ," $ map mkTableEntry maps)
  90                 ++ ["    ]"]
  91     mkTableEntry (i,m) = "    (" ++ show i ++ ", " ++ makeSBE m ++ "    )"
  92     blockSizeText = ["blockBitSize :: Int", "blockBitSize = " ++ show blockBitSize]
  93
  94
  95 makeSBE :: Map.Map Word8 Char -> String
  96 makeSBE m = unlines
  97                 [ "SingleByteCP {"
  98                 , "     decoderArray = " ++ mkConvArray es
  99                 , "     , encoderArray = " ++ mkCompactArray (swapMap m)
 100                 , "   }"
 101                 ]
 102   where
 103     es = [Map.findWithDefault '\0' x m | x <- [minBound..maxBound]]
 104
 105 swapMap :: (Ord a, Ord b, Enum a, Enum b) => Map.Map a b -> Map.Map b a
 106 swapMap = Map.insert (toEnum 0) (toEnum 0) . Map.fromList . map swap . Map.toList
 107   where
 108     swap (x,y) = (y,x)
 109
 110
 111 mkConvArray :: Embed a => [a] -> String
 112 mkConvArray xs = "ConvArray \"" ++ concatMap mkHex xs ++ "\"#"
 113
 114
 115 -------------------------------------------
 116 -- Compact arrays
 117 --
 118 -- The decoding map (from Word8 to Char) can be implemented with a simple array
 119 -- of 256 Word16's.  Bytes which do not belong to the code page are mapped to
 120 -- '\0'.
 121 --
 122 -- However, a naive table mapping Char to Word8 would require 2^16 Word8's.  We
 123 -- can use much less space with the right data structure, since at most 256 of
 124 -- those entries are nonzero.
 125 --
 126 -- We use "compact arrays", as described in "Unicode Demystified" by Richard
 127 -- Gillam.
 128 --
 129 -- Fix a block size S which is a power of two.  We compress an array of N
 130 -- entries (where N>>S) as follows.  First, split the array into blocks of size
 131 -- S, then remove all repeate blocks to form the "value" array.  Then construct
 132 -- a separate "index" array which maps the position of blocks in the old array
 133 -- to a position in the value array.
 134 --
 135 -- For example, assume that S=32 we have six blocks ABABCA, each with 32
 136 -- elements.
 137 --
 138 -- Then the compressed table consists of two arrays:
 139 -- 1) An array "values", concatenating the unique blocks ABC
 140 -- 2) An array "indices" which equals [0,1,0,1,2,0].
 141 --
 142 -- To look up '\100', first calculate divMod 100 32 = (3,4).  Since
 143 -- indices[3]=1, we look at the second unique block B; thus the encoded byte is
 144 -- B[4].
 145 --
 146 -- The upshot of this representation is that the lookup is very quick as it only
 147 -- requires two array accesses plus some bit masking/shifting.
 148
 149 -- From testing, this is an optimal size.
 150 blockBitSize :: Int
 151 blockBitSize = 6
 152
 153 mkCompactArray :: (Embed a, Embed b) => Map.Map a b -> String
 154 mkCompactArray m = unlines [
 155             ""
 156             , " CompactArray {"
 157             , "        encoderIndices = " ++ mkConvArray is'
 158             , "        , encoderValues = "
 159                     ++ mkConvArray (concat $ Map.elems vs)
 160             , "        , encoderMax = " ++ show (fst $ Map.findMax m)
 161             , "        }"
 162             ]
 163   where
 164     blockSize = 2 ^ blockBitSize
 165     (is,(vs,_)) = compress blockSize $ m
 166     is' = map (* blockSize) is
 167
 168 type CompressState b = (Map.Map Int [b], Map.Map [b] Int)
 169 -- each entry in the list corresponds to a block of size n.
 170 compress :: (Bounded a, Enum a, Ord a, Bounded b, Ord b) => Int -> Map.Map a b
 171         -> ([Int], CompressState b)
 172 compress n ms = runState (mapM lookupOrAdd chunks) (Map.empty, Map.empty)
 173     where
 174         chunks = mkChunks $ map (\i -> Map.findWithDefault minBound i ms)
 175                     $ [minBound..fst (Map.findMax ms)]
 176         mkChunks [] = []
 177         mkChunks xs = take n xs : mkChunks (drop n xs)
 178         lookupOrAdd xs = do
 179             (m,rm) <- get
 180             case Map.lookup xs rm of
 181                 Just i -> return i
 182                 Nothing -> do
 183                     let i = if Map.null m
 184                                 then 0
 185                                 else 1 + fst (Map.findMax m)
 186                     put (Map.insert i xs m, Map.insert xs i rm)
 187                     return i
 188
 189 -------------------------------------------
 190 -- Static parts of the generated module.
 191
 192 languageDirectives :: [String]
 193 languageDirectives = ["{-# LANGUAGE MagicHash #-}"]
 194
 195
 196 firstComment :: [FilePath] -> [String]
 197 firstComment files = map ("-- " ++) $
 198     [ "Do not edit this file directly!"
 199     , "It was generated by the MakeTable.hs script using the following files:"
 200     ] ++ map takeFileName files
 201
 202 theImports :: [String]
 203 theImports = map ("import " ++ )
 204     ["GHC.Prim", "GHC.Base", "GHC.Word", "GHC.Num"]
 205
 206 theTypes :: [String]
 207 theTypes = [ "data ConvArray a = ConvArray Addr#"
 208            , "data CompactArray a b = CompactArray {"
 209            , "    encoderMax :: !a,"
 210            , "    encoderIndices :: !(ConvArray Int),"
 211            , "    encoderValues :: !(ConvArray b)"
 212            , "  }"
 213            , ""
 214            , "data CodePageArrays = SingleByteCP {"
 215            , "    decoderArray :: !(ConvArray Char),"
 216            , "    encoderArray :: !(CompactArray Char Word8)"
 217            , "  }"
 218            , ""
 219            ]
 220
 221 -------------------------------------------
 222 -- Embed class and associated functions
 223
 224 class (Ord a, Enum a, Bounded a, Show a) => Embed a where
 225     mkHex :: a -> String
 226
 227 instance Embed Word8 where
 228     mkHex = showHex'
 229
 230 instance Embed Word16 where
 231     mkHex = repDualByte
 232
 233 instance Embed Char where
 234     mkHex = repDualByte
 235
 236 -- this is used for the indices of the compressed array.
 237 instance Embed Int where
 238     mkHex = repDualByte
 239
 240 showHex' :: Integral a => a -> String
 241 showHex' s = "\\x" ++ showHex s ""
 242
 243 repDualByte :: Enum c => c -> String
 244 repDualByte c
 245     | n >= 2^(16::Int) = error "value is too high!"
 246     -- NOTE : this assumes little-endian architecture.  But we're only using this on Windows,
 247     -- so it's probably OK.
 248     | otherwise = showHex' (n `mod` 256) ++ showHex' (n `div` 256)
 249   where
 250     n = fromEnum c
 251
 252