2 This is a script to generate the necessary tables to support Windows code page
5 License: see libraries/base/LICENSE
7 The code page tables are available from :
8 http://www.unicode.org/Public/MAPPINGS/
10 To run this script, use e.g.
11 runghc MakeTable.hs <module-name> <output-file> <codepage-dir>/*.TXT
13 Currently, this script only supports single-byte encodings, since the lookup
14 tables required for the CJK double-byte codepages are too large to be
15 statically linked into every executable. We plan to add support for them once
16 GHC is able to produce Windows DLLs.
22 import System.FilePath
23 import qualified Data.Map as Map
25 import Data.Maybe (mapMaybe)
26 import Data.List (intersperse)
29 import Control.Monad.State
30 import System.Environment
31 import Control.Exception(evaluate)
35 moduleName:outFile:files <- getArgs
36 sbes <- mapM readMapAndIx files
37 withBinaryFile outFile WriteMode $ flip hPutStr
38 $ unlines $ makeTableFile moduleName files sbes
42 return (codePageNum f, m)
44 -- filenames are assumed to be of the form "CP1250.TXT"
45 codePageNum :: FilePath -> Int
46 codePageNum = read . drop 2 . takeBaseName
48 readMap :: (Ord a, Enum a) => FilePath -> IO (Map.Map a Char)
49 readMap f = withBinaryFile f ReadMode $ \h -> do
50 contents <- hGetContents h
51 let ms = Map.fromList $ mapMaybe parseLine $ lines contents
52 evaluate $ Map.size ms
55 parseLine :: Enum a => String -> Maybe (a,Char)
56 parseLine s = case words s of
58 bs:"#DBCS":_ -> Just (readHex' bs, toEnum 0xDC00)
59 bs:"#UNDEFINED":_ -> Just (readHex' bs, toEnum 0)
60 bs:cs:('#':_):_ -> Just (readHex' bs, readCharHex cs)
63 readHex' :: Enum a => String -> a
64 readHex' ('0':'x':s) = case readHex s of
65 [(n,"")] -> toEnum n -- explicitly call toEnum to catch overflow errors.
66 _ -> error $ "Can't read hex: " ++ show s
67 readHex' s = error $ "Can't read hex: " ++ show s
69 readCharHex :: String -> Char
70 readCharHex s = if c > fromEnum (maxBound :: Word16)
71 then error "Can't handle non-BMP character."
76 -------------------------------------------
77 -- Writing out the main data values.
79 makeTableFile :: String -> [FilePath] -> [(Int,Map.Map Word8 Char)] -> [String]
80 makeTableFile moduleName files maps = concat
81 [ languageDirectives, firstComment files, header,
82 theImports, theTypes, blockSizeText, tablePart]
84 header = [ "module " ++ moduleName ++ " where"
87 tablePart = [ "codePageMap :: [(Word32, CodePageArrays)]"
89 ] ++ (intersperse "\n ," $ map mkTableEntry maps)
91 mkTableEntry (i,m) = " (" ++ show i ++ ", " ++ makeSBE m ++ " )"
92 blockSizeText = ["blockBitSize :: Int", "blockBitSize = " ++ show blockBitSize]
95 makeSBE :: Map.Map Word8 Char -> String
98 , " decoderArray = " ++ mkConvArray es
99 , " , encoderArray = " ++ mkCompactArray (swapMap m)
103 es = [Map.findWithDefault '\0' x m | x <- [minBound..maxBound]]
105 swapMap :: (Ord a, Ord b, Enum a, Enum b) => Map.Map a b -> Map.Map b a
106 swapMap = Map.insert (toEnum 0) (toEnum 0) . Map.fromList . map swap . Map.toList
111 mkConvArray :: Embed a => [a] -> String
112 mkConvArray xs = "ConvArray \"" ++ concatMap mkHex xs ++ "\"#"
115 -------------------------------------------
118 -- The decoding map (from Word8 to Char) can be implemented with a simple array
119 -- of 256 Word16's. Bytes which do not belong to the code page are mapped to
122 -- However, a naive table mapping Char to Word8 would require 2^16 Word8's. We
123 -- can use much less space with the right data structure, since at most 256 of
124 -- those entries are nonzero.
126 -- We use "compact arrays", as described in "Unicode Demystified" by Richard
129 -- Fix a block size S which is a power of two. We compress an array of N
130 -- entries (where N>>S) as follows. First, split the array into blocks of size
131 -- S, then remove all repeate blocks to form the "value" array. Then construct
132 -- a separate "index" array which maps the position of blocks in the old array
133 -- to a position in the value array.
135 -- For example, assume that S=32 we have six blocks ABABCA, each with 32
138 -- Then the compressed table consists of two arrays:
139 -- 1) An array "values", concatenating the unique blocks ABC
140 -- 2) An array "indices" which equals [0,1,0,1,2,0].
142 -- To look up '\100', first calculate divMod 100 32 = (3,4). Since
143 -- indices[3]=1, we look at the second unique block B; thus the encoded byte is
146 -- The upshot of this representation is that the lookup is very quick as it only
147 -- requires two array accesses plus some bit masking/shifting.
149 -- From testing, this is an optimal size.
153 mkCompactArray :: (Embed a, Embed b) => Map.Map a b -> String
154 mkCompactArray m = unlines [
157 , " encoderIndices = " ++ mkConvArray is'
158 , " , encoderValues = "
159 ++ mkConvArray (concat $ Map.elems vs)
160 , " , encoderMax = " ++ show (fst $ Map.findMax m)
164 blockSize = 2 ^ blockBitSize
165 (is,(vs,_)) = compress blockSize $ m
166 is' = map (* blockSize) is
168 type CompressState b = (Map.Map Int [b], Map.Map [b] Int)
169 -- each entry in the list corresponds to a block of size n.
170 compress :: (Bounded a, Enum a, Ord a, Bounded b, Ord b) => Int -> Map.Map a b
171 -> ([Int], CompressState b)
172 compress n ms = runState (mapM lookupOrAdd chunks) (Map.empty, Map.empty)
174 chunks = mkChunks $ map (\i -> Map.findWithDefault minBound i ms)
175 $ [minBound..fst (Map.findMax ms)]
177 mkChunks xs = take n xs : mkChunks (drop n xs)
180 case Map.lookup xs rm of
183 let i = if Map.null m
185 else 1 + fst (Map.findMax m)
186 put (Map.insert i xs m, Map.insert xs i rm)
189 -------------------------------------------
190 -- Static parts of the generated module.
192 languageDirectives :: [String]
193 languageDirectives = ["{-# LANGUAGE MagicHash #-}"]
196 firstComment :: [FilePath] -> [String]
197 firstComment files = map ("-- " ++) $
198 [ "Do not edit this file directly!"
199 , "It was generated by the MakeTable.hs script using the following files:"
200 ] ++ map takeFileName files
202 theImports :: [String]
203 theImports = map ("import " ++ )
204 ["GHC.Prim", "GHC.Base", "GHC.Word", "GHC.Num"]
207 theTypes = [ "data ConvArray a = ConvArray Addr#"
208 , "data CompactArray a b = CompactArray {"
209 , " encoderMax :: !a,"
210 , " encoderIndices :: !(ConvArray Int),"
211 , " encoderValues :: !(ConvArray b)"
214 , "data CodePageArrays = SingleByteCP {"
215 , " decoderArray :: !(ConvArray Char),"
216 , " encoderArray :: !(CompactArray Char Word8)"
221 -------------------------------------------
222 -- Embed class and associated functions
224 class (Ord a, Enum a, Bounded a, Show a) => Embed a where
227 instance Embed Word8 where
230 instance Embed Word16 where
233 instance Embed Char where
236 -- this is used for the indices of the compressed array.
237 instance Embed Int where
240 showHex' :: Integral a => a -> String
241 showHex' s = "\\x" ++ showHex s ""
243 repDualByte :: Enum c => c -> String
245 | n >= 2^(16::Int) = error "value is too high!"
246 -- NOTE : this assumes little-endian architecture. But we're only using this on Windows,
247 -- so it's probably OK.
248 | otherwise = showHex' (n `mod` 256) ++ showHex' (n `div` 256)