2 This is a script to generate the necessary tables to support Windows code page
5 License: see libraries/base/LICENSE
7 The code page tables are available from:
8 http://www.unicode.org/Public/MAPPINGS/
10 To run this script, use e.g.
11 runghc MakeTable.hs <module-name> <output-file> <codepage-dir>/*.TXT
13 Currently, this script only supports single-byte encodings, since the lookup
14 tables required for the CJK double-byte codepages are too large to be
15 statically linked into every executable. We plan to add support for them once
16 GHC is able to produce Windows DLLs.
21 import System.FilePath
22 import qualified Data.Map as Map
24 import Data.Maybe (mapMaybe)
25 import Data.List (intersperse)
28 import Control.Monad.State
29 import System.Environment
30 import Control.Exception(evaluate)
-- Body of main: parse command-line args (<module-name> <output-file> <mapping files...>),
-- filter out known-bad mapping files, parse the rest, and write the generated module.
-- NOTE(review): this region is a truncated extract (embedded original line
-- numbers, gaps): the `main = do` header and the list of skipped files are
-- missing. Code lines kept byte-identical.
34 moduleName:outFile:files <- getArgs
35 let badFiles = -- These fail with an error like
36 -- MakeTable: Enum.toEnum{Word8}: tag (33088) is outside of bounds (0,255)
37 -- I have no idea what's going on, so for now we just
-- (the file names making up badFiles are missing from this extract)
43 let files' = filter (`notElem` badFiles) files
-- One (codepage number, byte->char map) pair per surviving input file.
44 sbes <- mapM readMapAndIx files'
45 putStrLn "Writing output"
-- Binary mode so the generated file has deterministic line endings.
46 withBinaryFile outFile WriteMode $ flip hPutStr
47 $ unlines $ makeTableFile moduleName files' sbes
-- Fragment of readMapAndIx: reads one mapping file and pairs the parsed
-- table with the numeric code page derived from its filename.
-- NOTE(review): the signature and the binding of `m` (presumably
-- `m <- readMap f` -- TODO confirm) are missing from this extract.
-- Code lines kept byte-identical.
50 putStrLn ("Reading " ++ f)
52 return (codePageNum f, m)
-- | Extract the numeric code page from a mapping file's name.
-- Filenames are assumed to be of the form "CP1250.TXT", so we take the
-- base name, strip the two-character "CP" prefix, and read the digits.
codePageNum :: FilePath -> Int
codePageNum path = read (drop 2 (takeBaseName path))
-- | Read a unicode.org mapping file into a table from byte values to
-- characters.  Lines that do not describe a mapping are skipped (see
-- 'parseLine').
--
-- The handle is consumed with lazy I/O, so the map is forced (via its
-- size) before withBinaryFile closes the handle.
--
-- Fix: the visible body was truncated and never returned the parsed
-- map; restore the final @return ms@ required by the result type, and
-- bind the discarded 'evaluate' result explicitly.
readMap :: (Ord a, Enum a) => FilePath -> IO (Map.Map a Char)
readMap f = withBinaryFile f ReadMode $ \h -> do
    contents <- hGetContents h
    let ms = Map.fromList $ mapMaybe parseLine $ lines contents
    _ <- evaluate $ Map.size ms  -- force the whole map while h is open
    return ms
-- | Parse one line of a unicode.org mapping file.
--
-- Recognized forms (whitespace-separated fields):
--
--   * @0xNN 0xMMMM #comment@ -- byte maps to the given Unicode char
--   * @0xNN #DBCS ...@       -- lead byte of a double-byte sequence,
--                               marked with the sentinel value 0xDC00
--   * @0xNN #UNDEFINED ...@  -- byte unused in this code page; maps to NUL
--
-- Fix: the visible case expression had no catch-all even though the
-- result is 'Maybe' and the caller uses 'mapMaybe'; comment and blank
-- lines in the mapping files must yield 'Nothing' instead of crashing
-- on a non-exhaustive pattern.
parseLine :: Enum a => String -> Maybe (a,Char)
parseLine s = case words s of
    bs:"#DBCS":_      -> Just (readHex' bs, toEnum 0xDC00)
    bs:"#UNDEFINED":_ -> Just (readHex' bs, toEnum 0)
    bs:cs:('#':_):_   -> Just (readHex' bs, readCharHex cs)
    _                 -> Nothing
-- | Parse a string of the form "0xNN" into any Enum type.
-- The conversion goes through 'toEnum' deliberately, so that values
-- outside the target type's range raise an error instead of wrapping.
readHex' :: Enum a => String -> a
readHex' ('0':'x':digits) =
    case readHex digits of
        [(value, "")] -> toEnum value -- explicitly call toEnum to catch overflow errors.
        _             -> error ("Can't read hex: " ++ show digits)
readHex' other = error ("Can't read hex: " ++ show other)
-- | Parse a hex string into a Char, rejecting anything outside the
-- Word16 range (the generated tables store 16-bit values, so non-BMP
-- characters cannot be represented).
-- NOTE(review): the `else` branch and the binding of `c` (presumably
-- `c = readHex' s` -- TODO confirm) are missing from this extract.
-- Code lines kept byte-identical.
79 readCharHex :: String -> Char
80 readCharHex s = if c > fromEnum (maxBound :: Word16)
81 then error "Can't handle non-BMP character."
86 -------------------------------------------
87 -- Writing out the main data values.
-- | Assemble the full text of the generated module: pragmas, the
-- provenance comment, module header, imports, type definitions, the
-- block-size constant, and finally the codePageMap table itself.
-- NOTE(review): several interior lines (the `where`, the rest of
-- `header` and the start of `tablePart`) are missing from this
-- extract. Code lines kept byte-identical.
89 makeTableFile :: String -> [FilePath] -> [(Int,Map.Map Word8 Char)] -> [String]
90 makeTableFile moduleName files maps = concat
91 [ languageDirectives, firstComment files, header,
92 theImports, theTypes, blockSizeText, tablePart]
94 header = [ "module " ++ moduleName ++ " where"
97 tablePart = [ "codePageMap :: [(Word32, CodePageArrays)]"
-- One table entry per code page, comma-separated across lines.
99 ] ++ (intersperse "\n ," $ map mkTableEntry maps)
101 mkTableEntry (i,m) = " (" ++ show i ++ ", " ++ makeSBE m ++ " )"
-- Emits the blockBitSize constant the generated decoder needs.
102 blockSizeText = ["blockBitSize :: Int", "blockBitSize = " ++ show blockBitSize]
-- | Render one single-byte code page as a SingleByteCP record source
-- text: a 256-entry decoder array plus a compact encoder array built
-- from the inverted map.
-- NOTE(review): the opening of the record literal and its closing
-- lines are missing from this extract. Code lines kept byte-identical.
105 makeSBE :: Map.Map Word8 Char -> String
108 , " decoderArray = " ++ mkConvArray es
109 , " , encoderArray = " ++ mkCompactArray (swapMap m)
-- All 256 byte values; unmapped bytes decode to '\0'.
113 es = [Map.findWithDefault '\0' x m | x <- [minBound..maxBound]]
-- | Invert a code-page map, turning each (key, value) pair into
-- (value, key).  The zero element is then forced to map back to zero,
-- so the encoder always has an entry for the null character.
swapMap :: (Ord a, Ord b, Enum a, Enum b) => Map.Map a b -> Map.Map b a
swapMap m = Map.insert (toEnum 0) (toEnum 0) inverted
  where
    inverted = Map.fromList [(v, k) | (k, v) <- Map.toList m]
-- | Render a list of values as a ConvArray source expression: the
-- elements are hex-escaped into an Addr# string literal ("..."#).
mkConvArray :: Embed a => [a] -> String
mkConvArray elems = "ConvArray \"" ++ payload ++ "\"#"
  where
    -- hex-escaped bytes of every element, concatenated in order
    payload = foldr (\e acc -> mkHex e ++ acc) "" elems
125 -------------------------------------------
128 -- The decoding map (from Word8 to Char) can be implemented with a simple array
129 -- of 256 Word16's. Bytes which do not belong to the code page are mapped to
132 -- However, a naive table mapping Char to Word8 would require 2^16 Word8's. We
133 -- can use much less space with the right data structure, since at most 256 of
134 -- those entries are nonzero.
136 -- We use "compact arrays", as described in "Unicode Demystified" by Richard
139 -- Fix a block size S which is a power of two. We compress an array of N
140 -- entries (where N>>S) as follows. First, split the array into blocks of size
141 S, then remove all repeated blocks to form the "value" array. Then construct
142 -- a separate "index" array which maps the position of blocks in the old array
143 -- to a position in the value array.
145 -- For example, assume that S=32 and we have six blocks ABABCA, each with 32
148 -- Then the compressed table consists of two arrays:
149 -- 1) An array "values", concatenating the unique blocks ABC
150 -- 2) An array "indices" which equals [0,1,0,1,2,0].
152 -- To look up '\100', first calculate divMod 100 32 = (3,4). Since
153 -- indices[3]=1, we look at the second unique block B; thus the encoded byte is
156 -- The upshot of this representation is that the lookup is very quick as it only
157 -- requires two array accesses plus some bit masking/shifting.
159 -- From testing, this is an optimal size.
-- | Render a map as a CompactArray source expression, using the
-- two-level compression scheme described above: an index array of
-- block offsets plus a values array of the unique blocks.
-- NOTE(review): the opening of the record literal, its closing lines
-- and the `where` keyword are missing from this extract. Code lines
-- kept byte-identical.
163 mkCompactArray :: (Embed a, Embed b) => Map.Map a b -> String
164 mkCompactArray m = unlines [
167 , " encoderIndices = " ++ mkConvArray is'
168 , " , encoderValues = "
-- unique blocks, concatenated in index order
169 ++ mkConvArray (concat $ Map.elems vs)
170 , " , encoderMax = " ++ show (fst $ Map.findMax m)
-- blockBitSize is defined elsewhere in this file.
174 blockSize = 2 ^ blockBitSize
175 (is,(vs,_)) = compress blockSize $ m
-- pre-multiply indices so the lookup needs no runtime multiply
176 is' = map (* blockSize) is
-- State threaded through compression: a forward map from block index
-- to block contents, and a reverse map from block contents to index
-- (used to detect repeated blocks).
178 type CompressState b = (Map.Map Int [b], Map.Map [b] Int)
179 -- each entry in the list corresponds to a block of size n.
-- | Split the (dense, minBound-padded) table into blocks of size n and
-- deduplicate them, returning one index per block plus the final state.
-- NOTE(review): the `where` keyword, the lookupOrAdd header and parts
-- of its body are missing from this extract. Code lines kept
-- byte-identical.
180 compress :: (Bounded a, Enum a, Ord a, Bounded b, Ord b) => Int -> Map.Map a b
181 -> ([Int], CompressState b)
182 compress n ms = runState (mapM lookupOrAdd chunks) (Map.empty, Map.empty)
-- Densify the map: absent keys default to minBound.
184 chunks = mkChunks $ map (\i -> Map.findWithDefault minBound i ms)
185 $ [minBound..fst (Map.findMax ms)]
187 mkChunks xs = take n xs : mkChunks (drop n xs)
-- Reuse the index of an already-seen block, otherwise allocate the
-- next free index and record the block in both maps.
190 case Map.lookup xs rm of
193 let i = if Map.null m
195 else 1 + fst (Map.findMax m)
196 put (Map.insert i xs m, Map.insert xs i rm)
199 -------------------------------------------
200 -- Static parts of the generated module.
-- | LANGUAGE pragmas emitted at the top of the generated module.
languageDirectives :: [String]
languageDirectives = [magicHash]
  where
    -- MagicHash is needed for the Addr#/"..."# syntax the generated
    -- tables use (see mkConvArray and theTypes).
    magicHash = "{-# LANGUAGE MagicHash #-}"
-- | Provenance header for the generated file: a do-not-edit warning
-- followed by the list of mapping files it was generated from.
-- NOTE(review): two interior lines of the comment list (original lines
-- 211-212) are missing from this extract. Code lines kept
-- byte-identical.
206 firstComment :: [FilePath] -> [String]
207 firstComment files = map ("-- " ++) $
208 [ "Do not edit this file directly!"
209 , "It was generated by the MakeTable.hs script using the files below."
210 , "To regenerate it, run \"make\" in ../../../../codepages/"
213 ] ++ map takeFileName files
-- | Import lines emitted into the generated module.
theImports :: [String]
theImports =
    [ "import " ++ moduleToImport
    | moduleToImport <- ["GHC.Prim", "GHC.Base", "GHC.Word"]
    ]
-- Source text of the data types emitted into the generated module:
-- ConvArray (a table behind an Addr# literal), CompactArray (the
-- two-level encoder: max key, index array, values array) and
-- CodePageArrays (decoder + encoder for a single-byte code page).
-- NOTE(review): the type signature line and the record/list closing
-- lines are missing from this extract. Code lines kept byte-identical.
220 theTypes = [ "data ConvArray a = ConvArray Addr#"
221 , "data CompactArray a b = CompactArray {"
222 , " encoderMax :: !a,"
223 , " encoderIndices :: !(ConvArray Int),"
224 , " encoderValues :: !(ConvArray b)"
227 , "data CodePageArrays = SingleByteCP {"
228 , " decoderArray :: !(ConvArray Char),"
229 , " encoderArray :: !(CompactArray Char Word8)"
234 -------------------------------------------
235 -- Embed class and associated functions
-- | Class of value types that can be embedded as escaped bytes inside
-- the Addr# string literals of the generated tables (used by
-- mkConvArray).
-- NOTE(review): the class method declarations (e.g. mkHex, used by
-- mkConvArray) and all four instance bodies are missing from this
-- extract. Code lines kept byte-identical.
237 class (Ord a, Enum a, Bounded a, Show a) => Embed a where
240 instance Embed Word8 where
243 instance Embed Word16 where
246 instance Embed Char where
249 -- this is used for the indices of the compressed array.
250 instance Embed Int where
-- | Render a number as a "\xNN" escape for use inside a generated
-- string literal (the backslash itself is escaped in the source text).
showHex' :: Integral a => a -> String
showHex' value = '\\' : 'x' : showHex value ""
-- | Encode a value as two escaped bytes, low byte first
-- (little-endian), for embedding in a generated string literal.
-- NOTE(review): the equation head binding the guards' `n` (presumably
-- `repDualByte c ... where n = fromEnum c` -- TODO confirm) is missing
-- from this extract, and any trailing where-clause lies beyond the
-- visible region. Code lines kept byte-identical.
256 repDualByte :: Enum c => c -> String
258 | n >= 2^(16::Int) = error "value is too high!"
259 -- NOTE : this assumes little-endian architecture. But we're only using this on Windows,
260 -- so it's probably OK.
261 | otherwise = showHex' (n `mod` 256) ++ showHex' (n `div` 256)