From 9d7da331989abcd1844e9d03b8d1e4163796fa85 Mon Sep 17 00:00:00 2001 From: simonmar Date: Fri, 6 Jan 2006 16:30:19 +0000 Subject: [PATCH] [project @ 2006-01-06 16:30:17 by simonmar] Add support for UTF-8 source files GHC finally has support for full Unicode in source files. Source files are now assumed to be UTF-8 encoded, and the full range of Unicode characters can be used, with classifications recognised using the implementation from Data.Char. This incedentally means that only the stage2 compiler will recognise Unicode in source files, because I was too lazy to port the unicode classifier code into libcompat. Additionally, the following synonyms for keywords are now recognised: forall symbol (U+2200) forall right arrow (U+2192) -> left arrow (U+2190) <- horizontal ellipsis (U+22EF) .. there are probably more things we could add here. This will break some source files if Latin-1 characters are being used. In most cases this should result in a UTF-8 decoding error. Later on if we want to support more encodings (perhaps with a pragma to specify the encoding), I plan to do it by recoding into UTF-8 before parsing. Internally, there were some pretty big changes: - FastStrings are now stored in UTF-8 - Z-encoding has been moved right to the back end. Previously we used to Z-encode every identifier on the way in for simplicity, and only decode when we needed to show something to the user. Instead, we now keep every string in its UTF-8 encoding, and Z-encode right before printing it out. To avoid Z-encoding the same string multiple times, the Z-encoding is cached inside the FastString the first time it is requested. This speeds up the compiler - I've measured some definite improvement in parsing at least, and I expect compilations overall to be faster too. It also cleans up a lot of cruft from the OccName interface. Z-encoding is nicely hidden inside the Outputable instance for Names & OccNames now. - StringBuffers are UTF-8 too, and are now represented as ForeignPtrs. - I've put together some test cases, not by any means exhaustive, but there are some interesting UTF-8 decoding error cases that aren't obvious. Also, take a look at unicode001.hs for a demo. --- ghc/compiler/HsVersions.h | 14 +- ghc/compiler/Makefile | 2 +- ghc/compiler/basicTypes/Id.lhs | 19 +- ghc/compiler/basicTypes/Literal.lhs | 9 +- ghc/compiler/basicTypes/MkId.lhs | 4 +- ghc/compiler/basicTypes/Module.lhs | 40 +- ghc/compiler/basicTypes/Name.lhs | 38 +- ghc/compiler/basicTypes/OccName.lhs | 493 +++++--------------- ghc/compiler/basicTypes/RdrName.lhs | 26 +- ghc/compiler/cmm/CLabel.hs | 4 +- ghc/compiler/cmm/Cmm.hs | 4 +- ghc/compiler/cmm/CmmLex.x | 4 +- ghc/compiler/cmm/CmmParse.y | 6 +- ghc/compiler/cmm/PprC.hs | 23 +- ghc/compiler/cmm/PprCmm.hs | 4 +- ghc/compiler/codeGen/CgProf.hs | 4 +- ghc/compiler/codeGen/CgUtils.hs | 13 +- ghc/compiler/codeGen/ClosureInfo.lhs | 6 +- ghc/compiler/deSugar/Check.lhs | 5 +- ghc/compiler/deSugar/DsForeign.lhs | 9 +- ghc/compiler/deSugar/DsMeta.hs | 12 +- ghc/compiler/deSugar/DsUtils.lhs | 13 +- ghc/compiler/ghci/ByteCodeGen.lhs | 25 +- ghc/compiler/ghci/ByteCodeLink.lhs | 12 +- ghc/compiler/ghci/InteractiveUI.hs | 10 +- ghc/compiler/hsSyn/Convert.lhs | 7 +- ghc/compiler/hsSyn/HsDecls.lhs | 4 +- ghc/compiler/hsSyn/HsUtils.lhs | 4 +- ghc/compiler/iface/LoadIface.lhs | 4 +- ghc/compiler/iface/MkIface.lhs | 6 +- ghc/compiler/main/DriverMkDepend.hs | 2 +- ghc/compiler/main/DriverPipeline.hs | 4 +- ghc/compiler/main/Finder.lhs | 6 +- ghc/compiler/main/GHC.hs | 2 +- ghc/compiler/main/HscTypes.lhs | 2 +- ghc/compiler/nativeGen/PprMach.hs | 12 +- ghc/compiler/ndpFlatten/FlattenMonad.hs | 6 +- ghc/compiler/parser/Ctype.lhs | 192 ++++---- ghc/compiler/parser/Lexer.x | 240 ++++++---- ghc/compiler/parser/Parser.y.pp | 6 +- ghc/compiler/parser/ParserCore.y | 12 +- ghc/compiler/parser/RdrHsSyn.lhs | 9 +- ghc/compiler/prelude/PrelNames.lhs | 21 +- ghc/compiler/prelude/PrelRules.lhs | 4 +- ghc/compiler/prelude/PrimOp.lhs | 10 +- ghc/compiler/prelude/TysPrim.lhs | 4 +- ghc/compiler/prelude/TysWiredIn.lhs | 18 +- ghc/compiler/profiling/CostCentre.lhs | 12 +- ghc/compiler/rename/RnEnv.lhs | 9 +- ghc/compiler/rename/RnExpr.lhs | 3 +- ghc/compiler/rename/RnNames.lhs | 12 +- ghc/compiler/simplCore/SetLevels.lhs | 12 +- ghc/compiler/simplCore/SimplMonad.lhs | 3 +- ghc/compiler/simplCore/Simplify.lhs | 3 +- ghc/compiler/stgSyn/CoreToStg.lhs | 4 +- ghc/compiler/typecheck/Inst.lhs | 6 +- ghc/compiler/typecheck/TcClassDcl.lhs | 3 +- ghc/compiler/typecheck/TcGenDeriv.lhs | 12 +- ghc/compiler/typecheck/TcInstDcls.lhs | 3 +- ghc/compiler/typecheck/TcRnDriver.lhs | 6 +- ghc/compiler/typecheck/TcSplice.lhs | 8 +- ghc/compiler/types/TypeRep.lhs | 4 +- ghc/compiler/utils/Binary.hs | 87 +--- ghc/compiler/utils/BufWrite.hs | 25 +- ghc/compiler/utils/Encoding.hs | 386 ++++++++++++++++ ghc/compiler/utils/FastString.lhs | 740 +++++++++++++++---------------- ghc/compiler/utils/FastTypes.lhs | 2 +- ghc/compiler/utils/Pretty.lhs | 12 +- ghc/compiler/utils/PrimPacked.lhs | 265 ----------- ghc/compiler/utils/StringBuffer.lhs | 221 ++++----- ghc/compiler/utils/UnicodeUtil.lhs | 36 -- 71 files changed, 1528 insertions(+), 1720 deletions(-) create mode 100644 ghc/compiler/utils/Encoding.hs delete mode 100644 ghc/compiler/utils/PrimPacked.lhs delete mode 100644 ghc/compiler/utils/UnicodeUtil.lhs diff --git a/ghc/compiler/HsVersions.h b/ghc/compiler/HsVersions.h index 1c07d2c..dd80922 100644 --- a/ghc/compiler/HsVersions.h +++ b/ghc/compiler/HsVersions.h @@ -90,11 +90,19 @@ name = Util.global (value) :: IORef (ty); \ -- when compiling FastString itself #ifndef COMPILING_FAST_STRING -- -import qualified FastString +import qualified FastString as FS #endif -#define SLIT(x) (FastString.mkLitString# (x#)) -#define FSLIT(x) (FastString.mkFastString# (x#)) +#define SLIT(x) (FS.mkLitString# (x#)) +#define FSLIT(x) (FS.mkFastString# (x#)) + +-- Useful for declaring arguments to be strict +#define STRICT1(f) f a b c | a `seq` False = undefined +#define STRICT2(f) f a b | a `seq` b `seq` False = undefined +#define STRICT3(f) f a b c | a `seq` b `seq` c `seq` False = undefined +#define STRICT4(f) f a b c d | a `seq` b `seq` c `seq` d `seq` False = undefined +#define STRICT5(f) f a b c d e | a `seq` b `seq` c `seq` d `seq` e `seq` False = undefined +#define STRICT6(f) f a b c d e f | a `seq` b `seq` c `seq` d `seq` e `seq` f `seq` False = undefined #endif /* HsVersions.h */ diff --git a/ghc/compiler/Makefile b/ghc/compiler/Makefile index 4920e16..2b5252a 100644 --- a/ghc/compiler/Makefile +++ b/ghc/compiler/Makefile @@ -408,7 +408,7 @@ SRC_HC_OPTS += -DGHCI -package template-haskell PKG_DEPENDS += template-haskell # Use threaded RTS with GHCi, so threads don't get blocked at the prompt. -SRC_HC_OPTS += -threaded +# SRC_HC_OPTS += -threaded ALL_DIRS += ghci diff --git a/ghc/compiler/basicTypes/Id.lhs b/ghc/compiler/basicTypes/Id.lhs index 0d15b20..10d5a28 100644 --- a/ghc/compiler/basicTypes/Id.lhs +++ b/ghc/compiler/basicTypes/Id.lhs @@ -9,7 +9,7 @@ module Id ( -- Simple construction mkGlobalId, mkLocalId, mkLocalIdWithInfo, - mkSysLocal, mkSysLocalUnencoded, mkUserLocal, mkVanillaGlobal, + mkSysLocal, mkUserLocal, mkVanillaGlobal, mkTemplateLocals, mkTemplateLocalsNum, mkWildId, mkTemplateLocal, mkWorkerId, mkExportedLocalId, @@ -105,15 +105,15 @@ import qualified Demand ( Demand ) import DataCon ( DataCon, isUnboxedTupleCon ) import NewDemand ( Demand, StrictSig, topDmd, topSig, isBottomingSig ) import Name ( Name, OccName, nameIsLocalOrFrom, - mkSystemVarName, mkSystemVarNameEncoded, mkInternalName, - getOccName, getSrcLoc - ) + mkSystemVarName, mkInternalName, getOccName, + getSrcLoc ) import Module ( Module ) -import OccName ( EncodedFS, mkWorkerOcc ) +import OccName ( mkWorkerOcc ) import Maybes ( orElse ) import SrcLoc ( SrcLoc ) import Outputable import Unique ( Unique, mkBuiltinUnique ) +import FastString ( FastString ) import StaticFlags ( opt_NoStateHack ) -- infixl so you can say (id `set` a `set` b) @@ -162,15 +162,10 @@ mkLocalId name ty = mkLocalIdWithInfo name ty vanillaIdInfo -- SysLocal: for an Id being created by the compiler out of thin air... -- UserLocal: an Id with a name the user might recognize... mkUserLocal :: OccName -> Unique -> Type -> SrcLoc -> Id -mkSysLocal :: EncodedFS -> Unique -> Type -> Id +mkSysLocal :: FastString -> Unique -> Type -> Id mkVanillaGlobal :: Name -> Type -> IdInfo -> Id --- for SysLocal, we assume the base name is already encoded, to avoid --- re-encoding the same string over and over again. -mkSysLocal fs uniq ty = mkLocalId (mkSystemVarNameEncoded uniq fs) ty - --- version to use when the faststring needs to be encoded -mkSysLocalUnencoded fs uniq ty = mkLocalId (mkSystemVarName uniq fs) ty +mkSysLocal fs uniq ty = mkLocalId (mkSystemVarName uniq fs) ty mkUserLocal occ uniq ty loc = mkLocalId (mkInternalName uniq occ loc) ty mkVanillaGlobal = mkGlobalId VanillaGlobal diff --git a/ghc/compiler/basicTypes/Literal.lhs b/ghc/compiler/basicTypes/Literal.lhs index 9aab6ee..e83ea9d 100644 --- a/ghc/compiler/basicTypes/Literal.lhs +++ b/ghc/compiler/basicTypes/Literal.lhs @@ -35,7 +35,6 @@ import FastTypes import FastString import Binary -import UnicodeUtil ( stringToUtf8 ) import Ratio ( numerator ) import FastString ( uniqueOfFS, lengthFS ) import DATA_INT ( Int8, Int16, Int32 ) @@ -95,7 +94,11 @@ data Literal = ------------------ -- First the primitive guys MachChar Char -- Char# At least 31 bits - | MachStr FastString + + | MachStr FastString -- A string-literal: stored and emitted + -- UTF-8 encoded, we'll arrange to decode it + -- at runtime. Also emitted with a '\0' + -- terminator. | MachNullAddr -- the NULL pointer, the only pointer value -- that can be represented as a Literal. @@ -206,7 +209,7 @@ mkMachInt64 x = MachInt64 x mkMachWord64 x = MachWord64 x mkStringLit :: String -> Literal -mkStringLit s = MachStr (mkFastString (stringToUtf8 s)) +mkStringLit s = MachStr (mkFastString s) -- stored UTF-8 encoded inIntRange, inWordRange :: Integer -> Bool inIntRange x = x >= tARGET_MIN_INT && x <= tARGET_MAX_INT diff --git a/ghc/compiler/basicTypes/MkId.lhs b/ghc/compiler/basicTypes/MkId.lhs index 02d2559..9d93a67 100644 --- a/ghc/compiler/basicTypes/MkId.lhs +++ b/ghc/compiler/basicTypes/MkId.lhs @@ -59,7 +59,7 @@ import Class ( Class, classTyCon, classSelIds ) import Var ( Id, TyVar, Var ) import VarSet ( isEmptyVarSet, subVarSet, varSetElems ) import Name ( mkFCallName, mkWiredInName, Name, BuiltInSyntax(..) ) -import OccName ( mkOccFS, varName ) +import OccName ( mkOccNameFS, varName ) import PrimOp ( PrimOp, primOpSig, primOpOcc, primOpTag ) import ForeignCall ( ForeignCall ) import DataCon ( DataCon, DataConIds(..), dataConTyVars, @@ -847,7 +847,7 @@ another gun with which to shoot yourself in the foot. \begin{code} mkWiredInIdName mod fs uniq id - = mkWiredInName mod (mkOccFS varName fs) uniq Nothing (AnId id) UserSyntax + = mkWiredInName mod (mkOccNameFS varName fs) uniq Nothing (AnId id) UserSyntax unsafeCoerceName = mkWiredInIdName gHC_PRIM FSLIT("unsafeCoerce#") unsafeCoerceIdKey unsafeCoerceId nullAddrName = mkWiredInIdName gHC_PRIM FSLIT("nullAddr#") nullAddrIdKey nullAddrId diff --git a/ghc/compiler/basicTypes/Module.lhs b/ghc/compiler/basicTypes/Module.lhs index 9145b35..f4e413d 100644 --- a/ghc/compiler/basicTypes/Module.lhs +++ b/ghc/compiler/basicTypes/Module.lhs @@ -4,7 +4,7 @@ Module ~~~~~~~~~~ -Simply the name of a module, represented as a Z-encoded FastString. +Simply the name of a module, represented as a FastString. These are Uniquable, hence we can build FiniteMaps with ModuleNames as the keys. @@ -17,13 +17,11 @@ module Module , ModLocation(..) , addBootSuffix, addBootSuffix_maybe, addBootSuffixLocn - , moduleString -- :: ModuleName -> EncodedString - , moduleUserString -- :: ModuleName -> UserString - , moduleFS -- :: ModuleName -> EncodedFS + , moduleString -- :: ModuleName -> String + , moduleFS -- :: ModuleName -> FastString - , mkModule -- :: UserString -> ModuleName - , mkModuleFS -- :: UserFS -> ModuleName - , mkSysModuleFS -- :: EncodedFS -> ModuleName + , mkModule -- :: String -> ModuleName + , mkModuleFS -- :: FastString -> ModuleName , ModuleEnv , elemModuleEnv, extendModuleEnv, extendModuleEnvList, plusModuleEnv_C @@ -108,7 +106,7 @@ addBootSuffixLocn locn %************************************************************************ \begin{code} -newtype Module = Module EncodedFS +newtype Module = Module FastString -- Haskell module names can include the quote character ', -- so the module names have the z-encoding applied to them @@ -131,30 +129,26 @@ instance Ord Module where instance Outputable Module where ppr = pprModule - pprModule :: Module -> SDoc -pprModule (Module nm) = pprEncodedFS nm +pprModule (Module nm) = + getPprStyle $ \ sty -> + if codeStyle sty + then ftext (zEncodeFS nm) + else ftext nm -moduleFS :: Module -> EncodedFS +moduleFS :: Module -> FastString moduleFS (Module mod) = mod -moduleString :: Module -> EncodedString +moduleString :: Module -> String moduleString (Module mod) = unpackFS mod -moduleUserString :: Module -> UserString -moduleUserString (Module mod) = decode (unpackFS mod) - -- used to be called mkSrcModule -mkModule :: UserString -> Module -mkModule s = Module (mkFastString (encode s)) +mkModule :: String -> Module +mkModule s = Module (mkFastString s) -- used to be called mkSrcModuleFS -mkModuleFS :: UserFS -> Module -mkModuleFS s = Module (encodeFS s) - --- used to be called mkSysModuleFS -mkSysModuleFS :: EncodedFS -> Module -mkSysModuleFS s = Module s +mkModuleFS :: FastString -> Module +mkModuleFS s = Module s \end{code} %************************************************************************ diff --git a/ghc/compiler/basicTypes/Name.lhs b/ghc/compiler/basicTypes/Name.lhs index fd8f2cf..3aeb03b 100644 --- a/ghc/compiler/basicTypes/Name.lhs +++ b/ghc/compiler/basicTypes/Name.lhs @@ -12,7 +12,7 @@ module Name ( Name, -- Abstract BuiltInSyntax(..), mkInternalName, mkSystemName, - mkSystemVarName, mkSystemVarNameEncoded, mkSysTvName, + mkSystemVarName, mkSysTvName, mkFCallName, mkIPName, mkExternalName, mkWiredInName, @@ -38,10 +38,11 @@ module Name ( import {-# SOURCE #-} TypeRep( TyThing ) import OccName -- All of it -import Module ( Module ) +import Module ( Module, moduleFS ) import SrcLoc ( noSrcLoc, wiredInSrcLoc, SrcLoc ) import Unique ( Unique, Uniquable(..), getKey, pprUnique ) import Maybes ( orElse, isJust ) +import FastString ( FastString, zEncodeFS ) import Outputable \end{code} @@ -215,21 +216,16 @@ mkSystemName :: Unique -> OccName -> Name mkSystemName uniq occ = Name { n_uniq = uniq, n_sort = System, n_occ = occ, n_loc = noSrcLoc } -mkSystemVarName :: Unique -> UserFS -> Name -mkSystemVarName uniq fs = mkSystemName uniq (mkVarOcc fs) +mkSystemVarName :: Unique -> FastString -> Name +mkSystemVarName uniq fs = mkSystemName uniq (mkVarOccFS fs) --- Use this version when the string is already encoded. Avoids duplicating --- the string each time a new name is created. -mkSystemVarNameEncoded :: Unique -> EncodedFS -> Name -mkSystemVarNameEncoded uniq fs = mkSystemName uniq (mkSysOccFS varName fs) +mkSysTvName :: Unique -> FastString -> Name +mkSysTvName uniq fs = mkSystemName uniq (mkOccNameFS tvName fs) -mkSysTvName :: Unique -> EncodedFS -> Name -mkSysTvName uniq fs = mkSystemName uniq (mkSysOccFS tvName fs) - -mkFCallName :: Unique -> EncodedString -> Name +mkFCallName :: Unique -> String -> Name -- The encoded string completely describes the ccall mkFCallName uniq str = Name { n_uniq = uniq, n_sort = Internal, - n_occ = mkFCallOcc str, n_loc = noSrcLoc } + n_occ = mkVarOcc str, n_loc = noSrcLoc } mkIPName :: Unique -> OccName -> Name mkIPName uniq occ @@ -317,13 +313,13 @@ pprName (Name {n_sort = sort, n_uniq = uniq, n_occ = occ}) Internal -> pprInternal sty uniq occ pprExternal sty uniq mod occ is_wired is_builtin - | codeStyle sty = ppr mod <> char '_' <> ppr_occ_name occ + | codeStyle sty = ppr_z_module mod <> char '_' <> ppr_z_occ_name occ -- In code style, always qualify -- ToDo: maybe we could print all wired-in things unqualified -- in code style, to reduce symbol table bloat? | debugStyle sty = ppr mod <> dot <> ppr_occ_name occ <> braces (hsep [if is_wired then ptext SLIT("(w)") else empty, - text (briefOccNameFlavour occ), + pprNameSpaceBrief (occNameSpace occ), pprUnique uniq]) | BuiltInSyntax <- is_builtin = ppr_occ_name occ -- never qualify builtin syntax @@ -332,7 +328,7 @@ pprExternal sty uniq mod occ is_wired is_builtin pprInternal sty uniq occ | codeStyle sty = pprUnique uniq - | debugStyle sty = ppr_occ_name occ <> braces (hsep [text (briefOccNameFlavour occ), + | debugStyle sty = ppr_occ_name occ <> braces (hsep [pprNameSpaceBrief (occNameSpace occ), pprUnique uniq]) | dumpStyle sty = ppr_occ_name occ <> char '_' <> pprUnique uniq -- For debug dumps, we're not necessarily dumping @@ -343,15 +339,21 @@ pprInternal sty uniq occ pprSystem sty uniq occ | codeStyle sty = pprUnique uniq | debugStyle sty = ppr_occ_name occ <> char '_' <> pprUnique uniq - <> braces (text (briefOccNameFlavour occ)) + <> braces (pprNameSpaceBrief (occNameSpace occ)) | otherwise = ppr_occ_name occ <> char '_' <> pprUnique uniq -- If the tidy phase hasn't run, the OccName -- is unlikely to be informative (like 's'), -- so print the unique -ppr_occ_name occ = pprEncodedFS (occNameFS occ) +ppr_occ_name occ = ftext (occNameFS occ) -- Don't use pprOccName; instead, just print the string of the OccName; -- we print the namespace in the debug stuff above + +-- In code style, we Z-encode the strings. The results of Z-encoding each FastString are +-- cached behind the scenes in the FastString implementation. +ppr_z_occ_name occ = ftext (zEncodeFS (occNameFS occ)) +ppr_z_module mod = ftext (zEncodeFS (moduleFS mod)) + \end{code} %************************************************************************ diff --git a/ghc/compiler/basicTypes/OccName.lhs b/ghc/compiler/basicTypes/OccName.lhs index 756d6a9..bd6d3f7 100644 --- a/ghc/compiler/basicTypes/OccName.lhs +++ b/ghc/compiler/basicTypes/OccName.lhs @@ -7,62 +7,66 @@ \begin{code} module OccName ( - -- The NameSpace type; abstact + -- * The NameSpace type; abstact NameSpace, tcName, clsName, tcClsName, dataName, varName, - tvName, srcDataName, nameSpaceString, + tvName, srcDataName, - -- The OccName type + -- ** Printing + pprNameSpace, pprNonVarNameSpace, pprNameSpaceBrief, + + -- * The OccName type OccName, -- Abstract, instance of Outputable pprOccName, + -- ** Construction + mkOccName, mkOccNameFS, + mkVarOcc, mkVarOccFS, + mkTyVarOcc, + mkDFunOcc, + mkTupleOcc, + setOccNameSpace, + + -- ** Derived OccNames + mkDataConWrapperOcc, mkWorkerOcc, mkDefaultMethodOcc, mkDerivedTyConOcc, + mkClassTyConOcc, mkClassDataConOcc, mkDictOcc, mkIPOcc, + mkSpecOcc, mkForeignExportOcc, mkGenOcc1, mkGenOcc2, + mkDataTOcc, mkDataCOcc, mkDataConWorkerOcc, + mkSuperDictSelOcc, mkLocalOcc, mkMethodOcc, + + -- ** Deconstruction + occNameFS, occNameString, occNameSpace, + + isVarOcc, isTvOcc, isTcOcc, isDataOcc, isDataSymOcc, isSymOcc, isValOcc, + parenSymOcc, reportIfUnused, isTcClsName, isVarName, + + isTupleOcc_maybe, + -- The OccEnv type OccEnv, emptyOccEnv, unitOccEnv, extendOccEnv, lookupOccEnv, mkOccEnv, extendOccEnvList, elemOccEnv, occEnvElts, foldOccEnv, plusOccEnv, plusOccEnv_C, extendOccEnv_C, - -- The OccSet type - OccSet, emptyOccSet, unitOccSet, mkOccSet, extendOccSet, extendOccSetList, + OccSet, emptyOccSet, unitOccSet, mkOccSet, extendOccSet, + extendOccSetList, unionOccSets, unionManyOccSets, minusOccSet, elemOccSet, occSetElts, foldOccSet, isEmptyOccSet, intersectOccSet, intersectsOccSet, - mkOccName, mkOccFS, mkSysOcc, mkSysOccFS, mkFCallOcc, mkKindOccFS, - mkVarOcc, mkVarOccEncoded, mkTyVarOcc, - mkSuperDictSelOcc, mkDFunOcc, mkForeignExportOcc, - mkDictOcc, mkIPOcc, mkWorkerOcc, mkMethodOcc, mkDefaultMethodOcc, - mkDerivedTyConOcc, mkClassTyConOcc, mkClassDataConOcc, mkSpecOcc, - mkGenOcc1, mkGenOcc2, mkLocalOcc, mkDataTOcc, mkDataCOcc, - mkDataConWrapperOcc, mkDataConWorkerOcc, - - isVarOcc, isTvOcc, isTcOcc, isDataOcc, isDataSymOcc, isSymOcc, isValOcc, - parenSymOcc, reportIfUnused, isTcClsName, isVarName, - - occNameFS, occNameString, occNameUserString, occNameSpace, - occNameFlavour, briefOccNameFlavour, - setOccNameSpace, - - mkTupleOcc, isTupleOcc_maybe, - -- Tidying up TidyOccEnv, emptyTidyOccEnv, tidyOccName, initTidyOccEnv, - -- Encoding - EncodedString, EncodedFS, UserString, UserFS, encode, encodeFS, decode, pprEncodedFS, - -- The basic form of names isLexCon, isLexVar, isLexId, isLexSym, isLexConId, isLexConSym, isLexVarId, isLexVarSym, - isLowerISO, isUpperISO, startsVarSym, startsVarId, startsConSym, startsConId ) where #include "HsVersions.h" -import Char ( isDigit, isUpper, isLower, isAlphaNum, ord, chr, digitToInt ) -import Util ( thenCmp ) -import Unique ( Unique, mkUnique, Uniquable(..) ) -import BasicTypes ( Boxity(..), Arity ) -import StaticFlags ( opt_PprStyle_Debug ) +import Util ( thenCmp ) +import Unique ( Unique, mkUnique, Uniquable(..) ) +import BasicTypes ( Boxity(..), Arity ) +import StaticFlags ( opt_PprStyle_Debug ) import UniqFM import UniqSet import FastString @@ -70,34 +74,16 @@ import Outputable import Binary import GLAEXTS -\end{code} - -We hold both module names and identifier names in a 'Z-encoded' form -that makes them acceptable both as a C identifier and as a Haskell -(prefix) identifier. - -They can always be decoded again when printing error messages -or anything else for the user, but it does make sense for it -to be represented here in encoded form, so that when generating -code the encoding operation is not performed on each occurrence. -These type synonyms help documentation. - -\begin{code} -type UserFS = FastString -- As the user typed it -type EncodedFS = FastString -- Encoded form +import Data.Char ( isUpper, isLower, ord ) -type UserString = String -- As the user typed it -type EncodedString = String -- Encoded form +-- Unicode TODO: put isSymbol in libcompat +#if __GLASGOW_HASKELL__ > 604 +import Data.Char ( isSymbol ) +#else +isSymbol = const False +#endif - -pprEncodedFS :: EncodedFS -> SDoc -pprEncodedFS fs - = getPprStyle $ \ sty -> - if userStyle sty || dumpStyle sty - -- ftext (decodeFS fs) would needlessly pack the string again - then text (decode (unpackFS fs)) - else ftext fs \end{code} %************************************************************************ @@ -155,12 +141,20 @@ isVarName TvName = True isVarName VarName = True isVarName other = False +pprNameSpace :: NameSpace -> SDoc +pprNameSpace DataName = ptext SLIT("data constructor") +pprNameSpace VarName = ptext SLIT("variable") +pprNameSpace TvName = ptext SLIT("type variable") +pprNameSpace TcClsName = ptext SLIT("type constructor or class") + +pprNonVarNameSpace :: NameSpace -> SDoc +pprNonVarNameSpace VarName = empty +pprNonVarNameSpace ns = pprNameSpace ns -nameSpaceString :: NameSpace -> String -nameSpaceString DataName = "data constructor" -nameSpaceString VarName = "variable" -nameSpaceString TvName = "type variable" -nameSpaceString TcClsName = "type constructor or class" +pprNameSpaceBrief DataName = char 'd' +pprNameSpaceBrief VarName = char 'v' +pprNameSpaceBrief TvName = ptext SLIT("tv") +pprNameSpaceBrief TcClsName = ptext SLIT("tc") \end{code} @@ -173,7 +167,7 @@ nameSpaceString TcClsName = "type constructor or class" \begin{code} data OccName = OccName { occNameSpace :: !NameSpace - , occNameFS :: !EncodedFS + , occNameFS :: !FastString } \end{code} @@ -201,9 +195,11 @@ instance Outputable OccName where pprOccName :: OccName -> SDoc pprOccName (OccName sp occ) = getPprStyle $ \ sty -> - pprEncodedFS occ <> if debugStyle sty then - braces (text (briefNameSpaceFlavour sp)) - else empty + if codeStyle sty + then ftext (zEncodeFS occ) + else ftext occ <> if debugStyle sty + then braces (pprNameSpaceBrief sp) + else empty \end{code} @@ -211,54 +207,24 @@ pprOccName (OccName sp occ) %* * \subsection{Construction} %* * -%*****p******************************************************************* - -*Sys* things do no encoding; the caller should ensure that the thing is -already encoded - -\begin{code} -mkSysOcc :: NameSpace -> EncodedString -> OccName -mkSysOcc occ_sp str = ASSERT2( alreadyEncoded str, text str ) - OccName occ_sp (mkFastString str) - -mkSysOccFS :: NameSpace -> EncodedFS -> OccName -mkSysOccFS occ_sp fs = ASSERT2( alreadyEncodedFS fs, ppr fs ) - OccName occ_sp fs - -mkFCallOcc :: EncodedString -> OccName --- This version of mkSysOcc doesn't check that the string is already encoded, --- because it will be something like "{__ccall f dyn Int# -> Int#}" --- This encodes a lot into something that then parses like an Id. --- But then alreadyEncoded complains about the braces! -mkFCallOcc str = OccName varName (mkFastString str) - --- Kind constructors get a special function. Uniquely, they are not encoded, --- so that they have names like '*'. This means that *even in interface files* --- we'll get kinds like (* -> (* -> *)). We can't use mkSysOcc because it --- has an ASSERT that doesn't hold. -mkKindOccFS :: NameSpace -> EncodedFS -> OccName -mkKindOccFS occ_sp fs = OccName occ_sp fs -\end{code} - -*Source-code* things are encoded. +%************************************************************************ \begin{code} -mkOccFS :: NameSpace -> UserFS -> OccName -mkOccFS occ_sp fs = mkSysOccFS occ_sp (encodeFS fs) - mkOccName :: NameSpace -> String -> OccName -mkOccName ns s = mkSysOcc ns (encode s) +mkOccName occ_sp str = OccName occ_sp (mkFastString str) -mkVarOcc :: UserFS -> OccName -mkVarOcc fs = mkSysOccFS varName (encodeFS fs) +mkOccNameFS :: NameSpace -> FastString -> OccName +mkOccNameFS occ_sp fs = OccName occ_sp fs -mkTyVarOcc :: UserFS -> OccName -mkTyVarOcc fs = mkSysOccFS tvName (encodeFS fs) +mkVarOcc :: String -> OccName +mkVarOcc s = mkOccName varName s -mkVarOccEncoded :: EncodedFS -> OccName -mkVarOccEncoded fs = mkSysOccFS varName fs -\end{code} +mkVarOccFS :: FastString -> OccName +mkVarOccFS fs = mkOccNameFS varName fs +mkTyVarOcc :: FastString -> OccName +mkTyVarOcc fs = mkOccNameFS tvName fs +\end{code} %************************************************************************ @@ -355,34 +321,13 @@ intersectsOccSet s1 s2 = not (isEmptyOccSet (s1 `intersectOccSet` s2)) %* * %************************************************************************ -\begin{code} -occNameString :: OccName -> EncodedString +\begin{code} +occNameString :: OccName -> String occNameString (OccName _ s) = unpackFS s -occNameUserString :: OccName -> UserString -occNameUserString occ = decode (occNameString occ) - setOccNameSpace :: NameSpace -> OccName -> OccName setOccNameSpace sp (OccName _ occ) = OccName sp occ --- occNameFlavour is used only to generate good error messages -occNameFlavour :: OccName -> SDoc -occNameFlavour (OccName DataName _) = ptext SLIT("data constructor") -occNameFlavour (OccName TvName _) = ptext SLIT("type variable") -occNameFlavour (OccName TcClsName _) = ptext SLIT("type constructor or class") -occNameFlavour (OccName VarName s) = empty - --- briefOccNameFlavour is used in debug-printing of names -briefOccNameFlavour :: OccName -> String -briefOccNameFlavour (OccName sp _) = briefNameSpaceFlavour sp - -briefNameSpaceFlavour DataName = "d" -briefNameSpaceFlavour VarName = "v" -briefNameSpaceFlavour TvName = "tv" -briefNameSpaceFlavour TcClsName = "tc" -\end{code} - -\begin{code} isVarOcc, isTvOcc, isDataSymOcc, isSymOcc, isTcOcc :: OccName -> Bool isVarOcc (OccName VarName _) = True @@ -400,19 +345,19 @@ isValOcc other = False -- Data constructor operator (starts with ':', or '[]') -- Pretty inefficient! -isDataSymOcc (OccName DataName s) = isLexConSym (decodeFS s) -isDataSymOcc (OccName VarName s) = isLexConSym (decodeFS s) +isDataSymOcc (OccName DataName s) = isLexConSym s +isDataSymOcc (OccName VarName s) = isLexConSym s isDataSymOcc other = False isDataOcc (OccName DataName _) = True -isDataOcc (OccName VarName s) = isLexCon (decodeFS s) +isDataOcc (OccName VarName s) = isLexCon s isDataOcc other = False -- Any operator (data constructor or variable) -- Pretty inefficient! -isSymOcc (OccName DataName s) = isLexConSym (decodeFS s) -isSymOcc (OccName TcClsName s) = isLexConSym (decodeFS s) -isSymOcc (OccName VarName s) = isLexSym (decodeFS s) +isSymOcc (OccName DataName s) = isLexConSym s +isSymOcc (OccName TcClsName s) = isLexConSym s +isSymOcc (OccName VarName s) = isLexSym s isSymOcc other = False parenSymOcc :: OccName -> SDoc -> SDoc @@ -426,13 +371,12 @@ parenSymOcc occ doc | isSymOcc occ = parens doc reportIfUnused :: OccName -> Bool -- Haskell 98 encourages compilers to suppress warnings about -- unused names in a pattern if they start with "_". -reportIfUnused occ = case occNameUserString occ of +reportIfUnused occ = case occNameString occ of ('_' : _) -> False - zz_other -> True + _other -> True \end{code} - %************************************************************************ %* * \subsection{Making system names} @@ -466,16 +410,17 @@ NB: The string must already be encoded! \begin{code} mk_deriv :: NameSpace -> String -- Distinguishes one sort of derived name from another - -> EncodedString -- Must be already encoded!! We don't want to encode it a - -- second time because encoding isn't idempotent + -> String -> OccName -mk_deriv occ_sp sys_prefix str = mkSysOcc occ_sp (encode sys_prefix ++ str) +mk_deriv occ_sp sys_prefix str = mkOccName occ_sp (sys_prefix ++ str) \end{code} \begin{code} -mkDictOcc, mkIPOcc, mkWorkerOcc, mkDefaultMethodOcc, - mkClassTyConOcc, mkClassDataConOcc, mkSpecOcc +mkDataConWrapperOcc, mkWorkerOcc, mkDefaultMethodOcc, mkDerivedTyConOcc, + mkClassTyConOcc, mkClassDataConOcc, mkDictOcc, mkIPOcc, + mkSpecOcc, mkForeignExportOcc, mkGenOcc1, mkGenOcc2, + mkDataTOcc, mkDataCOcc, mkDataConWorkerOcc :: OccName -> OccName -- These derived variables have a prefix that no Haskell value could have @@ -503,7 +448,6 @@ mkDataCOcc = mk_simple_deriv varName "$c" mk_simple_deriv sp px occ = mk_deriv sp px (occNameString occ) - -- Data constructor workers are made by setting the name space -- of the data constructor OccName (which should be a DataName) -- to VarName @@ -528,7 +472,7 @@ mkLocalOcc uniq occ \begin{code} -mkDFunOcc :: EncodedString -- Typically the class and type glommed together e.g. "OrdMaybe" +mkDFunOcc :: String -- Typically the class and type glommed together e.g. "OrdMaybe" -- Only used in debug mode, for extra clarity -> Bool -- True <=> hs-boot instance dfun -> Int -- Unique index @@ -609,248 +553,37 @@ tidyOccName in_scope occ@(OccName occ_sp fs) Just n -> -- Already used: make a new guess, -- change the guess base, and try again tidyOccName (extendOccEnv in_scope occ (n+1)) - (mkSysOcc occ_sp (unpackFS fs ++ show n)) + (mkOccName occ_sp (unpackFS fs ++ show n)) \end{code} - -%************************************************************************ -%* * -\subsection{The 'Z' encoding} -%* * -%************************************************************************ - -This is the main name-encoding and decoding function. It encodes any -string into a string that is acceptable as a C name. This is the name -by which things are known right through the compiler. - -The basic encoding scheme is this. - -* Tuples (,,,) are coded as Z3T - -* Alphabetic characters (upper and lower) and digits - all translate to themselves; - except 'Z', which translates to 'ZZ' - and 'z', which translates to 'zz' - We need both so that we can preserve the variable/tycon distinction - -* Most other printable characters translate to 'zx' or 'Zx' for some - alphabetic character x - -* The others translate as 'znnnU' where 'nnn' is the decimal number - of the character - - Before After - -------------------------- - Trak Trak - foo_wib foozuwib - > zg - >1 zg1 - foo# foozh - foo## foozhzh - foo##1 foozhzh1 - fooZ fooZZ - :+ ZCzp - () Z0T 0-tuple - (,,,,) Z5T 5-tuple - (# #) Z1H unboxed 1-tuple (note the space) - (#,,,,#) Z5H unboxed 5-tuple - (NB: There is no Z1T nor Z0H.) - -\begin{code} --- alreadyEncoded is used in ASSERTs to check for encoded --- strings. It isn't fail-safe, of course, because, say 'zh' might --- be encoded or not. -alreadyEncoded :: String -> Bool -alreadyEncoded s = all ok s - where - ok ' ' = True - -- This is a bit of a lie; if we really wanted spaces - -- in names we'd have to encode them. But we do put - -- spaces in ccall "occurrences", and we don't want to - -- reject them here - ok ch = isAlphaNum ch - -alreadyEncodedFS :: FastString -> Bool -alreadyEncodedFS fs = alreadyEncoded (unpackFS fs) - -encode :: UserString -> EncodedString -encode cs = case maybe_tuple cs of - Just n -> n -- Tuples go to Z2T etc - Nothing -> go cs - where - go [] = [] - go (c:cs) = encode_ch c ++ go cs - -encodeFS :: UserFS -> EncodedFS -encodeFS fast_str | all unencodedChar str = fast_str - | otherwise = mkFastString (encode str) - where - str = unpackFS fast_str - -unencodedChar :: Char -> Bool -- True for chars that don't need encoding -unencodedChar 'Z' = False -unencodedChar 'z' = False -unencodedChar c = c >= 'a' && c <= 'z' - || c >= 'A' && c <= 'Z' - || c >= '0' && c <= '9' - -encode_ch :: Char -> EncodedString -encode_ch c | unencodedChar c = [c] -- Common case first - --- Constructors -encode_ch '(' = "ZL" -- Needed for things like (,), and (->) -encode_ch ')' = "ZR" -- For symmetry with ( -encode_ch '[' = "ZM" -encode_ch ']' = "ZN" -encode_ch ':' = "ZC" -encode_ch 'Z' = "ZZ" - --- Variables -encode_ch 'z' = "zz" -encode_ch '&' = "za" -encode_ch '|' = "zb" -encode_ch '^' = "zc" -encode_ch '$' = "zd" -encode_ch '=' = "ze" -encode_ch '>' = "zg" -encode_ch '#' = "zh" -encode_ch '.' = "zi" -encode_ch '<' = "zl" -encode_ch '-' = "zm" -encode_ch '!' = "zn" -encode_ch '+' = "zp" -encode_ch '\'' = "zq" -encode_ch '\\' = "zr" -encode_ch '/' = "zs" -encode_ch '*' = "zt" -encode_ch '_' = "zu" -encode_ch '%' = "zv" -encode_ch c = 'z' : shows (ord c) "U" -\end{code} - -Decode is used for user printing. - -\begin{code} -decodeFS :: FastString -> FastString -decodeFS fs = mkFastString (decode (unpackFS fs)) - -decode :: EncodedString -> UserString -decode [] = [] -decode ('Z' : d : rest) | isDigit d = decode_tuple d rest - | otherwise = decode_upper d : decode rest -decode ('z' : d : rest) | isDigit d = decode_num_esc d rest - | otherwise = decode_lower d : decode rest -decode (c : rest) = c : decode rest - -decode_upper, decode_lower :: Char -> Char - -decode_upper 'L' = '(' -decode_upper 'R' = ')' -decode_upper 'M' = '[' -decode_upper 'N' = ']' -decode_upper 'C' = ':' -decode_upper 'Z' = 'Z' -decode_upper ch = pprTrace "decode_upper" (char ch) ch - -decode_lower 'z' = 'z' -decode_lower 'a' = '&' -decode_lower 'b' = '|' -decode_lower 'c' = '^' -decode_lower 'd' = '$' -decode_lower 'e' = '=' -decode_lower 'g' = '>' -decode_lower 'h' = '#' -decode_lower 'i' = '.' -decode_lower 'l' = '<' -decode_lower 'm' = '-' -decode_lower 'n' = '!' -decode_lower 'p' = '+' -decode_lower 'q' = '\'' -decode_lower 'r' = '\\' -decode_lower 's' = '/' -decode_lower 't' = '*' -decode_lower 'u' = '_' -decode_lower 'v' = '%' -decode_lower ch = pprTrace "decode_lower" (char ch) ch - --- Characters not having a specific code are coded as z224U -decode_num_esc d rest - = go (digitToInt d) rest - where - go n (c : rest) | isDigit c = go (10*n + digitToInt c) rest - go n ('U' : rest) = chr n : decode rest - go n other = pprPanic "decode_num_esc" (ppr n <+> text other) - -decode_tuple :: Char -> EncodedString -> UserString -decode_tuple d rest - = go (digitToInt d) rest - where - -- NB. recurse back to decode after decoding the tuple, because - -- the tuple might be embedded in a longer name. - go n (c : rest) | isDigit c = go (10*n + digitToInt c) rest - go 0 ('T':rest) = "()" ++ decode rest - go n ('T':rest) = '(' : replicate (n-1) ',' ++ ")" ++ decode rest - go 1 ('H':rest) = "(# #)" ++ decode rest - go n ('H':rest) = '(' : '#' : replicate (n-1) ',' ++ "#)" ++ decode rest - go n other = pprPanic "decode_tuple" (ppr n <+> text other) -\end{code} - - %************************************************************************ %* * Stuff for dealing with tuples %* * %************************************************************************ -Tuples are encoded as - Z3T or Z3H -for 3-tuples or unboxed 3-tuples respectively. No other encoding starts - Z - -* "(# #)" is the tycon for an unboxed 1-tuple (not 0-tuple) - There are no unboxed 0-tuples. - -* "()" is the tycon for a boxed 0-tuple. - There are no boxed 1-tuples. - - -\begin{code} -maybe_tuple :: UserString -> Maybe EncodedString - -maybe_tuple "(# #)" = Just("Z1H") -maybe_tuple ('(' : '#' : cs) = case count_commas (0::Int) cs of - (n, '#' : ')' : cs) -> Just ('Z' : shows (n+1) "H") - other -> Nothing -maybe_tuple "()" = Just("Z0T") -maybe_tuple ('(' : cs) = case count_commas (0::Int) cs of - (n, ')' : cs) -> Just ('Z' : shows (n+1) "T") - other -> Nothing -maybe_tuple other = Nothing - -count_commas :: Int -> String -> (Int, String) -count_commas n (',' : cs) = count_commas (n+1) cs -count_commas n cs = (n,cs) -\end{code} - \begin{code} mkTupleOcc :: NameSpace -> Boxity -> Arity -> OccName -mkTupleOcc ns bx ar - = OccName ns (mkFastString ('Z' : (show ar ++ bx_char))) +mkTupleOcc ns bx ar = OccName ns (mkFastString str) where - bx_char = case bx of - Boxed -> "T" - Unboxed -> "H" + -- no need to cache these, the caching is done in the caller + -- (TysWiredIn.mk_tuple) + str = case bx of + Boxed -> '(' : commas ++ ")" + Unboxed -> '(' : '#' : commas ++ "#)" + + commas = take (ar-1) (repeat ',') isTupleOcc_maybe :: OccName -> Maybe (NameSpace, Boxity, Arity) -- Tuples are special, because there are so many of them! isTupleOcc_maybe (OccName ns fs) = case unpackFS fs of - ('Z':d:rest) | isDigit d -> Just (decode_tup (digitToInt d) rest) - other -> Nothing + '(':'#':',':rest -> Just (ns, Unboxed, 2 + count_commas rest) + '(':',':rest -> Just (ns, Boxed, 2 + count_commas rest) + _other -> Nothing where - decode_tup n "H" = (ns, Unboxed, n) - decode_tup n "T" = (ns, Boxed, n) - decode_tup n (d:rest) = decode_tup (n*10 + digitToInt d) rest + count_commas (',':rest) = 1 + count_commas rest + count_commas _ = 0 \end{code} %************************************************************************ @@ -875,37 +608,31 @@ isLexSym cs = isLexConSym cs || isLexVarSym cs ------------- isLexConId cs -- Prefix type or data constructors - | nullFastString cs = False -- e.g. "Foo", "[]", "(,)" + | nullFS cs = False -- e.g. "Foo", "[]", "(,)" | cs == FSLIT("[]") = True | otherwise = startsConId (headFS cs) isLexVarId cs -- Ordinary prefix identifiers - | nullFastString cs = False -- e.g. "x", "_x" + | nullFS cs = False -- e.g. "x", "_x" | otherwise = startsVarId (headFS cs) isLexConSym cs -- Infix type or data constructors - | nullFastString cs = False -- e.g. ":-:", ":", "->" + | nullFS cs = False -- e.g. ":-:", ":", "->" | cs == FSLIT("->") = True | otherwise = startsConSym (headFS cs) isLexVarSym cs -- Infix identifiers - | nullFastString cs = False -- e.g. "+" + | nullFS cs = False -- e.g. "+" | otherwise = startsVarSym (headFS cs) ------------- startsVarSym, startsVarId, startsConSym, startsConId :: Char -> Bool -startsVarSym c = isSymbolASCII c || isSymbolISO c -- Infix Ids +startsVarSym c = isSymbolASCII c || (ord c > 0x7f && isSymbol c) -- Infix Ids startsConSym c = c == ':' -- Infix data constructors -startsVarId c = isLower c || isLowerISO c || c == '_' -- Ordinary Ids -startsConId c = isUpper c || isUpperISO c || c == '(' -- Ordinary type constructors and data constructors - +startsVarId c = isLower c || c == '_' -- Ordinary Ids +startsConId c = isUpper c || c == '(' -- Ordinary type constructors and data constructors isSymbolASCII c = c `elem` "!#$%&*+./<=>?@\\^|~-" -isSymbolISO c = ord c `elem` (0xd7 : 0xf7 : [0xa1 .. 0xbf]) -isUpperISO (C# c#) = c# `geChar#` '\xc0'# && c# `leChar#` '\xde'# && c# `neChar#` '\xd7'# - --0xc0 <= oc && oc <= 0xde && oc /= 0xd7 where oc = ord c -isLowerISO (C# c#) = c# `geChar#` '\xdf'# && c# `leChar#` '\xff'# && c# `neChar#` '\xf7'# - --0xdf <= oc && oc <= 0xff && oc /= 0xf7 where oc = ord c \end{code} %************************************************************************ diff --git a/ghc/compiler/basicTypes/RdrName.lhs b/ghc/compiler/basicTypes/RdrName.lhs index 900717e..030aa1f 100644 --- a/ghc/compiler/basicTypes/RdrName.lhs +++ b/ghc/compiler/basicTypes/RdrName.lhs @@ -40,25 +40,17 @@ module RdrName ( #include "HsVersions.h" -import OccName ( NameSpace, varName, - OccName, UserFS, - setOccNameSpace, - mkOccFS, occNameFlavour, - isDataOcc, isTvOcc, isTcOcc, - OccEnv, emptyOccEnv, extendOccEnvList, lookupOccEnv, - elemOccEnv, plusOccEnv_C, extendOccEnv_C, foldOccEnv, - occEnvElts - ) +import OccName import Module ( Module, mkModuleFS ) import Name ( Name, NamedThing(getName), nameModule, nameParent_maybe, nameOccName, isExternalName, nameSrcLoc ) import Maybes ( mapCatMaybes ) import SrcLoc ( isGoodSrcLoc, SrcSpan ) +import FastString ( FastString ) import Outputable import Util ( thenCmp ) \end{code} - %************************************************************************ %* * \subsection{The main data type} @@ -147,14 +139,14 @@ mkDerivedRdrName parent mk_occ --------------- -- These two are used when parsing source files -- They do encode the module and occurrence names -mkUnqual :: NameSpace -> UserFS -> RdrName -mkUnqual sp n = Unqual (mkOccFS sp n) +mkUnqual :: NameSpace -> FastString -> RdrName +mkUnqual sp n = Unqual (mkOccNameFS sp n) -mkVarUnqual :: UserFS -> RdrName -mkVarUnqual n = Unqual (mkOccFS varName n) +mkVarUnqual :: FastString -> RdrName +mkVarUnqual n = Unqual (mkVarOccFS n) -mkQual :: NameSpace -> (UserFS, UserFS) -> RdrName -mkQual sp (m, n) = Qual (mkModuleFS m) (mkOccFS sp n) +mkQual :: NameSpace -> (FastString, FastString) -> RdrName +mkQual sp (m, n) = Qual (mkModuleFS m) (mkOccNameFS sp n) getRdrName :: NamedThing thing => thing -> RdrName getRdrName name = nameRdrName (getName name) @@ -213,7 +205,7 @@ instance Outputable RdrName where ppr (Qual mod occ) = ppr mod <> dot <> ppr occ <+> ppr_name_space occ ppr (Orig mod occ) = ppr mod <> dot <> ppr occ <+> ppr_name_space occ -ppr_name_space occ = ifPprDebug (parens (occNameFlavour occ)) +ppr_name_space occ = ifPprDebug (parens (pprNonVarNameSpace (occNameSpace occ))) instance OutputableBndr RdrName where pprBndr _ n diff --git a/ghc/compiler/cmm/CLabel.hs b/ghc/compiler/cmm/CLabel.hs index 296ad91..fb13589 100644 --- a/ghc/compiler/cmm/CLabel.hs +++ b/ghc/compiler/cmm/CLabel.hs @@ -740,10 +740,10 @@ pprCLbl (CC_Label cc) = ppr cc pprCLbl (CCS_Label ccs) = ppr ccs pprCLbl (ModuleInitLabel mod way _) - = ptext SLIT("__stginit_") <> ftext (moduleFS mod) + = ptext SLIT("__stginit_") <> ppr mod <> char '_' <> text way pprCLbl (PlainModuleInitLabel mod _) - = ptext SLIT("__stginit_") <> ftext (moduleFS mod) + = ptext SLIT("__stginit_") <> ppr mod ppIdFlavor :: IdLabelInfo -> SDoc ppIdFlavor x = pp_cSEP <> diff --git a/ghc/compiler/cmm/Cmm.hs b/ghc/compiler/cmm/Cmm.hs index 304ddb2..13961c1 100644 --- a/ghc/compiler/cmm/Cmm.hs +++ b/ghc/compiler/cmm/Cmm.hs @@ -30,6 +30,7 @@ import CLabel ( CLabel ) import ForeignCall ( CCallConv ) import Unique ( Unique, Uniquable(..) ) import FastString ( FastString ) +import DATA_WORD ( Word8 ) ----------------------------------------------------------------------------- -- Cmm, CmmTop, CmmBasicBlock @@ -251,9 +252,8 @@ data CmmStatic -- align to next N-byte boundary (N must be a power of 2). | CmmDataLabel CLabel -- label the current position in this section. - | CmmString String + | CmmString [Word8] -- string of 8-bit values only, not zero terminated. - -- ToDo: might be more honest to use [Word8] here? ----------------------------------------------------------------------------- -- Global STG registers diff --git a/ghc/compiler/cmm/CmmLex.x b/ghc/compiler/cmm/CmmLex.x index 8515b3e..c2efd17 100644 --- a/ghc/compiler/cmm/CmmLex.x +++ b/ghc/compiler/cmm/CmmLex.x @@ -227,10 +227,10 @@ tok_decimal span buf len = return (L span (CmmT_Int $! parseInteger buf len 10 octDecDigit)) tok_octal span buf len - = return (L span (CmmT_Int $! parseInteger (stepOn buf) (len-1) 8 octDecDigit)) + = return (L span (CmmT_Int $! parseInteger (offsetBytes 1 buf) (len-1) 8 octDecDigit)) tok_hexadecimal span buf len - = return (L span (CmmT_Int $! parseInteger (stepOnBy 2 buf) (len-2) 16 hexDigit)) + = return (L span (CmmT_Int $! parseInteger (offsetBytes 2 buf) (len-2) 16 hexDigit)) tok_float str = CmmT_Float $! readRational str diff --git a/ghc/compiler/cmm/CmmParse.y b/ghc/compiler/cmm/CmmParse.y index 45f411b..cfb2a9d 100644 --- a/ghc/compiler/cmm/CmmParse.y +++ b/ghc/compiler/cmm/CmmParse.y @@ -48,6 +48,7 @@ import Constants ( wORD_SIZE ) import Outputable import Monad ( when ) +import Data.Char ( ord ) #include "HsVersions.h" } @@ -177,7 +178,7 @@ static :: { ExtFCode [CmmStatic] } return [CmmStaticLit (getLit e)] } | type ';' { return [CmmUninitialised (machRepByteWidth $1)] } - | 'bits8' '[' ']' STRING ';' { return [CmmString $4] } + | 'bits8' '[' ']' STRING ';' { return [mkString $4] } | 'bits8' '[' INT ']' ';' { return [CmmUninitialised (fromIntegral $3)] } | typenot8 '[' INT ']' ';' { return [CmmUninitialised @@ -427,6 +428,9 @@ section "rodata" = ReadOnlyData section "bss" = UninitialisedData section s = OtherSection s +mkString :: String -> CmmStatic +mkString s = CmmString (map (fromIntegral.ord) s) + -- mkMachOp infers the type of the MachOp from the type of its first -- argument. We assume that this is correct: for MachOps that don't have -- symmetrical args (e.g. shift ops), the first arg determines the type of diff --git a/ghc/compiler/cmm/PprC.hs b/ghc/compiler/cmm/PprC.hs index 7427f50..9fece36 100644 --- a/ghc/compiler/cmm/PprC.hs +++ b/ghc/compiler/cmm/PprC.hs @@ -45,6 +45,7 @@ import Data.Bits ( shiftR ) import Char ( ord, chr ) import IO ( Handle ) import DATA_BITS +import Data.Word ( Word8 ) #ifdef DEBUG import PprCmm () -- instances only @@ -881,25 +882,21 @@ machRepSignedCType r | r == wordRep = ptext SLIT("I_") -- --------------------------------------------------------------------- -- print strings as valid C strings --- Assumes it contains only characters '\0'..'\xFF'! -pprFSInCStyle :: FastString -> SDoc -pprFSInCStyle fs = pprStringInCStyle (unpackFS fs) - -pprStringInCStyle :: String -> SDoc +pprStringInCStyle :: [Word8] -> SDoc pprStringInCStyle s = doubleQuotes (text (concatMap charToC s)) -charToC :: Char -> String -charToC '\"' = "\\\"" -charToC '\'' = "\\\'" -charToC '\\' = "\\\\" -charToC c | c >= ' ' && c <= '~' = [c] - | c > '\xFF' = panic ("charToC "++show c) - | otherwise = ['\\', +charToC :: Word8 -> String +charToC w = + case chr (fromIntegral w) of + '\"' -> "\\\"" + '\'' -> "\\\'" + '\\' -> "\\\\" + c | c >= ' ' && c <= '~' -> [c] + | otherwise -> ['\\', chr (ord '0' + ord c `div` 64), chr (ord '0' + ord c `div` 8 `mod` 8), chr (ord '0' + ord c `mod` 8)] - -- --------------------------------------------------------------------------- -- Initialising static objects with floating-point numbers. We can't -- just emit the floating point number, because C will cast it to an int diff --git a/ghc/compiler/cmm/PprCmm.hs b/ghc/compiler/cmm/PprCmm.hs index 3c3e976..6e8367d 100644 --- a/ghc/compiler/cmm/PprCmm.hs +++ b/ghc/compiler/cmm/PprCmm.hs @@ -51,6 +51,7 @@ import FastString ( mkFastString ) import Data.List ( intersperse, groupBy ) import IO ( Handle ) import Maybe ( isJust ) +import Data.Char ( chr ) pprCmms :: [Cmm] -> SDoc pprCmms cmms = pprCode CStyle (vcat (intersperse separator $ map ppr cmms)) @@ -391,7 +392,8 @@ pprStatic s = case s of CmmUninitialised i -> nest 4 $ text "I8" <> brackets (int i) CmmAlign i -> nest 4 $ text "align" <+> int i CmmDataLabel clbl -> pprCLabel clbl <> colon - CmmString s' -> nest 4 $ text "I8[]" <+> doubleQuotes (text s') + CmmString s' -> nest 4 $ text "I8[]" <+> + doubleQuotes (text (map (chr.fromIntegral) s')) -- -------------------------------------------------------------------------- -- Registers, whether local (temps) or global diff --git a/ghc/compiler/codeGen/CgProf.hs b/ghc/compiler/codeGen/CgProf.hs index aaab2fc..1488e34 100644 --- a/ghc/compiler/codeGen/CgProf.hs +++ b/ghc/compiler/codeGen/CgProf.hs @@ -43,7 +43,7 @@ import MachOp import CmmUtils ( zeroCLit, mkIntCLit, mkLblExpr ) import CLabel ( mkCCLabel, mkCCSLabel, mkRtsDataLabel ) -import Module ( moduleUserString ) +import Module ( moduleString ) import Id ( Id ) import CostCentre import StgSyn ( GenStgExpr(..), StgExpr ) @@ -292,7 +292,7 @@ emitCostCentreDecl -> Code emitCostCentreDecl cc = do { label <- mkStringCLit (costCentreUserName cc) - ; modl <- mkStringCLit (moduleUserString (cc_mod cc)) + ; modl <- mkStringCLit (moduleString (cc_mod cc)) ; let lits = [ zero, -- StgInt ccID, label, -- char *label, diff --git a/ghc/compiler/codeGen/CgUtils.hs b/ghc/compiler/codeGen/CgUtils.hs index 68958d2..2f69927 100644 --- a/ghc/compiler/codeGen/CgUtils.hs +++ b/ghc/compiler/codeGen/CgUtils.hs @@ -54,11 +54,12 @@ import ListSetOps ( assocDefault ) import Util ( filterOut, sortLe ) import DynFlags ( DynFlags(..), HscTarget(..) ) import Packages ( HomeModules ) -import FastString ( LitString, FastString, unpackFS ) +import FastString ( LitString, FastString, bytesFS ) import Outputable import Char ( ord ) import DATA_BITS +import DATA_WORD ( Word8 ) import Maybe ( isNothing ) ------------------------------------------------------------------------- @@ -77,7 +78,8 @@ addIdReps ids = [(idCgRep id, id) | id <- ids] ------------------------------------------------------------------------- cgLit :: Literal -> FCode CmmLit -cgLit (MachStr s) = mkStringCLit (unpackFS s) +cgLit (MachStr s) = mkByteStringCLit (bytesFS s) + -- not unpackFS; we want the UTF-8 byte stream. cgLit other_lit = return (mkSimpleLit other_lit) mkSimpleLit :: Literal -> CmmLit @@ -308,10 +310,13 @@ emitRODataLits lbl lits mkStringCLit :: String -> FCode CmmLit -- Make a global definition for the string, -- and return its label -mkStringCLit str +mkStringCLit str = mkByteStringCLit (map (fromIntegral.ord) str) + +mkByteStringCLit :: [Word8] -> FCode CmmLit +mkByteStringCLit bytes = do { uniq <- newUnique ; let lbl = mkStringLitLabel uniq - ; emitData ReadOnlyData [CmmDataLabel lbl, CmmString str] + ; emitData ReadOnlyData [CmmDataLabel lbl, CmmString bytes] ; return (CmmLabel lbl) } ------------------------------------------------------------------------- diff --git a/ghc/compiler/codeGen/ClosureInfo.lhs b/ghc/compiler/codeGen/ClosureInfo.lhs index b0e9e23..a5362e6 100644 --- a/ghc/compiler/codeGen/ClosureInfo.lhs +++ b/ghc/compiler/codeGen/ClosureInfo.lhs @@ -69,7 +69,7 @@ import StaticFlags ( opt_SccProfilingOn, opt_OmitBlackHoling, import Id ( Id, idType, idArity, idName ) import DataCon ( DataCon, dataConTyCon, isNullaryRepDataCon, dataConName ) import Name ( Name, nameUnique, getOccName, getOccString ) -import OccName ( occNameUserString ) +import OccName ( occNameString ) import Type ( isUnLiftedType, Type, repType, splitTyConApp_maybe ) import TcType ( tcSplitSigmaTy ) import TyCon ( isFunTyCon, isAbstractTyCon ) @@ -930,12 +930,12 @@ closureValDescr, closureTypeDescr :: ClosureInfo -> String closureValDescr (ClosureInfo {closureDescr = descr}) = descr closureValDescr (ConInfo {closureCon = con}) - = occNameUserString (getOccName con) + = occNameString (getOccName con) closureTypeDescr (ClosureInfo { closureType = ty }) = getTyDescription ty closureTypeDescr (ConInfo { closureCon = data_con }) - = occNameUserString (getOccName (dataConTyCon data_con)) + = occNameString (getOccName (dataConTyCon data_con)) getTyDescription :: Type -> String getTyDescription ty diff --git a/ghc/compiler/deSugar/Check.lhs b/ghc/compiler/deSugar/Check.lhs index 309aab2..6d7784d 100644 --- a/ghc/compiler/deSugar/Check.lhs +++ b/ghc/compiler/deSugar/Check.lhs @@ -18,7 +18,8 @@ import DsUtils ( EquationInfo(..), MatchResult(..), import MatchLit ( tidyLitPat, tidyNPat ) import Id ( Id, idType ) import DataCon ( DataCon, dataConTyCon, dataConOrigArgTys, dataConFieldLabels ) -import Name ( Name, mkInternalName, getOccName, isDataSymOcc, getName, mkVarOcc ) +import Name ( Name, mkInternalName, getOccName, isDataSymOcc, + getName, mkVarOccFS ) import TysWiredIn import PrelNames ( unboundKey ) import TyCon ( tyConDataCons, tupleTyConBoxity, isTupleTyCon ) @@ -382,7 +383,7 @@ make_row_vars used_lits (_, EqnInfo { eqn_pats = pats}) new_var = hash_x hash_x = mkInternalName unboundKey {- doesn't matter much -} - (mkVarOcc FSLIT("#x")) + (mkVarOccFS FSLIT("#x")) noSrcLoc make_row_vars_for_constructor :: (EqnNo, EquationInfo) -> [WarningPat] diff --git a/ghc/compiler/deSugar/DsForeign.lhs b/ghc/compiler/deSugar/DsForeign.lhs index d784eb8..52956a0 100644 --- a/ghc/compiler/deSugar/DsForeign.lhs +++ b/ghc/compiler/deSugar/DsForeign.lhs @@ -28,9 +28,8 @@ import SMRep ( argMachRep, typeCgRep ) import CoreUtils ( exprType, mkInlineMe ) import Id ( Id, idType, idName, mkSysLocal, setInlinePragma ) import Literal ( Literal(..), mkStringLit ) -import Module ( moduleString ) +import Module ( moduleFS ) import Name ( getOccString, NamedThing(..) ) -import OccName ( encodeFS ) import Type ( repType, coreEqType ) import TcType ( Type, mkFunTys, mkForAllTys, mkTyConApp, mkFunTy, tcSplitTyConApp_maybe, @@ -146,7 +145,7 @@ dsFImport id (CImport cconv safety header lib spec) = dsCImport id spec cconv safety no_hdrs `thenDs` \(ids, h, c) -> returnDs (ids, h, c, if no_hdrs then Nothing else Just header) where - no_hdrs = nullFastString header + no_hdrs = nullFS header -- FIXME: the `lib' field is needed for .NET ILX generation when invoking -- routines that are external to the .NET runtime, but GHC doesn't @@ -246,7 +245,7 @@ dsFCall fn_id fcall no_hdrs the_ccall_app = mkFCall ccall_uniq fcall val_args ccall_result_ty work_rhs = mkLams tvs (mkLams work_arg_ids the_ccall_app) work_id = setImpInline no_hdrs $ -- See comments with setImpInline - mkSysLocal (encodeFS FSLIT("$wccall")) work_uniq worker_ty + mkSysLocal FSLIT("$wccall") work_uniq worker_ty -- Build the wrapper work_app = mkApps (mkVarApps (Var work_id) tvs) val_args @@ -356,7 +355,7 @@ dsFExportDynamic id cconv getModuleDs `thenDs` \ mod_name -> let -- hack: need to get at the name of the C stub we're about to generate. - fe_nm = mkFastString (moduleString mod_name ++ "_" ++ toCName fe_id) + fe_nm = mkFastString (unpackFS (zEncodeFS (moduleFS mod_name)) ++ "_" ++ toCName fe_id) in newSysLocalDs arg_ty `thenDs` \ cback -> dsLookupGlobalId newStablePtrName `thenDs` \ newStablePtrId -> diff --git a/ghc/compiler/deSugar/DsMeta.hs b/ghc/compiler/deSugar/DsMeta.hs index e5e079e..fcbcc78 100644 --- a/ghc/compiler/deSugar/DsMeta.hs +++ b/ghc/compiler/deSugar/DsMeta.hs @@ -30,16 +30,16 @@ import qualified Language.Haskell.TH as TH import HsSyn import Class (FunDep) import PrelNames ( rationalTyConName, integerTyConName, negateName ) -import OccName ( isDataOcc, isTvOcc, occNameUserString ) +import OccName ( isDataOcc, isTvOcc, occNameString ) -- To avoid clashes with DsMeta.varName we must make a local alias for OccName.varName -- we do this by removing varName from the import of OccName above, making -- a qualified instance of OccName and using OccNameAlias.varName where varName -- ws previously used in this file. import qualified OccName -import Module ( Module, mkModule, moduleUserString ) +import Module ( Module, mkModule, moduleString ) import Id ( Id, mkLocalId ) -import OccName ( mkOccFS ) +import OccName ( mkOccNameFS ) import Name ( Name, mkExternalName, localiseName, nameOccName, nameModule, isExternalName, getSrcLoc ) import NameEnv @@ -911,7 +911,7 @@ globalVar name ; MkC uni <- coreIntLit (getKey (getUnique name)) ; rep2 mkNameLName [occ,uni] } where - name_mod = moduleUserString (nameModule name) + name_mod = moduleString (nameModule name) name_occ = nameOccName name mk_varg | OccName.isDataOcc name_occ = mkNameG_dName | OccName.isVarOcc name_occ = mkNameG_vName @@ -963,7 +963,7 @@ wrapNongenSyms binds (MkC body) ; return (NonRec id var) } occNameLit :: Name -> DsM (Core String) -occNameLit n = coreStringLit (occNameUserString (nameOccName n)) +occNameLit n = coreStringLit (occNameString (nameOccName n)) -- %********************************************************************* @@ -1390,7 +1390,7 @@ thSyn = mkModule "Language.Haskell.TH.Syntax" thLib = mkModule "Language.Haskell.TH.Lib" mk_known_key_name mod space str uniq - = mkExternalName uniq mod (mkOccFS space str) + = mkExternalName uniq mod (mkOccNameFS space str) Nothing noSrcLoc libFun = mk_known_key_name thLib OccName.varName diff --git a/ghc/compiler/deSugar/DsUtils.lhs b/ghc/compiler/deSugar/DsUtils.lhs index b77bb96..1465554 100644 --- a/ghc/compiler/deSugar/DsUtils.lhs +++ b/ghc/compiler/deSugar/DsUtils.lhs @@ -69,11 +69,12 @@ import PrelNames ( unpackCStringName, unpackCStringUtf8Name, plusIntegerName, timesIntegerName, smallIntegerDataConName, lengthPName, indexPName ) import Outputable -import UnicodeUtil ( intsToUtf8 ) import SrcLoc ( Located(..), unLoc ) import Util ( isSingleton, notNull, zipEqual, sortWith ) import ListSetOps ( assocDefault ) import FastString + +import Data.Char ( ord ) \end{code} @@ -469,7 +470,7 @@ mkSmallIntegerLit small_integer_data_con i = mkConApp small_integer_data_con [mk mkStringExpr str = mkStringExprFS (mkFastString str) mkStringExprFS str - | nullFastString str + | nullFS str = returnDs (mkNilExpr charTy) | lengthFS str == 1 @@ -478,17 +479,17 @@ mkStringExprFS str in returnDs (mkConsExpr charTy the_char (mkNilExpr charTy)) - | all safeChar int_chars + | all safeChar chars = dsLookupGlobalId unpackCStringName `thenDs` \ unpack_id -> returnDs (App (Var unpack_id) (Lit (MachStr str))) | otherwise = dsLookupGlobalId unpackCStringUtf8Name `thenDs` \ unpack_id -> - returnDs (App (Var unpack_id) (Lit (MachStr (mkFastString (intsToUtf8 int_chars))))) + returnDs (App (Var unpack_id) (Lit (MachStr str))) where - int_chars = unpackIntFS str - safeChar c = c >= 1 && c <= 0xFF + chars = unpackFS str + safeChar c = ord c >= 1 && ord c <= 0xFF \end{code} diff --git a/ghc/compiler/ghci/ByteCodeGen.lhs b/ghc/compiler/ghci/ByteCodeGen.lhs index 4d2fa73..f526ed9 100644 --- a/ghc/compiler/ghci/ByteCodeGen.lhs +++ b/ghc/compiler/ghci/ByteCodeGen.lhs @@ -53,7 +53,8 @@ import OrdList import Constants ( wORD_SIZE ) import Data.List ( intersperse, sortBy, zip4, zip5, partition ) -import Foreign ( Ptr, castPtr, mallocBytes, pokeByteOff, Word8 ) +import Foreign ( Ptr, castPtr, mallocBytes, pokeByteOff, Word8, + withForeignPtr ) import Foreign.C ( CInt ) import Control.Exception ( throwDyn ) @@ -1084,18 +1085,18 @@ pushAtom d p (AnnLit lit) pushStr s = let getMallocvilleAddr = case s of - FastString _ l ba -> - -- sigh, a string in the heap is no good to us. - -- We need a static C pointer, since the type of - -- a string literal is Addr#. So, copy the string - -- into C land and remember the pointer so we can - -- free it later. - let n = I# l - -- CAREFUL! Chars are 32 bits in ghc 4.09+ - in ioToBc (mallocBytes (n+1)) `thenBc` \ ptr -> + FastString _ n _ fp _ -> + -- we could grab the Ptr from the ForeignPtr, + -- but then we have no way to control its lifetime. + -- In reality it'll probably stay alive long enoungh + -- by virtue of the global FastString table, but + -- to be on the safe side we copy the string into + -- a malloc'd area of memory. + ioToBc (mallocBytes (n+1)) `thenBc` \ ptr -> recordMallocBc ptr `thenBc_` ioToBc ( - do memcpy ptr ba (fromIntegral n) + withForeignPtr fp $ \p -> do + memcpy ptr p (fromIntegral n) pokeByteOff ptr n (fromIntegral (ord '\0') :: Word8) return ptr ) @@ -1110,7 +1111,7 @@ pushAtom d p other (pprCoreExpr (deAnnotate (undefined, other))) foreign import ccall unsafe "memcpy" - memcpy :: Ptr a -> ByteArray# -> CInt -> IO () + memcpy :: Ptr a -> Ptr b -> CInt -> IO () -- ----------------------------------------------------------------------------- diff --git a/ghc/compiler/ghci/ByteCodeLink.lhs b/ghc/compiler/ghci/ByteCodeLink.lhs index ee64b8a..875f1d6 100644 --- a/ghc/compiler/ghci/ByteCodeLink.lhs +++ b/ghc/compiler/ghci/ByteCodeLink.lhs @@ -21,10 +21,10 @@ import ObjLink ( lookupSymbol ) import Name ( Name, nameModule, nameOccName, isExternalName ) import NameEnv -import OccName ( occNameString ) +import OccName ( occNameFS ) import PrimOp ( PrimOp, primOpOcc ) -import Module ( moduleString ) -import FastString ( FastString(..), unpackFS ) +import Module ( moduleFS ) +import FastString ( FastString(..), unpackFS, zEncodeFS ) import Outputable import Panic ( GhcException(..) ) @@ -256,12 +256,12 @@ linkFail who what -- HACKS!!! ToDo: cleaner nameToCLabel :: Name -> String{-suffix-} -> String nameToCLabel n suffix - = moduleString (nameModule n) - ++ '_':occNameString (nameOccName n) ++ '_':suffix + = unpackFS (zEncodeFS (moduleFS (nameModule n))) + ++ '_': unpackFS (zEncodeFS (occNameFS (nameOccName n))) ++ '_':suffix primopToCLabel :: PrimOp -> String{-suffix-} -> String primopToCLabel primop suffix - = let str = "GHCziPrimopWrappers_" ++ occNameString (primOpOcc primop) ++ '_':suffix + = let str = "GHCziPrimopWrappers_" ++ unpackFS (zEncodeFS (occNameFS (primOpOcc primop))) ++ '_':suffix in --trace ("primopToCLabel: " ++ str) str \end{code} diff --git a/ghc/compiler/ghci/InteractiveUI.hs b/ghc/compiler/ghci/InteractiveUI.hs index 0bf37dc..8fee9ba 100644 --- a/ghc/compiler/ghci/InteractiveUI.hs +++ b/ghc/compiler/ghci/InteractiveUI.hs @@ -25,10 +25,10 @@ import PprTyThing import Outputable -- for createtags (should these come via GHC?) -import Module( moduleUserString ) -import Name( nameSrcLoc, nameModule, nameOccName ) -import OccName( pprOccName ) -import SrcLoc( isGoodSrcLoc, srcLocFile, srcLocLine, srcLocCol ) +import Module ( moduleString ) +import Name ( nameSrcLoc, nameModule, nameOccName ) +import OccName ( pprOccName ) +import SrcLoc ( isGoodSrcLoc, srcLocFile, srcLocLine, srcLocCol ) -- Other random utilities import Digraph ( flattenSCCs ) @@ -813,7 +813,7 @@ createTagsFile session tagskind tagFile = do is_interpreted <- GHC.moduleIsInterpreted session m -- should we just skip these? when (not is_interpreted) $ - throwDyn (CmdLineError ("module '" ++ moduleUserString m ++ "' is not interpreted")) + throwDyn (CmdLineError ("module '" ++ moduleString m ++ "' is not interpreted")) mbModInfo <- GHC.getModuleInfo session m let unqual diff --git a/ghc/compiler/hsSyn/Convert.lhs b/ghc/compiler/hsSyn/Convert.lhs index 96623bb..9dddd29 100644 --- a/ghc/compiler/hsSyn/Convert.lhs +++ b/ghc/compiler/hsSyn/Convert.lhs @@ -20,7 +20,8 @@ import qualified Name ( Name, mkInternalName, getName ) import Module ( Module, mkModule ) import RdrHsSyn ( mkClassDecl, mkTyData ) import qualified OccName -import OccName ( startsVarId, startsVarSym, startsConId, startsConSym ) +import OccName ( startsVarId, startsVarSym, startsConId, startsConSym, + pprNameSpace ) import SrcLoc ( Located(..), SrcSpan ) import Type ( Type ) import TysWiredIn ( unitTyCon, tupleTyCon, tupleCon, trueDataCon, nilDataCon, consDataCon ) @@ -550,7 +551,7 @@ okOcc ns str@(c:_) badOcc :: OccName.NameSpace -> String -> SDoc badOcc ctxt_ns occ - = ptext SLIT("Illegal") <+> text (OccName.nameSpaceString ctxt_ns) + = ptext SLIT("Illegal") <+> pprNameSpace ctxt_ns <+> ptext SLIT("name:") <+> quotes (text occ) thRdrName :: OccName.NameSpace -> String -> TH.NameFlavour -> RdrName @@ -605,7 +606,7 @@ mk_uniq_occ ns occ uniq -- The packing and unpacking is rather turgid :-( mk_occ :: OccName.NameSpace -> String -> OccName.OccName -mk_occ ns occ = OccName.mkOccFS ns (mkFastString occ) +mk_occ ns occ = OccName.mkOccNameFS ns (mkFastString occ) mk_ghc_ns :: TH.NameSpace -> OccName.NameSpace mk_ghc_ns TH.DataName = OccName.dataName diff --git a/ghc/compiler/hsSyn/HsDecls.lhs b/ghc/compiler/hsSyn/HsDecls.lhs index c977496..5253d11 100644 --- a/ghc/compiler/hsSyn/HsDecls.lhs +++ b/ghc/compiler/hsSyn/HsDecls.lhs @@ -721,8 +721,8 @@ instance Outputable ForeignImport where ptext SLIT("dynamic") pprCEntity _ _ (CWrapper) = ptext SLIT("wrapper") -- - pprLib lib | nullFastString lib = empty - | otherwise = char '[' <> ppr lib <> char ']' + pprLib lib | nullFS lib = empty + | otherwise = char '[' <> ppr lib <> char ']' instance Outputable ForeignExport where ppr (CExport (CExportStatic lbl cconv)) = diff --git a/ghc/compiler/hsSyn/HsUtils.lhs b/ghc/compiler/hsSyn/HsUtils.lhs index 612e57a..f8efa6c 100644 --- a/ghc/compiler/hsSyn/HsUtils.lhs +++ b/ghc/compiler/hsSyn/HsUtils.lhs @@ -27,7 +27,7 @@ import RdrName ( RdrName, getRdrName, mkRdrUnqual ) import Var ( Id ) import Type ( Type ) import DataCon ( DataCon, dataConWrapId, dataConSourceArity ) -import OccName ( mkVarOcc ) +import OccName ( mkVarOccFS ) import Name ( Name ) import BasicTypes ( RecFlag(..) ) import SrcLoc @@ -136,7 +136,7 @@ mkHsOpApp e1 op e2 = OpApp e1 (noLoc (HsVar op)) (error "mkOpApp:fixity") e2 mkHsSplice e = HsSplice unqualSplice e -unqualSplice = mkRdrUnqual (mkVarOcc FSLIT("splice")) +unqualSplice = mkRdrUnqual (mkVarOccFS FSLIT("splice")) -- A name (uniquified later) to -- identify the splice diff --git a/ghc/compiler/iface/LoadIface.lhs b/ghc/compiler/iface/LoadIface.lhs index 356cf22..8c496f7 100644 --- a/ghc/compiler/iface/LoadIface.lhs +++ b/ghc/compiler/iface/LoadIface.lhs @@ -45,7 +45,7 @@ import NameEnv import MkId ( seqId ) import Module ( Module, ModLocation(ml_hi_file), emptyModuleEnv, addBootSuffix_maybe, - extendModuleEnv, lookupModuleEnv, moduleUserString + extendModuleEnv, lookupModuleEnv, moduleString ) import OccName ( OccName, mkOccEnv, lookupOccEnv, mkClassTyConOcc, mkClassDataConOcc, mkSuperDictSelOcc, mkDataConWrapperOcc, mkDataConWorkerOcc ) @@ -312,7 +312,7 @@ loadDecl ignore_prags mod (_version, decl) -- imported name, to fix the module correctly in the cache mk_new_bndr mod mb_parent occ = newGlobalBinder mod occ mb_parent - (importedSrcLoc (moduleUserString mod)) + (importedSrcLoc (moduleString mod)) doc = ptext SLIT("Declaration for") <+> ppr (ifName decl) diff --git a/ghc/compiler/iface/MkIface.lhs b/ghc/compiler/iface/MkIface.lhs index 5be56bf..2f15ee3 100644 --- a/ghc/compiler/iface/MkIface.lhs +++ b/ghc/compiler/iface/MkIface.lhs @@ -214,7 +214,7 @@ import OccName ( OccName, OccEnv, mkOccEnv, lookupOccEnv, emptyOccEnv, isEmptyOccSet, intersectOccSet, intersectsOccSet, occNameFS, isTcOcc ) import Module ( Module, moduleFS, - ModLocation(..), mkSysModuleFS, moduleUserString, + ModLocation(..), mkModuleFS, moduleString, ModuleEnv, emptyModuleEnv, lookupModuleEnv, extendModuleEnv_C ) @@ -726,7 +726,7 @@ mkIfaceExports :: NameSet -> [(Module, [GenAvailInfo OccName])] -- Group by module and sort by occurrence -- This keeps the list in canonical order mkIfaceExports exports - = [ (mkSysModuleFS fs, eltsFM avails) + = [ (mkModuleFS fs, eltsFM avails) | (fs, avails) <- fmToList groupFM ] where @@ -768,7 +768,7 @@ checkOldIface :: HscEnv checkOldIface hsc_env mod_summary source_unchanged maybe_iface = do { showPass (hsc_dflags hsc_env) - ("Checking old interface for " ++ moduleUserString (ms_mod mod_summary)) ; + ("Checking old interface for " ++ moduleString (ms_mod mod_summary)) ; ; initIfaceCheck hsc_env $ check_old_iface mod_summary source_unchanged maybe_iface diff --git a/ghc/compiler/main/DriverMkDepend.hs b/ghc/compiler/main/DriverMkDepend.hs index fe2d8f3..80d906c 100644 --- a/ghc/compiler/main/DriverMkDepend.hs +++ b/ghc/compiler/main/DriverMkDepend.hs @@ -20,7 +20,7 @@ import HscTypes ( HscEnv, IsBootInterface, msObjFilePath, msHsFilePath ) import Packages ( PackageIdH(..) ) import SysTools ( newTempName ) import qualified SysTools -import Module ( Module, ModLocation(..), mkModule, moduleUserString, +import Module ( Module, ModLocation(..), mkModule, addBootSuffix_maybe ) import Digraph ( SCC(..) ) import Finder ( findModule, FindResult(..) ) diff --git a/ghc/compiler/main/DriverPipeline.hs b/ghc/compiler/main/DriverPipeline.hs index 85099e8..171cecf 100644 --- a/ghc/compiler/main/DriverPipeline.hs +++ b/ghc/compiler/main/DriverPipeline.hs @@ -1396,9 +1396,9 @@ getOptionsFromSource file | otherwise -> return [] getOptionsFromStringBuffer :: StringBuffer -> FilePath -> [(Int,String)] -getOptionsFromStringBuffer buffer@(StringBuffer _ len# _) fn = +getOptionsFromStringBuffer buffer@(StringBuffer _ len _) fn = let - ls = lines (lexemeToString buffer (I# len#)) -- lazy, so it's ok + ls = lines (lexemeToString buffer len) -- lazy, so it's ok in look 1 ls where diff --git a/ghc/compiler/main/Finder.lhs b/ghc/compiler/main/Finder.lhs index 81dedb8..fbde40f 100644 --- a/ghc/compiler/main/Finder.lhs +++ b/ghc/compiler/main/Finder.lhs @@ -267,7 +267,7 @@ searchPathExts paths mod exts return result where - basename = dots_to_slashes (moduleUserString mod) + basename = dots_to_slashes (moduleString mod) to_search :: [(FilePath, IO FinderCacheEntry)] to_search = [ (file, fn path basename) @@ -347,7 +347,7 @@ mkHomeModLocation2 :: DynFlags -> String -- Suffix -> IO ModLocation mkHomeModLocation2 dflags mod src_basename ext = do - let mod_basename = dots_to_slashes (moduleUserString mod) + let mod_basename = dots_to_slashes (moduleString mod) obj_fn <- mkObjPath dflags src_basename mod_basename hi_fn <- mkHiPath dflags src_basename mod_basename @@ -420,7 +420,7 @@ mkStubPaths dflags mod location = let stubdir = stubDir dflags - mod_basename = dots_to_slashes (moduleUserString mod) + mod_basename = dots_to_slashes (moduleString mod) src_basename = basenameOf (expectJust "mkStubPaths" (ml_hs_file location)) diff --git a/ghc/compiler/main/GHC.hs b/ghc/compiler/main/GHC.hs index 77cd9d4..7e0ec2f 100644 --- a/ghc/compiler/main/GHC.hs +++ b/ghc/compiler/main/GHC.hs @@ -628,7 +628,7 @@ load2 s@(Session ref) how_much mod_graph = do when (ghci_mode == BatchCompile && isJust ofile && not do_linking) $ debugTraceMsg dflags 1 (text ("Warning: output was redirected with -o, " ++ "but no output will be generated\n" ++ - "because there is no " ++ moduleUserString main_mod ++ " module.")) + "because there is no " ++ moduleString main_mod ++ " module.")) -- link everything together linkresult <- link ghci_mode dflags do_linking (hsc_HPT hsc_env1) diff --git a/ghc/compiler/main/HscTypes.lhs b/ghc/compiler/main/HscTypes.lhs index 20e84ab..48041c0 100644 --- a/ghc/compiler/main/HscTypes.lhs +++ b/ghc/compiler/main/HscTypes.lhs @@ -996,7 +996,7 @@ showModMsg use_object mod_summary char ')']) where mod = ms_mod mod_summary - mod_str = moduleUserString mod ++ hscSourceString (ms_hsc_src mod_summary) + mod_str = moduleString mod ++ hscSourceString (ms_hsc_src mod_summary) \end{code} diff --git a/ghc/compiler/nativeGen/PprMach.hs b/ghc/compiler/nativeGen/PprMach.hs index 69d6573..4392ae7 100644 --- a/ghc/compiler/nativeGen/PprMach.hs +++ b/ghc/compiler/nativeGen/PprMach.hs @@ -697,17 +697,11 @@ pprLabel :: CLabel -> Doc pprLabel lbl = pprGloblDecl lbl $$ (pprCLabel_asm lbl <> char ':') --- Assume we want to backslash-convert the string pprASCII str - = vcat (map do1 (str ++ [chr 0])) + = vcat (map do1 str) $$ do1 0 where - do1 :: Char -> Doc - do1 c = ptext SLIT("\t.byte\t0x") <> hshow (ord c) - - hshow :: Int -> Doc - hshow n | n >= 0 && n <= 255 - = char (tab !! (n `div` 16)) <> char (tab !! (n `mod` 16)) - tab = "0123456789ABCDEF" + do1 :: Word8 -> Doc + do1 w = ptext SLIT("\t.byte\t") <> int (fromIntegral w) pprAlign bytes = IF_ARCH_alpha(ptextSLIT(".align ") <> int pow2, diff --git a/ghc/compiler/ndpFlatten/FlattenMonad.hs b/ghc/compiler/ndpFlatten/FlattenMonad.hs index 43e804c..4540508 100644 --- a/ghc/compiler/ndpFlatten/FlattenMonad.hs +++ b/ghc/compiler/ndpFlatten/FlattenMonad.hs @@ -67,7 +67,6 @@ import Monad (mplus) import Panic (panic) import Outputable (Outputable(ppr), pprPanic) import UniqSupply (UniqSupply, splitUniqSupply, uniqFromSupply) -import OccName (UserFS) import Var (Var, idType) import Id (Id, mkSysLocal) import Name (Name) @@ -86,6 +85,7 @@ import PrimOp ( PrimOp(..) ) import PrelInfo ( primOpId ) import CoreSyn (Expr(..), Bind(..), CoreBndr, CoreExpr, CoreBind, mkApps) import CoreUtils (exprType) +import FastString (FastString) -- friends import NDPCoreUtils (parrElemTy) @@ -176,7 +176,7 @@ runFlatten hsc_env eps us m -- generate a new local variable whose name is based on the given lexeme and -- whose type is as specified in the second argument (EXPORTED) -- -newVar :: UserFS -> Type -> Flatten Var +newVar :: FastString -> Type -> Flatten Var newVar lexeme ty = Flatten $ \state -> let (us1, us2) = splitUniqSupply (us state) @@ -187,7 +187,7 @@ newVar lexeme ty = Flatten $ \state -> -- generate a non-recursive binding using a new binder whose name is derived -- from the given lexeme (EXPORTED) -- -mkBind :: UserFS -> CoreExpr -> Flatten (CoreBndr, CoreBind) +mkBind :: FastString -> CoreExpr -> Flatten (CoreBndr, CoreBind) mkBind lexeme e = do v <- newVar lexeme (exprType e) diff --git a/ghc/compiler/parser/Ctype.lhs b/ghc/compiler/parser/Ctype.lhs index dfdb94a..dbe4e9f 100644 --- a/ghc/compiler/parser/Ctype.lhs +++ b/ghc/compiler/parser/Ctype.lhs @@ -9,6 +9,7 @@ module Ctype , is_lower -- Char# -> Bool , is_upper -- Char# -> Bool , is_digit -- Char# -> Bool + , is_alphanum -- Char# -> Bool , is_hexdigit, is_octdigit , hexDigit, octDecDigit @@ -50,6 +51,7 @@ is_space = is_ctype cSpace is_lower = is_ctype cLower is_upper = is_ctype cUpper is_digit = is_ctype cDigit +is_alphanum = is_ctype (cLower+cUpper+cDigit) \end{code} Utils @@ -241,99 +243,99 @@ charType c = case c of '\158' -> 0 -- \236 '\159' -> 0 -- \237 '\160' -> cSpace -- - '\161' -> cAny + cSymbol -- ¡ - '\162' -> cAny + cSymbol -- ¢ - '\163' -> cAny + cSymbol -- £ - '\164' -> cAny + cSymbol -- ¤ - '\165' -> cAny + cSymbol -- ¥ - '\166' -> cAny + cSymbol -- ¦ - '\167' -> cAny + cSymbol -- § - '\168' -> cAny + cSymbol -- ¨ - '\169' -> cAny + cSymbol -- © - '\170' -> cAny + cSymbol -- ª - '\171' -> cAny + cSymbol -- « - '\172' -> cAny + cSymbol -- ¬ - '\173' -> cAny + cSymbol -- ­ - '\174' -> cAny + cSymbol -- ® - '\175' -> cAny + cSymbol -- ¯ - '\176' -> cAny + cSymbol -- ° - '\177' -> cAny + cSymbol -- ± - '\178' -> cAny + cSymbol -- ² - '\179' -> cAny + cSymbol -- ³ - '\180' -> cAny + cSymbol -- ´ - '\181' -> cAny + cSymbol -- µ - '\182' -> cAny + cSymbol -- ¶ - '\183' -> cAny + cSymbol -- · - '\184' -> cAny + cSymbol -- ¸ - '\185' -> cAny + cSymbol -- ¹ - '\186' -> cAny + cSymbol -- º - '\187' -> cAny + cSymbol -- » - '\188' -> cAny + cSymbol -- ¼ - '\189' -> cAny + cSymbol -- ½ - '\190' -> cAny + cSymbol -- ¾ - '\191' -> cAny + cSymbol -- ¿ - '\192' -> cAny + cIdent + cUpper -- À - '\193' -> cAny + cIdent + cUpper -- Á - '\194' -> cAny + cIdent + cUpper --  - '\195' -> cAny + cIdent + cUpper -- à - '\196' -> cAny + cIdent + cUpper -- Ä - '\197' -> cAny + cIdent + cUpper -- Å - '\198' -> cAny + cIdent + cUpper -- Æ - '\199' -> cAny + cIdent + cUpper -- Ç - '\200' -> cAny + cIdent + cUpper -- È - '\201' -> cAny + cIdent + cUpper -- É - '\202' -> cAny + cIdent + cUpper -- Ê - '\203' -> cAny + cIdent + cUpper -- Ë - '\204' -> cAny + cIdent + cUpper -- Ì - '\205' -> cAny + cIdent + cUpper -- Í - '\206' -> cAny + cIdent + cUpper -- Î - '\207' -> cAny + cIdent + cUpper -- Ï - '\208' -> cAny + cIdent + cUpper -- Ð - '\209' -> cAny + cIdent + cUpper -- Ñ - '\210' -> cAny + cIdent + cUpper -- Ò - '\211' -> cAny + cIdent + cUpper -- Ó - '\212' -> cAny + cIdent + cUpper -- Ô - '\213' -> cAny + cIdent + cUpper -- Õ - '\214' -> cAny + cIdent + cUpper -- Ö - '\215' -> cAny + cSymbol + cLower -- × - '\216' -> cAny + cIdent + cUpper -- Ø - '\217' -> cAny + cIdent + cUpper -- Ù - '\218' -> cAny + cIdent + cUpper -- Ú - '\219' -> cAny + cIdent + cUpper -- Û - '\220' -> cAny + cIdent + cUpper -- Ü - '\221' -> cAny + cIdent + cUpper -- Ý - '\222' -> cAny + cIdent + cUpper -- Þ - '\223' -> cAny + cIdent -- ß - '\224' -> cAny + cIdent + cLower -- à - '\225' -> cAny + cIdent + cLower -- á - '\226' -> cAny + cIdent + cLower -- â - '\227' -> cAny + cIdent + cLower -- ã - '\228' -> cAny + cIdent + cLower -- ä - '\229' -> cAny + cIdent + cLower -- å - '\230' -> cAny + cIdent + cLower -- æ - '\231' -> cAny + cIdent + cLower -- ç - '\232' -> cAny + cIdent + cLower -- è - '\233' -> cAny + cIdent + cLower -- é - '\234' -> cAny + cIdent + cLower -- ê - '\235' -> cAny + cIdent + cLower -- ë - '\236' -> cAny + cIdent + cLower -- ì - '\237' -> cAny + cIdent + cLower -- í - '\238' -> cAny + cIdent + cLower -- î - '\239' -> cAny + cIdent + cLower -- ï - '\240' -> cAny + cIdent + cLower -- ð - '\241' -> cAny + cIdent + cLower -- ñ - '\242' -> cAny + cIdent + cLower -- ò - '\243' -> cAny + cIdent + cLower -- ó - '\244' -> cAny + cIdent + cLower -- ô - '\245' -> cAny + cIdent + cLower -- õ - '\246' -> cAny + cIdent + cLower -- ö - '\247' -> cAny + cSymbol -- ÷ - '\248' -> cAny + cIdent -- ø - '\249' -> cAny + cIdent + cLower -- ù - '\250' -> cAny + cIdent + cLower -- ú - '\251' -> cAny + cIdent + cLower -- û - '\252' -> cAny + cIdent + cLower -- ü - '\253' -> cAny + cIdent + cLower -- ý - '\254' -> cAny + cIdent + cLower -- þ - '\255' -> cAny + cIdent + cLower -- ÿ + '\161' -> cAny + cSymbol -- ¡ + '\162' -> cAny + cSymbol -- ¢ + '\163' -> cAny + cSymbol -- £ + '\164' -> cAny + cSymbol -- ¤ + '\165' -> cAny + cSymbol -- Â¥ + '\166' -> cAny + cSymbol -- ¦ + '\167' -> cAny + cSymbol -- § + '\168' -> cAny + cSymbol -- ¨ + '\169' -> cAny + cSymbol -- © + '\170' -> cAny + cSymbol -- ª + '\171' -> cAny + cSymbol -- « + '\172' -> cAny + cSymbol -- ¬ + '\173' -> cAny + cSymbol -- ­ + '\174' -> cAny + cSymbol -- ® + '\175' -> cAny + cSymbol -- ¯ + '\176' -> cAny + cSymbol -- ° + '\177' -> cAny + cSymbol -- ± + '\178' -> cAny + cSymbol -- ² + '\179' -> cAny + cSymbol -- ³ + '\180' -> cAny + cSymbol -- ´ + '\181' -> cAny + cSymbol -- µ + '\182' -> cAny + cSymbol -- ¶ + '\183' -> cAny + cSymbol -- · + '\184' -> cAny + cSymbol -- ¸ + '\185' -> cAny + cSymbol -- ¹ + '\186' -> cAny + cSymbol -- º + '\187' -> cAny + cSymbol -- » + '\188' -> cAny + cSymbol -- ¼ + '\189' -> cAny + cSymbol -- ½ + '\190' -> cAny + cSymbol -- ¾ + '\191' -> cAny + cSymbol -- ¿ + '\192' -> cAny + cIdent + cUpper -- À + '\193' -> cAny + cIdent + cUpper -- Á + '\194' -> cAny + cIdent + cUpper --  + '\195' -> cAny + cIdent + cUpper -- à + '\196' -> cAny + cIdent + cUpper -- Ä + '\197' -> cAny + cIdent + cUpper -- Å + '\198' -> cAny + cIdent + cUpper -- Æ + '\199' -> cAny + cIdent + cUpper -- Ç + '\200' -> cAny + cIdent + cUpper -- È + '\201' -> cAny + cIdent + cUpper -- É + '\202' -> cAny + cIdent + cUpper -- Ê + '\203' -> cAny + cIdent + cUpper -- Ë + '\204' -> cAny + cIdent + cUpper -- Ì + '\205' -> cAny + cIdent + cUpper -- Í + '\206' -> cAny + cIdent + cUpper -- Î + '\207' -> cAny + cIdent + cUpper -- Ï + '\208' -> cAny + cIdent + cUpper -- Ð + '\209' -> cAny + cIdent + cUpper -- Ñ + '\210' -> cAny + cIdent + cUpper -- Ò + '\211' -> cAny + cIdent + cUpper -- Ó + '\212' -> cAny + cIdent + cUpper -- Ô + '\213' -> cAny + cIdent + cUpper -- Õ + '\214' -> cAny + cIdent + cUpper -- Ö + '\215' -> cAny + cSymbol + cLower -- × + '\216' -> cAny + cIdent + cUpper -- Ø + '\217' -> cAny + cIdent + cUpper -- Ù + '\218' -> cAny + cIdent + cUpper -- Ú + '\219' -> cAny + cIdent + cUpper -- Û + '\220' -> cAny + cIdent + cUpper -- Ü + '\221' -> cAny + cIdent + cUpper -- Ý + '\222' -> cAny + cIdent + cUpper -- Þ + '\223' -> cAny + cIdent -- ß + '\224' -> cAny + cIdent + cLower -- à + '\225' -> cAny + cIdent + cLower -- á + '\226' -> cAny + cIdent + cLower -- â + '\227' -> cAny + cIdent + cLower -- ã + '\228' -> cAny + cIdent + cLower -- ä + '\229' -> cAny + cIdent + cLower -- Ã¥ + '\230' -> cAny + cIdent + cLower -- æ + '\231' -> cAny + cIdent + cLower -- ç + '\232' -> cAny + cIdent + cLower -- è + '\233' -> cAny + cIdent + cLower -- é + '\234' -> cAny + cIdent + cLower -- ê + '\235' -> cAny + cIdent + cLower -- ë + '\236' -> cAny + cIdent + cLower -- ì + '\237' -> cAny + cIdent + cLower -- í + '\238' -> cAny + cIdent + cLower -- î + '\239' -> cAny + cIdent + cLower -- ï + '\240' -> cAny + cIdent + cLower -- ð + '\241' -> cAny + cIdent + cLower -- ñ + '\242' -> cAny + cIdent + cLower -- ò + '\243' -> cAny + cIdent + cLower -- ó + '\244' -> cAny + cIdent + cLower -- ô + '\245' -> cAny + cIdent + cLower -- õ + '\246' -> cAny + cIdent + cLower -- ö + '\247' -> cAny + cSymbol -- ÷ + '\248' -> cAny + cIdent -- ø + '\249' -> cAny + cIdent + cLower -- ù + '\250' -> cAny + cIdent + cLower -- ú + '\251' -> cAny + cIdent + cLower -- û + '\252' -> cAny + cIdent + cLower -- ü + '\253' -> cAny + cIdent + cLower -- ý + '\254' -> cAny + cIdent + cLower -- þ + '\255' -> cAny + cIdent + cLower -- ÿ \end{code} diff --git a/ghc/compiler/parser/Lexer.x b/ghc/compiler/parser/Lexer.x index eb00e90..38908a0 100644 --- a/ghc/compiler/parser/Lexer.x +++ b/ghc/compiler/parser/Lexer.x @@ -1,5 +1,5 @@ ----------------------------------------------------------------------------- --- (c) The University of Glasgow, 2003 +-- (c) The University of Glasgow, 2006 -- -- GHC's lexer. -- @@ -43,35 +43,38 @@ import Ctype import Util ( maybePrefixMatch, readRational ) import DATA_BITS -import Char +import Data.Char import Ratio --import TRACE } -$whitechar = [\ \t\n\r\f\v\xa0] +$unispace = \x05 +$whitechar = [\ \t\n\r\f\v\xa0 $unispace] $white_no_nl = $whitechar # \n $ascdigit = 0-9 -$unidigit = \x01 +$unidigit = \x03 +$decdigit = $ascdigit -- for now, should really be $digit (ToDo) $digit = [$ascdigit $unidigit] $special = [\(\)\,\;\[\]\`\{\}] $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~] -$unisymbol = \x02 +$unisymbol = \x04 $symbol = [$ascsymbol $unisymbol] # [$special \_\:\"\'] -$unilarge = \x03 +$unilarge = \x01 $asclarge = [A-Z \xc0-\xd6 \xd8-\xde] $large = [$asclarge $unilarge] -$unismall = \x04 +$unismall = \x02 $ascsmall = [a-z \xdf-\xf6 \xf8-\xff] $small = [$ascsmall $unismall \_] -$graphic = [$small $large $symbol $digit $special \:\"\'] +$unigraphic = \x06 +$graphic = [$small $large $symbol $digit $special $unigraphic \:\"\'] $octit = 0-7 -$hexit = [$digit A-F a-f] +$hexit = [$decdigit A-F a-f] $symchar = [$symbol \:] $nl = [\n\r] $idchar = [$small $large $digit \'] @@ -82,7 +85,7 @@ $idchar = [$small $large $digit \'] @varsym = $symbol $symchar* @consym = \: $symchar* -@decimal = $digit+ +@decimal = $decdigit+ @octal = $octit+ @hexadecimal = $hexit+ @exponent = [eE] [\-\+]? @decimal @@ -154,13 +157,13 @@ $white_no_nl+ ; -- single-line line pragmas, of the form -- # "" \n - $digit+ { setLine line_prag1a } + $decdigit+ { setLine line_prag1a } \" [$graphic \ ]* \" { setFile line_prag1b } .* { pop } -- Haskell-style line pragmas, of the form -- {-# LINE "" #-} - $digit+ { setLine line_prag2a } + $decdigit+ { setLine line_prag2a } \" [$graphic \ ]* \" { setFile line_prag2b } "#-}"|"-}" { pop } -- NOTE: accept -} at the end of a LINE pragma, for compatibility @@ -554,6 +557,13 @@ reservedSymsFM = listToUFM $ ,(">-", ITrarrowtail, bit arrowsBit) ,("-<<", ITLarrowtail, bit arrowsBit) ,(">>-", ITRarrowtail, bit arrowsBit) + +#if __GLASGOW_HASKELL__ >= 605 + ,("∀", ITforall, bit tvBit) + ,("→", ITrarrow, 0) + ,("←", ITlarrow, 0) + ,("⋯", ITdotdot, 0) +#endif ] -- ----------------------------------------------------------------------------- @@ -670,23 +680,29 @@ splitQualName :: StringBuffer -> Int -> (FastString,FastString) -- takes a StringBuffer and a length, and returns the module name -- and identifier parts of a qualified name. Splits at the *last* dot, -- because of hierarchical module names. -splitQualName orig_buf len = split orig_buf 0 0 +splitQualName orig_buf len = split orig_buf orig_buf where - split buf dot_off n - | n == len = done dot_off - | lookAhead buf n == '.' = split2 buf n (n+1) - | otherwise = split buf dot_off (n+1) + split buf dot_buf + | orig_buf `byteDiff` buf >= len = done dot_buf + | c == '.' = found_dot buf' + | otherwise = split buf' dot_buf + where + (c,buf') = nextChar buf -- careful, we might get names like M.... -- so, if the character after the dot is not upper-case, this is -- the end of the qualifier part. - split2 buf dot_off n - | isUpper (lookAhead buf n) = split buf dot_off (n+1) - | otherwise = done dot_off - - done dot_off = - (lexemeToFastString orig_buf dot_off, - lexemeToFastString (stepOnBy (dot_off+1) orig_buf) (len - dot_off -1)) + found_dot buf -- buf points after the '.' + | isUpper c = split buf' buf + | otherwise = done buf + where + (c,buf') = nextChar buf + + done dot_buf = + (lexemeToFastString orig_buf (qual_size - 1), + lexemeToFastString dot_buf (len - qual_size)) + where + qual_size = orig_buf `byteDiff` dot_buf varid span buf len = case lookupUFM reservedWordsFM fs of @@ -726,19 +742,19 @@ tok_decimal span buf len = return (L span (ITinteger $! parseInteger buf len 10 octDecDigit)) tok_octal span buf len - = return (L span (ITinteger $! parseInteger (stepOnBy 2 buf) (len-2) 8 octDecDigit)) + = return (L span (ITinteger $! parseInteger (offsetBytes 2 buf) (len-2) 8 octDecDigit)) tok_hexadecimal span buf len - = return (L span (ITinteger $! parseInteger (stepOnBy 2 buf) (len-2) 16 hexDigit)) + = return (L span (ITinteger $! parseInteger (offsetBytes 2 buf) (len-2) 16 hexDigit)) prim_decimal span buf len = return (L span (ITprimint $! parseInteger buf (len-1) 10 octDecDigit)) prim_octal span buf len - = return (L span (ITprimint $! parseInteger (stepOnBy 2 buf) (len-3) 8 octDecDigit)) + = return (L span (ITprimint $! parseInteger (offsetBytes 2 buf) (len-3) 8 octDecDigit)) prim_hexadecimal span buf len - = return (L span (ITprimint $! parseInteger (stepOnBy 2 buf) (len-3) 16 hexDigit)) + = return (L span (ITprimint $! parseInteger (offsetBytes 2 buf) (len-3) 16 hexDigit)) tok_float str = ITrational $! readRational str prim_float str = ITprimfloat $! readRational str @@ -839,7 +855,7 @@ lex_string_tok span buf len = do lex_string :: String -> P Token lex_string s = do i <- getInput - case alexGetChar i of + case alexGetChar' i of Nothing -> lit_error Just ('"',i) -> do @@ -848,14 +864,15 @@ lex_string s = do if glaexts then do i <- getInput - case alexGetChar i of + case alexGetChar' i of Just ('#',i) -> do setInput i if any (> '\xFF') s then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'" - else let s' = mkFastStringNarrow (reverse s) in - -- always a narrow string/byte array + else let s' = mkZFastString (reverse s) in return (ITprimstring s') + -- mkZFastString is a hack to avoid encoding the + -- string in UTF-8. We just want the exact bytes. _other -> return (ITstring (mkFastString (reverse s))) else @@ -866,11 +883,11 @@ lex_string s = do setInput i; lex_string s | Just (c,i) <- next, is_space c -> do setInput i; lex_stringgap s - where next = alexGetChar i + where next = alexGetChar' i - Just _ -> do - c <- lex_char - lex_string (c:s) + Just (c, i) -> do + c' <- lex_char c i + lex_string (c':s) lex_stringgap s = do c <- getCharOrFail @@ -890,7 +907,7 @@ lex_char_tok :: Action lex_char_tok span buf len = do -- We've seen ' i1 <- getInput -- Look ahead to first character let loc = srcSpanStart span - case alexGetChar i1 of + case alexGetChar' i1 of Nothing -> lit_error Just ('\'', i2@(AI end2 _ _)) -> do -- We've seen '' @@ -905,14 +922,15 @@ lex_char_tok span buf len = do -- We've seen ' lit_ch <- lex_escape mc <- getCharOrFail -- Trailing quote if mc == '\'' then finish_char_tok loc lit_ch - else lit_error + else do setInput i2; lit_error - Just (c, i2@(AI end2 _ _)) | not (is_any c) -> lit_error - | otherwise -> + Just (c, i2@(AI end2 _ _)) + | not (isAny c) -> lit_error + | otherwise -> -- We've seen 'x, where x is a valid character -- (i.e. not newline etc) but not a quote or backslash - case alexGetChar i2 of -- Look ahead one more character + case alexGetChar' i2 of -- Look ahead one more character Nothing -> lit_error Just ('\'', i3) -> do -- We've seen 'x' setInput i3 @@ -922,7 +940,7 @@ lex_char_tok span buf len = do -- We've seen ' th_exts <- extension thEnabled let (AI end _ _) = i1 if th_exts then return (L (mkSrcSpan loc end) ITvarQuote) - else lit_error + else do setInput i2; lit_error finish_char_tok :: SrcLoc -> Char -> P (Located Token) finish_char_tok loc ch -- We've already seen the closing quote @@ -930,7 +948,7 @@ finish_char_tok loc ch -- We've already seen the closing quote = do glaexts <- extension glaExtsEnabled i@(AI end _ _) <- getInput if glaexts then do - case alexGetChar i of + case alexGetChar' i of Just ('#',i@(AI end _ _)) -> do setInput i return (L (mkSrcSpan loc end) (ITprimchar ch)) @@ -939,14 +957,16 @@ finish_char_tok loc ch -- We've already seen the closing quote else do return (L (mkSrcSpan loc end) (ITchar ch)) -lex_char :: P Char -lex_char = do - mc <- getCharOrFail - case mc of - '\\' -> lex_escape - c | is_any c -> return c +lex_char :: Char -> AlexInput -> P Char +lex_char c inp = do + case c of + '\\' -> do setInput inp; lex_escape + c | isAny c -> do setInput inp; return c _other -> lit_error +isAny c | c > '\xff' = isPrint c + | otherwise = is_any c + lex_escape :: P Char lex_escape = do c <- getCharOrFail @@ -972,11 +992,11 @@ lex_escape = do c1 -> do i <- getInput - case alexGetChar i of + case alexGetChar' i of Nothing -> lit_error Just (c2,i2) -> - case alexGetChar i2 of - Nothing -> lit_error + case alexGetChar' i2 of + Nothing -> do setInput i2; lit_error Just (c3,i3) -> let str = [c1,c2,c3] in case [ (c,rest) | (p,c) <- silly_escape_chars, @@ -991,22 +1011,22 @@ lex_escape = do readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char readNum is_digit base conv = do + i <- getInput c <- getCharOrFail if is_digit c then readNum2 is_digit base conv (conv c) - else lit_error + else do setInput i; lit_error readNum2 is_digit base conv i = do input <- getInput read i input where read i input = do - case alexGetChar input of + case alexGetChar' input of Just (c,input') | is_digit c -> do read (i*base + conv c) input' _other -> do - setInput input if i >= 0 && i <= 0x10FFFF - then return (chr i) + then do setInput input; return (chr i) else lit_error silly_escape_chars = [ @@ -1046,12 +1066,16 @@ silly_escape_chars = [ ("DEL", '\DEL') ] +-- before calling lit_error, ensure that the current input is pointing to +-- the position of the error in the buffer. This is so that we can report +-- a correct location to the user, but also so we can detect UTF-8 decoding +-- errors if they occur. lit_error = lexError "lexical error in string/character literal" getCharOrFail :: P Char getCharOrFail = do i <- getInput - case alexGetChar i of + case alexGetChar' i of Nothing -> lexError "unexpected end-of-file in string/character literal" Just (c,i) -> do setInput i; return c @@ -1134,21 +1158,74 @@ setLastToken loc len = P $ \s -> POk s{ last_loc=loc, last_len=len } () data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer alexInputPrevChar :: AlexInput -> Char -alexInputPrevChar (AI _ _ s) = prevChar s '\n' +alexInputPrevChar (AI _ _ buf) = prevChar buf '\n' alexGetChar :: AlexInput -> Maybe (Char,AlexInput) alexGetChar (AI loc ofs s) | atEnd s = Nothing - | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq` Just (c, (AI loc' ofs' s')) - where c = currentChar s - loc' = advanceSrcLoc loc c - ofs' = advanceOffs c ofs - s' = stepOn s + | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq` + Just (adj_c, (AI loc' ofs' s')) + where (c,s') = nextChar s + loc' = advanceSrcLoc loc c + ofs' = advanceOffs c ofs + + non_graphic = '\x0' + upper = '\x1' + lower = '\x2' + digit = '\x3' + symbol = '\x4' + space = '\x5' + other_graphic = '\x6' + + adj_c +#if __GLASGOW_HASKELL__ < 605 + = c -- no Unicode support +#else + | c <= '\x04' = non_graphic + | c <= '\xff' = c + | otherwise = + case generalCategory c of + UppercaseLetter -> upper + LowercaseLetter -> lower + TitlecaseLetter -> upper + ModifierLetter -> other_graphic + OtherLetter -> other_graphic + NonSpacingMark -> other_graphic + SpacingCombiningMark -> other_graphic + EnclosingMark -> other_graphic + DecimalNumber -> digit + LetterNumber -> other_graphic + OtherNumber -> other_graphic + ConnectorPunctuation -> other_graphic + DashPunctuation -> other_graphic + OpenPunctuation -> other_graphic + ClosePunctuation -> other_graphic + InitialQuote -> other_graphic + FinalQuote -> other_graphic + OtherPunctuation -> other_graphic + MathSymbol -> symbol + CurrencySymbol -> symbol + ModifierSymbol -> symbol + OtherSymbol -> symbol + Space -> space + _other -> non_graphic +#endif + +-- This version does not squash unicode characters, it is used when +-- lexing strings. +alexGetChar' :: AlexInput -> Maybe (Char,AlexInput) +alexGetChar' (AI loc ofs s) + | atEnd s = Nothing + | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq` + Just (c, (AI loc' ofs' s')) + where (c,s') = nextChar s + loc' = advanceSrcLoc loc c + ofs' = advanceOffs c ofs - advanceOffs :: Char -> Int -> Int - advanceOffs '\n' offs = 0 - advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8 - advanceOffs _ offs = offs + 1 +advanceOffs :: Char -> Int -> Int +advanceOffs '\n' offs = 0 +advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8 +advanceOffs _ offs = offs + 1 getInput :: P AlexInput getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b) @@ -1255,7 +1332,7 @@ srcParseErr buf len else hcat [ptext SLIT("parse error on input "), char '`', text token, char '\''] ] - where token = lexemeToString (stepOnBy (-len) buf) len + where token = lexemeToString (offsetBytes (-len) buf) len -- Report a parse failure, giving the span of the previous token as -- the location of the error. This is the entry point for errors @@ -1266,14 +1343,12 @@ srcParseFail = P $ \PState{ buffer = buf, last_len = len, PFailed last_loc (srcParseErr buf len) -- A lexical error is reported at a particular position in the source file, --- not over a token range. TODO: this is slightly wrong, because we record --- the error at the character position following the one which caused the --- error. We should somehow back up by one character. +-- not over a token range. lexError :: String -> P a lexError str = do loc <- getSrcLoc - i@(AI end _ _) <- getInput - failLocMsgP loc end str + i@(AI end _ buf) <- getInput + reportLexError loc end buf False str -- ----------------------------------------------------------------------------- -- This is the top-level function: called from the parser each time a @@ -1282,7 +1357,7 @@ lexError str = do lexer :: (Located Token -> P a) -> P a lexer cont = do tok@(L _ tok__) <- lexToken - -- trace ("token: " ++ show tok__) $ do + --trace ("token: " ++ show tok__) $ do cont tok lexToken :: P (Located Token) @@ -1294,13 +1369,24 @@ lexToken = do AlexEOF -> do let span = mkSrcSpan loc1 loc1 setLastToken span 0 return (L span ITeof) - AlexError (AI loc2 _ _) -> do failLocMsgP loc1 loc2 "lexical error" + AlexError (AI loc2 _ buf) -> do + reportLexError loc1 loc2 buf True "lexical error" AlexSkip inp2 _ -> do setInput inp2 lexToken AlexToken inp2@(AI end _ buf2) len t -> do setInput inp2 let span = mkSrcSpan loc1 end - span `seq` setLastToken span len - t span buf len + let bytes = byteDiff buf buf2 + span `seq` setLastToken span bytes + t span buf bytes + +reportLexError loc1 loc2 buf is_prev str = + let + c | is_prev = prevChar buf '\0' + | otherwise = fst (nextChar buf) + in + if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar# + then failLocMsgP loc2 loc2 "UTF-8 decoding error" + else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c) } diff --git a/ghc/compiler/parser/Parser.y.pp b/ghc/compiler/parser/Parser.y.pp index 844cc86..b4acb89 100644 --- a/ghc/compiler/parser/Parser.y.pp +++ b/ghc/compiler/parser/Parser.y.pp @@ -25,7 +25,7 @@ import Type ( funTyCon ) import ForeignCall ( Safety(..), CExportSpec(..), CLabelString, CCallConv(..), CCallTarget(..), defaultCCallConv ) -import OccName ( UserFS, varName, dataName, tcClsName, tvName ) +import OccName ( varName, dataName, tcClsName, tvName ) import DataCon ( DataCon, dataConName ) import SrcLoc ( Located(..), unLoc, getLoc, noLoc, combineSrcSpans, SrcSpan, combineLocs, srcLocFile, @@ -1469,7 +1469,7 @@ varsym_no_minus :: { Located RdrName } -- varsym not including '-' -- These special_ids are treated as keywords in various places, -- but as ordinary ids elsewhere. 'special_id' collects all these -- except 'unsafe' and 'forall' whose treatment differs depending on context -special_id :: { Located UserFS } +special_id :: { Located FastString } special_id : 'as' { L1 FSLIT("as") } | 'qualified' { L1 FSLIT("qualified") } @@ -1480,7 +1480,7 @@ special_id | 'stdcall' { L1 FSLIT("stdcall") } | 'ccall' { L1 FSLIT("ccall") } -special_sym :: { Located UserFS } +special_sym :: { Located FastString } special_sym : '!' { L1 FSLIT("!") } | '.' { L1 FSLIT(".") } | '*' { L1 FSLIT("*") } diff --git a/ghc/compiler/parser/ParserCore.y b/ghc/compiler/parser/ParserCore.y index d8fceeb..3210583 100644 --- a/ghc/compiler/parser/ParserCore.y +++ b/ghc/compiler/parser/ParserCore.y @@ -72,7 +72,7 @@ module :: { HsExtCore RdrName } : '%module' modid tdefs vdefgs { HsExtCore $2 $3 $4 } modid :: { Module } - : CNAME { mkSysModuleFS (mkFastString $1) } + : CNAME { mkModuleFS (mkFastString $1) } ------------------------------------------------------------- -- Type and newtype declarations are in HsSyn syntax @@ -262,25 +262,25 @@ lit :: { Literal } | '(' STRING '::' aty ')' { MachStr (mkFastString $2) } tv_occ :: { OccName } - : NAME { mkSysOcc tvName $1 } + : NAME { mkOccName tvName $1 } var_occ :: { OccName } - : NAME { mkSysOcc varName $1 } + : NAME { mkVarOcc $1 } -- Type constructor q_tc_name :: { IfaceExtName } - : modid '.' CNAME { ExtPkg $1 (mkSysOcc tcName $3) } + : modid '.' CNAME { ExtPkg $1 (mkOccName tcName $3) } -- Data constructor in a pattern or data type declaration; use the dataName, -- because that's what we expect in Core case patterns d_pat_occ :: { OccName } - : CNAME { mkSysOcc dataName $1 } + : CNAME { mkOccName dataName $1 } -- Data constructor occurrence in an expression; -- use the varName because that's the worker Id d_occ :: { OccName } - : CNAME { mkSysOcc varName $1 } + : CNAME { mkVarOcc $1 } { diff --git a/ghc/compiler/parser/RdrHsSyn.lhs b/ghc/compiler/parser/RdrHsSyn.lhs index 2d18d6d..6ff15e7 100644 --- a/ghc/compiler/parser/RdrHsSyn.lhs +++ b/ghc/compiler/parser/RdrHsSyn.lhs @@ -59,7 +59,7 @@ import TysWiredIn ( unitTyCon ) import ForeignCall ( CCallConv, Safety, CCallTarget(..), CExportSpec(..), DNCallSpec(..), DNKind(..), CLabelString ) import OccName ( srcDataName, varName, isDataOcc, isTcOcc, - occNameUserString ) + occNameString ) import SrcLoc import OrdList ( OrdList, fromOL ) import Bag ( Bag, emptyBag, snocBag, consBag, foldrBag ) @@ -800,8 +800,8 @@ mkExport :: CallConv mkExport (CCall cconv) (L loc entity, v, ty) = return $ ForD (ForeignExport v ty (CExport (CExportStatic entity' cconv)) False) where - entity' | nullFastString entity = mkExtName (unLoc v) - | otherwise = entity + entity' | nullFS entity = mkExtName (unLoc v) + | otherwise = entity mkExport DNCall (L loc entity, v, ty) = parseError (getLoc v){-TODO: not quite right-} "Foreign export is not yet supported for .NET" @@ -811,10 +811,9 @@ mkExport DNCall (L loc entity, v, ty) = -- of the Haskell name is then performed, so if you foreign export (++), -- it's external name will be "++". Too bad; it's important because we don't -- want z-encoding (e.g. names with z's in them shouldn't be doubled) --- (This is why we use occNameUserString.) -- mkExtName :: RdrName -> CLabelString -mkExtName rdrNm = mkFastString (occNameUserString (rdrNameOcc rdrNm)) +mkExtName rdrNm = mkFastString (occNameString (rdrNameOcc rdrNm)) \end{code} diff --git a/ghc/compiler/prelude/PrelNames.lhs b/ghc/compiler/prelude/PrelNames.lhs index 0d99121..eb26d34 100644 --- a/ghc/compiler/prelude/PrelNames.lhs +++ b/ghc/compiler/prelude/PrelNames.lhs @@ -50,9 +50,8 @@ module PrelNames ( #include "HsVersions.h" import Module ( Module, mkModule ) -import OccName ( dataName, tcName, clsName, varName, mkOccFS - ) - +import OccName ( dataName, tcName, clsName, varName, mkOccNameFS, + mkVarOccFS ) import RdrName ( RdrName, nameRdrName, mkOrig, rdrNameOcc, mkUnqual ) import Unique ( Unique, Uniquable(..), hasKey, mkPreludeMiscIdUnique, mkPreludeDataConUnique, @@ -75,7 +74,7 @@ import FastString This *local* name is used by the interactive stuff \begin{code} -itName uniq = mkInternalName uniq (mkOccFS varName FSLIT("it")) noSrcLoc +itName uniq = mkInternalName uniq (mkOccNameFS varName FSLIT("it")) noSrcLoc \end{code} \begin{code} @@ -415,10 +414,10 @@ inrDataCon_RDR = dataQual_RDR pREL_BASE FSLIT("Inr") genUnitDataCon_RDR = dataQual_RDR pREL_BASE FSLIT("Unit") ---------------------- -varQual_RDR mod str = mkOrig mod (mkOccFS varName str) -tcQual_RDR mod str = mkOrig mod (mkOccFS tcName str) -clsQual_RDR mod str = mkOrig mod (mkOccFS clsName str) -dataQual_RDR mod str = mkOrig mod (mkOccFS dataName str) +varQual_RDR mod str = mkOrig mod (mkOccNameFS varName str) +tcQual_RDR mod str = mkOrig mod (mkOccNameFS tcName str) +clsQual_RDR mod str = mkOrig mod (mkOccNameFS clsName str) +dataQual_RDR mod str = mkOrig mod (mkOccNameFS dataName str) \end{code} %************************************************************************ @@ -656,17 +655,17 @@ tcQual = mk_known_key_name tcName clsQual = mk_known_key_name clsName mk_known_key_name space mod str uniq - = mkExternalName uniq mod (mkOccFS space str) + = mkExternalName uniq mod (mkOccNameFS space str) Nothing noSrcLoc conName :: Name -> FastString -> Unique -> Name conName tycon occ uniq - = mkExternalName uniq (nameModule tycon) (mkOccFS dataName occ) + = mkExternalName uniq (nameModule tycon) (mkOccNameFS dataName occ) (Just tycon) noSrcLoc methName :: Name -> FastString -> Unique -> Name methName cls occ uniq - = mkExternalName uniq (nameModule cls) (mkOccFS varName occ) + = mkExternalName uniq (nameModule cls) (mkVarOccFS occ) (Just cls) noSrcLoc \end{code} diff --git a/ghc/compiler/prelude/PrelRules.lhs b/ghc/compiler/prelude/PrelRules.lhs index e0b2347..9cdddc9 100644 --- a/ghc/compiler/prelude/PrelRules.lhs +++ b/ghc/compiler/prelude/PrelRules.lhs @@ -36,7 +36,7 @@ import TyCon ( tyConDataCons_maybe, isEnumerationTyCon, isNewTyCon ) import DataCon ( dataConTag, dataConTyCon, dataConWorkId, fIRST_TAG ) import CoreUtils ( cheapEqExpr, exprIsConApp_maybe ) import Type ( tyConAppTyCon, coreEqType ) -import OccName ( occNameUserString) +import OccName ( occNameFS ) import PrelNames ( unpackCStringFoldrName, unpackCStringFoldrIdKey, hasKey, eqStringName, unpackCStringIdKey ) import Maybes ( orElse ) @@ -58,7 +58,7 @@ import DATA_WORD ( Word64 ) primOpRules :: PrimOp -> Name -> [CoreRule] primOpRules op op_name = primop_rule op where - rule_name = mkFastString (occNameUserString (primOpOcc op)) + rule_name = occNameFS (primOpOcc op) rule_name_case = rule_name `appendFS` FSLIT("->case") -- A useful shorthand diff --git a/ghc/compiler/prelude/PrimOp.lhs b/ghc/compiler/prelude/PrimOp.lhs index e99eb9d..a650352 100644 --- a/ghc/compiler/prelude/PrimOp.lhs +++ b/ghc/compiler/prelude/PrimOp.lhs @@ -22,7 +22,7 @@ import TysWiredIn import NewDemand import Var ( TyVar ) -import OccName ( OccName, pprOccName, mkVarOcc ) +import OccName ( OccName, pprOccName, mkVarOccFS ) import TyCon ( TyCon, isPrimTyCon, tyConPrimRep, PrimRep(..) ) import Type ( Type, mkForAllTys, mkFunTy, mkFunTys, tyConAppTyCon, typePrimRep ) @@ -113,10 +113,10 @@ data PrimOpInfo [Type] Type -mkDyadic str ty = Dyadic (mkVarOcc str) ty -mkMonadic str ty = Monadic (mkVarOcc str) ty -mkCompare str ty = Compare (mkVarOcc str) ty -mkGenPrimOp str tvs tys ty = GenPrimOp (mkVarOcc str) tvs tys ty +mkDyadic str ty = Dyadic (mkVarOccFS str) ty +mkMonadic str ty = Monadic (mkVarOccFS str) ty +mkCompare str ty = Compare (mkVarOccFS str) ty +mkGenPrimOp str tvs tys ty = GenPrimOp (mkVarOccFS str) tvs tys ty \end{code} %************************************************************************ diff --git a/ghc/compiler/prelude/TysPrim.lhs b/ghc/compiler/prelude/TysPrim.lhs index 7d397d6..2f6168b 100644 --- a/ghc/compiler/prelude/TysPrim.lhs +++ b/ghc/compiler/prelude/TysPrim.lhs @@ -46,7 +46,7 @@ module TysPrim( import Var ( TyVar, mkTyVar ) import Name ( Name, BuiltInSyntax(..), mkInternalName, mkWiredInName ) -import OccName ( mkOccFS, tcName, mkTyVarOcc ) +import OccName ( mkOccNameFS, tcName, mkTyVarOcc ) import TyCon ( TyCon, ArgVrcs, mkPrimTyCon, mkLiftedPrimTyCon, PrimRep(..) ) import Type ( mkTyConApp, mkTyConTy, mkTyVarTys, mkTyVarTy, @@ -100,7 +100,7 @@ primTyCons mkPrimTc :: FastString -> Unique -> TyCon -> Name mkPrimTc fs uniq tycon - = mkWiredInName gHC_PRIM (mkOccFS tcName fs) + = mkWiredInName gHC_PRIM (mkOccNameFS tcName fs) uniq Nothing -- No parent object (ATyCon tycon) -- Relevant TyCon diff --git a/ghc/compiler/prelude/TysWiredIn.lhs b/ghc/compiler/prelude/TysWiredIn.lhs index e7dea60..ceb4df5 100644 --- a/ghc/compiler/prelude/TysWiredIn.lhs +++ b/ghc/compiler/prelude/TysWiredIn.lhs @@ -62,16 +62,18 @@ import Module ( Module ) import RdrName ( nameRdrName ) import Name ( Name, BuiltInSyntax(..), nameUnique, nameOccName, nameModule, mkWiredInName ) -import OccName ( mkOccFS, tcName, dataName, mkTupleOcc, mkDataConWorkerOcc ) +import OccName ( mkOccNameFS, tcName, dataName, mkTupleOcc, + mkDataConWorkerOcc ) import DataCon ( DataCon, mkDataCon, dataConWorkId, dataConSourceArity ) import Var ( TyVar, tyVarKind ) import TyCon ( TyCon, AlgTyConRhs(DataTyCon), tyConDataCons, - mkTupleTyCon, mkAlgTyCon, tyConName - ) + mkTupleTyCon, mkAlgTyCon, tyConName ) -import BasicTypes ( Arity, RecFlag(..), Boxity(..), isBoxed, StrictnessMark(..) ) +import BasicTypes ( Arity, RecFlag(..), Boxity(..), isBoxed, + StrictnessMark(..) ) -import Type ( Type, mkTyConTy, mkTyConApp, mkTyVarTy, mkTyVarTys, TyThing(..) ) +import Type ( Type, mkTyConTy, mkTyConApp, mkTyVarTy, mkTyVarTys, + TyThing(..) ) import Kind ( mkArrowKinds, liftedTypeKind, ubxTupleKind ) import Unique ( incrUnique, mkTupleTyConUnique, mkTupleDataConUnique, mkPArrDataConUnique ) @@ -114,14 +116,14 @@ wiredInTyCons = [ unitTyCon -- Not treated like other tuples, because \begin{code} mkWiredInTyConName :: BuiltInSyntax -> Module -> FastString -> Unique -> TyCon -> Name mkWiredInTyConName built_in mod fs uniq tycon - = mkWiredInName mod (mkOccFS tcName fs) uniq + = mkWiredInName mod (mkOccNameFS tcName fs) uniq Nothing -- No parent object (ATyCon tycon) -- Relevant TyCon built_in mkWiredInDataConName :: BuiltInSyntax -> Module -> FastString -> Unique -> DataCon -> Name -> Name mkWiredInDataConName built_in mod fs uniq datacon parent - = mkWiredInName mod (mkOccFS dataName fs) uniq + = mkWiredInName mod (mkOccNameFS dataName fs) uniq (Just parent) -- Name of parent TyCon (ADataCon datacon) -- Relevant DataCon built_in @@ -535,7 +537,7 @@ mkPArrFakeCon arity = data_con tyvar = head alphaTyVars tyvarTys = replicate arity $ mkTyVarTy tyvar nameStr = mkFastString ("MkPArr" ++ show arity) - name = mkWiredInName pREL_PARR (mkOccFS dataName nameStr) uniq + name = mkWiredInName pREL_PARR (mkOccNameFS dataName nameStr) uniq Nothing (ADataCon data_con) UserSyntax uniq = mkPArrDataConUnique arity diff --git a/ghc/compiler/profiling/CostCentre.lhs b/ghc/compiler/profiling/CostCentre.lhs index 3616ccb..f4a6ba9 100644 --- a/ghc/compiler/profiling/CostCentre.lhs +++ b/ghc/compiler/profiling/CostCentre.lhs @@ -32,9 +32,7 @@ module CostCentre ( #include "HsVersions.h" import Var ( Id ) -import Name ( UserFS, EncodedFS, encodeFS, decode, - getOccName, occNameFS - ) +import Name ( getOccName, occNameFS ) import Module ( Module ) import Outputable import FastTypes @@ -120,7 +118,7 @@ data CostCentre cc_mod :: Module -- Name of module defining this CC. } -type CcName = EncodedFS +type CcName = FastString data IsDupdCC = OriginalCC -- This says how the CC is *used*. Saying that @@ -200,9 +198,9 @@ maybeSingletonCCS _ = Nothing Building cost centres \begin{code} -mkUserCC :: UserFS -> Module -> CostCentre +mkUserCC :: FastString -> Module -> CostCentre mkUserCC cc_name mod - = NormalCC { cc_name = encodeFS cc_name, cc_mod = mod, + = NormalCC { cc_name = cc_name, cc_mod = mod, cc_is_dupd = OriginalCC, cc_is_caf = NotCafCC {-might be changed-} } @@ -370,5 +368,5 @@ ppCostCentreLbl (NormalCC {cc_name = n, cc_mod = m, cc_is_caf = is_caf}) costCentreUserName (NoCostCentre) = "NO_CC" costCentreUserName (AllCafsCC {}) = "CAF" costCentreUserName cc@(NormalCC {cc_name = name, cc_is_caf = is_caf}) - = case is_caf of { CafCC -> "CAF:"; _ -> "" } ++ decode (unpackFS name) + = case is_caf of { CafCC -> "CAF:"; _ -> "" } ++ unpackFS name \end{code} diff --git a/ghc/compiler/rename/RnEnv.lhs b/ghc/compiler/rename/RnEnv.lhs index 99d6a34..2be3bfd 100644 --- a/ghc/compiler/rename/RnEnv.lhs +++ b/ghc/compiler/rename/RnEnv.lhs @@ -50,7 +50,8 @@ import TcRnMonad import Name ( Name, nameIsLocalOrFrom, mkInternalName, isWiredInName, nameSrcLoc, nameOccName, nameModule, nameParent, isExternalName ) import NameSet -import OccName ( tcName, isDataOcc, occNameFlavour, reportIfUnused ) +import OccName ( tcName, isDataOcc, pprNonVarNameSpace, occNameSpace, + reportIfUnused ) import Module ( Module ) import PrelNames ( mkUnboundName, rOOT_MAIN, iNTERACTIVE, consDataConKey, hasKey ) import UniqSupply @@ -747,7 +748,8 @@ warnUnusedName :: (Name, Maybe Provenance) -> RnM () warnUnusedName (name, prov) = addWarnAt loc $ sep [msg <> colon, - nest 2 $ occNameFlavour (nameOccName name) <+> quotes (ppr name)] + nest 2 $ pprNonVarNameSpace (occNameSpace (nameOccName name)) + <+> quotes (ppr name)] -- TODO should be a proper span where (loc,msg) = case prov of @@ -778,7 +780,8 @@ shadowedNameWarn doc shadow unknownNameErr rdr_name = sep [ptext SLIT("Not in scope:"), - nest 2 $ occNameFlavour (rdrNameOcc rdr_name) <+> quotes (ppr rdr_name)] + nest 2 $ pprNonVarNameSpace (occNameSpace (rdrNameOcc rdr_name)) + <+> quotes (ppr rdr_name)] unknownInstBndrErr cls op = quotes (ppr op) <+> ptext SLIT("is not a (visible) method of class") <+> quotes (ppr cls) diff --git a/ghc/compiler/rename/RnExpr.lhs b/ghc/compiler/rename/RnExpr.lhs index 53a412f..95d7b83 100644 --- a/ghc/compiler/rename/RnExpr.lhs +++ b/ghc/compiler/rename/RnExpr.lhs @@ -37,7 +37,6 @@ import Name ( Name, nameOccName, nameIsLocalOrFrom ) import NameSet import RdrName ( RdrName, emptyGlobalRdrEnv, extendLocalRdrEnv, lookupLocalRdrEnv ) import LoadIface ( loadHomeInterface ) -import UnicodeUtil ( stringToUtf8 ) import UniqFM ( isNullUFM ) import UniqSet ( emptyUniqSet ) import List ( nub ) @@ -932,7 +931,7 @@ mkAssertErrorExpr = getSrcSpanM `thenM` \ sloc -> let expr = HsApp (L sloc (HsVar assertErrorName)) (L sloc (HsLit msg)) - msg = HsStringPrim (mkFastString (stringToUtf8 (showSDoc (ppr sloc)))) + msg = HsStringPrim (mkFastString (showSDoc (ppr sloc))) in returnM (expr, emptyFVs) \end{code} diff --git a/ghc/compiler/rename/RnNames.lhs b/ghc/compiler/rename/RnNames.lhs index bf6e54a..4cdb241 100644 --- a/ghc/compiler/rename/RnNames.lhs +++ b/ghc/compiler/rename/RnNames.lhs @@ -26,15 +26,17 @@ import TcRnMonad import FiniteMap import PrelNames ( pRELUDE, isUnboundName, main_RDR_Unqual ) -import Module ( Module, moduleUserString, unitModuleEnv, +import Module ( Module, moduleString, unitModuleEnv, lookupModuleEnv, moduleEnvElts, foldModuleEnv ) import Name ( Name, nameSrcLoc, nameOccName, nameModule, isWiredInName, nameParent, nameParent_maybe, isExternalName, isBuiltInSyntax ) import NameSet import NameEnv -import OccName ( srcDataName, isTcOcc, occNameFlavour, OccEnv, - mkOccEnv, lookupOccEnv, emptyOccEnv, extendOccEnv ) +import OccName ( srcDataName, isTcOcc, pprNonVarNameSpace, + occNameSpace, + OccEnv, mkOccEnv, lookupOccEnv, emptyOccEnv, + extendOccEnv ) import HscTypes ( GenAvailInfo(..), AvailInfo, HomePackageTable, PackageIfaceTable, unQualInScope, @@ -683,7 +685,7 @@ reportDeprecations tcg_env , Just deprec_txt <- lookupDeprec hpt pit name = setSrcSpan (importSpecLoc imp_spec) $ addWarn (sep [ptext SLIT("Deprecated use of") <+> - occNameFlavour (nameOccName name) <+> + pprNonVarNameSpace (occNameSpace (nameOccName name)) <+> quotes (ppr name), (parens imp_msg) <> colon, (ppr deprec_txt) ]) @@ -958,7 +960,7 @@ printMinimalImports imps (vcat (map ppr_mod_ie mod_ies)) }) } where - mkFilename this_mod = moduleUserString this_mod ++ ".imports" + mkFilename this_mod = moduleString this_mod ++ ".imports" ppr_mod_ie (mod_name, ies) | mod_name == pRELUDE = empty diff --git a/ghc/compiler/simplCore/SetLevels.lhs b/ghc/compiler/simplCore/SetLevels.lhs index 4b1c01d..f8ab29d 100644 --- a/ghc/compiler/simplCore/SetLevels.lhs +++ b/ghc/compiler/simplCore/SetLevels.lhs @@ -60,8 +60,8 @@ import CoreUtils ( exprType, exprIsTrivial, exprIsCheap, mkPiTypes ) import CoreFVs -- all of it import CoreSubst ( Subst, emptySubst, extendInScope, extendIdSubst, cloneIdBndr, cloneRecIdBndrs ) -import Id ( Id, idType, mkSysLocalUnencoded, - isOneShotLambda, zapDemandIdInfo, +import Id ( Id, idType, mkSysLocal, isOneShotLambda, + zapDemandIdInfo, idSpecialisation, idWorkerInfo, setIdInfo ) import IdInfo ( workerExists, vanillaIdInfo, isEmptySpecInfo ) @@ -69,7 +69,7 @@ import Var ( Var ) import VarSet import VarEnv import Name ( getOccName ) -import OccName ( occNameUserString ) +import OccName ( occNameString ) import Type ( isUnLiftedType, Type ) import BasicTypes ( TopLevelFlag(..) ) import UniqSupply @@ -796,9 +796,9 @@ newPolyBndrs dest_lvl env abs_vars bndrs in returnLvl (extendPolyLvlEnv dest_lvl env abs_vars (bndrs `zip` new_bndrs), new_bndrs) where - mk_poly_bndr bndr uniq = mkSysLocalUnencoded (mkFastString str) uniq poly_ty + mk_poly_bndr bndr uniq = mkSysLocal (mkFastString str) uniq poly_ty where - str = "poly_" ++ occNameUserString (getOccName bndr) + str = "poly_" ++ occNameString (getOccName bndr) poly_ty = mkPiTypes abs_vars (idType bndr) @@ -807,7 +807,7 @@ newLvlVar :: String -> LvlM Id newLvlVar str vars body_ty = getUniqueUs `thenLvl` \ uniq -> - returnUs (mkSysLocalUnencoded (mkFastString str) uniq (mkPiTypes vars body_ty)) + returnUs (mkSysLocal (mkFastString str) uniq (mkPiTypes vars body_ty)) -- The deeply tiresome thing is that we have to apply the substitution -- to the rules inside each Id. Grr. But it matters. diff --git a/ghc/compiler/simplCore/SimplMonad.lhs b/ghc/compiler/simplCore/SimplMonad.lhs index b82562e..bc09e11 100644 --- a/ghc/compiler/simplCore/SimplMonad.lhs +++ b/ghc/compiler/simplCore/SimplMonad.lhs @@ -34,7 +34,6 @@ import UniqSupply ( uniqsFromSupply, uniqFromSupply, splitUniqSupply, ) import DynFlags ( SimplifierSwitch(..), DynFlags, DynFlag(..), dopt ) import StaticFlags ( opt_PprStyle_Debug, opt_HistorySize ) -import OccName ( EncodedFS ) import Unique ( Unique ) import Maybes ( expectJust ) import FiniteMap ( FiniteMap, emptyFM, isEmptyFM, lookupFM, addToFM, plusFM_C, fmToList ) @@ -160,7 +159,7 @@ getDOptsSmpl :: SimplM DynFlags getDOptsSmpl = SM (\dflags us sc -> (dflags, us, sc)) -newId :: EncodedFS -> Type -> SimplM Id +newId :: FastString -> Type -> SimplM Id newId fs ty = getUniqueSmpl `thenSmpl` \ uniq -> returnSmpl (mkSysLocal fs uniq ty) \end{code} diff --git a/ghc/compiler/simplCore/Simplify.lhs b/ghc/compiler/simplCore/Simplify.lhs index 8859140..17a7969 100644 --- a/ghc/compiler/simplCore/Simplify.lhs +++ b/ghc/compiler/simplCore/Simplify.lhs @@ -28,7 +28,6 @@ import Id ( Id, idType, idInfo, idArity, isDataConWorkId, ) import MkId ( eRROR_ID ) import Literal ( mkStringLit ) -import OccName ( encodeFS ) import IdInfo ( OccInfo(..), isLoopBreaker, setArityInfo, zapDemandInfo, setUnfoldingInfo, @@ -1875,7 +1874,7 @@ mkDupableAlt env case_bndr' cont alt ) `thenSmpl` \ (final_bndrs', final_args) -> -- See comment about "$j" name above - newId (encodeFS FSLIT("$j")) (mkPiTypes final_bndrs' rhs_ty') `thenSmpl` \ join_bndr -> + newId FSLIT("$j") (mkPiTypes final_bndrs' rhs_ty') `thenSmpl` \ join_bndr -> -- Notice the funky mkPiTypes. If the contructor has existentials -- it's possible that the join point will be abstracted over -- type varaibles as well as term variables. diff --git a/ghc/compiler/stgSyn/CoreToStg.lhs b/ghc/compiler/stgSyn/CoreToStg.lhs index 9ae0d27..824caba 100644 --- a/ghc/compiler/stgSyn/CoreToStg.lhs +++ b/ghc/compiler/stgSyn/CoreToStg.lhs @@ -30,7 +30,7 @@ import VarSet import VarEnv import Maybes ( maybeToBool ) import Name ( getOccName, isExternalName, nameOccName ) -import OccName ( occNameUserString, occNameFS ) +import OccName ( occNameString, occNameFS ) import BasicTypes ( Arity ) import Packages ( HomeModules ) import StaticFlags ( opt_RuntimeTypes ) @@ -688,7 +688,7 @@ coreToStgLet let_no_escape bind body is_join_var :: Id -> Bool -- A hack (used only for compiler debuggging) to tell if -- a variable started life as a join point ($j) -is_join_var j = occNameUserString (getOccName j) == "$j" +is_join_var j = occNameString (getOccName j) == "$j" \end{code} \begin{code} diff --git a/ghc/compiler/typecheck/Inst.lhs b/ghc/compiler/typecheck/Inst.lhs index 71d3e84..d6cf344 100644 --- a/ghc/compiler/typecheck/Inst.lhs +++ b/ghc/compiler/typecheck/Inst.lhs @@ -79,7 +79,7 @@ import DataCon ( DataCon, dataConTyVars, dataConStupidTheta, dataConName, dataCo import Id ( Id, idName, idType, mkUserLocal, mkLocalId ) import PrelInfo ( isNoDictClass ) import Name ( Name, mkMethodOcc, getOccName, getSrcLoc, nameModule, - isInternalName, setNameUnique, mkSystemVarNameEncoded ) + isInternalName, setNameUnique, mkSystemVarName ) import NameSet ( addOneToNameSet ) import Literal ( inIntRange ) import Var ( TyVar, tyVarKind, setIdType ) @@ -398,9 +398,7 @@ newLitInst orig lit expected_ty -- Make a LitInst = do { loc <- getInstLoc orig ; new_uniq <- newUnique ; let - lit_nm = mkSystemVarNameEncoded new_uniq FSLIT("lit") - -- The "encoded" bit means that we don't need to - -- z-encode the string every time we call this! + lit_nm = mkSystemVarName new_uniq FSLIT("lit") lit_inst = LitInst lit_nm lit expected_ty loc ; extendLIE lit_inst ; return (HsVar (instToId lit_inst)) } diff --git a/ghc/compiler/typecheck/TcClassDcl.lhs b/ghc/compiler/typecheck/TcClassDcl.lhs index b382af9..fbb450a 100644 --- a/ghc/compiler/typecheck/TcClassDcl.lhs +++ b/ghc/compiler/typecheck/TcClassDcl.lhs @@ -52,7 +52,6 @@ import RdrName ( RdrName, mkDerivedRdrName ) import Outputable import PrelNames ( genericTyConNames ) import DynFlags -import UnicodeUtil ( stringToUtf8 ) import ErrUtils ( dumpIfSet_dyn ) import Util ( count, lengthIs, isSingleton, lengthExceeds ) import Unique ( Uniquable(..) ) @@ -487,7 +486,7 @@ mkDefMethRhs origin clas inst_tys sel_id loc NoDefMeth where error_rhs = noLoc $ HsLam (mkMatchGroup [mkSimpleMatch wild_pats simple_rhs]) simple_rhs = nlHsApp (nlHsVar (getName nO_METHOD_BINDING_ERROR_ID)) - (nlHsLit (HsStringPrim (mkFastString (stringToUtf8 error_msg)))) + (nlHsLit (HsStringPrim (mkFastString error_msg))) error_msg = showSDoc (hcat [ppr loc, text "|", ppr sel_id ]) -- When the type is of form t1 -> t2 -> t3 diff --git a/ghc/compiler/typecheck/TcGenDeriv.lhs b/ghc/compiler/typecheck/TcGenDeriv.lhs index faa32ec..94bb152 100644 --- a/ghc/compiler/typecheck/TcGenDeriv.lhs +++ b/ghc/compiler/typecheck/TcGenDeriv.lhs @@ -809,7 +809,7 @@ gen_Read_binds get_fixity tycon ident_pat s = nlConPat ident_RDR [nlLitPat (mkHsString s)] -- Ident "foo" symbol_pat s = nlConPat symbol_RDR [nlLitPat (mkHsString s)] -- Symbol ">>" - data_con_str con = occNameUserString (getOccName con) + data_con_str con = occNameString (getOccName con) read_punc c = bindLex (punc_pat c) read_arg a ty @@ -832,7 +832,7 @@ gen_Read_binds get_fixity tycon | otherwise = [bindLex (ident_pat lbl_str)] where - lbl_str = occNameUserString (getOccName lbl) + lbl_str = occNameString (getOccName lbl) \end{code} @@ -899,7 +899,7 @@ gen_Show_binds get_fixity tycon dc_nm = getName data_con dc_occ_nm = getOccName data_con - con_str = occNameUserString dc_occ_nm + con_str = occNameString dc_occ_nm op_con_str = wrapOpParens con_str backquote_str = wrapOpBackquotes con_str @@ -916,7 +916,7 @@ gen_Show_binds get_fixity tycon -- it seems tidier to have them both sides. where occ_nm = getOccName l - nm = wrapOpParens (occNameUserString occ_nm) + nm = wrapOpParens (occNameString occ_nm) show_args = zipWith show_arg bs_needed arg_tys (show_arg1:show_arg2:_) = show_args @@ -1128,7 +1128,7 @@ gen_Data_binds fix_env tycon constr_args dc = [ -- nlHsIntLit (toInteger (dataConTag dc)), -- Tag nlHsVar data_type_name, -- DataType - nlHsLit (mkHsString (occNameUserString dc_occ)), -- String name + nlHsLit (mkHsString (occNameString dc_occ)), -- String name nlList labels, -- Field labels nlHsVar fixity] -- Fixity where @@ -1458,7 +1458,7 @@ mk_tc_deriv_name tycon str = mkDerivedRdrName tc_name mk_occ where tc_name = tyConName tycon - mk_occ tc_occ = mkOccFS varName (mkFastString new_str) + mk_occ tc_occ = mkVarOccFS (mkFastString new_str) where new_str = str ++ occNameString tc_occ ++ "#" \end{code} diff --git a/ghc/compiler/typecheck/TcInstDcls.lhs b/ghc/compiler/typecheck/TcInstDcls.lhs index 432d3c8..04fbafb 100644 --- a/ghc/compiler/typecheck/TcInstDcls.lhs +++ b/ghc/compiler/typecheck/TcInstDcls.lhs @@ -35,7 +35,6 @@ import Var ( Id, idName, idType ) import MkId ( mkDictFunId, rUNTIME_ERROR_ID ) import FunDeps ( checkInstFDs ) import Name ( Name, getSrcLoc ) -import UnicodeUtil ( stringToUtf8 ) import Maybe ( catMaybes ) import SrcLoc ( srcLocSpan, unLoc, noLoc, Located(..), srcSpanStart ) import ListSetOps ( minusList ) @@ -405,7 +404,7 @@ tcInstDecl2 (InstInfo { iSpec = ispec, iBinds = binds }) -- Hardly beautiful, but only three extra lines. nlHsApp (noLoc $ TyApp (nlHsVar rUNTIME_ERROR_ID) [idType this_dict_id]) - (nlHsLit (HsStringPrim (mkFastString (stringToUtf8 msg)))) + (nlHsLit (HsStringPrim (mkFastString msg))) | otherwise -- The common case = mkHsConApp dict_constr inst_tys' (map HsVar scs_and_meths) diff --git a/ghc/compiler/typecheck/TcRnDriver.lhs b/ghc/compiler/typecheck/TcRnDriver.lhs index b2e665f..7e3aae2 100644 --- a/ghc/compiler/typecheck/TcRnDriver.lhs +++ b/ghc/compiler/typecheck/TcRnDriver.lhs @@ -63,7 +63,7 @@ import ErrUtils ( Messages, mkDumpDoc, showPass ) import Id ( Id, mkExportedLocalId, isLocalId, idName, idType ) import Var ( Var ) import Module ( Module, ModuleEnv, moduleEnvElts, elemModuleEnv ) -import OccName ( mkVarOcc, mkOccFS, varName ) +import OccName ( mkVarOccFS ) import Name ( Name, NamedThing(..), isExternalName, getSrcLoc, isWiredInName, mkExternalName ) import NameSet @@ -734,7 +734,7 @@ checkMain dflags <- getDOpts ; let { main_mod = mainModIs dflags ; main_fn = case mainFunIs dflags of { - Just fn -> mkRdrUnqual (mkVarOcc (mkFastString fn)) ; + Just fn -> mkRdrUnqual (mkVarOccFS (mkFastString fn)) ; Nothing -> main_RDR_Unqual } } ; check_main ghci_mode tcg_env main_mod main_fn @@ -776,7 +776,7 @@ check_main ghci_mode tcg_env main_mod main_fn -- for 'main' in the interface file! ; let { root_main_name = mkExternalName rootMainKey rOOT_MAIN - (mkOccFS varName FSLIT("main")) + (mkVarOccFS FSLIT("main")) (Just main_name) (getSrcLoc main_name) ; root_main_id = mkExportedLocalId root_main_name ty ; main_bind = noLoc (VarBind root_main_id main_expr) } diff --git a/ghc/compiler/typecheck/TcSplice.lhs b/ghc/compiler/typecheck/TcSplice.lhs index 2844ab4..578c96b 100644 --- a/ghc/compiler/typecheck/TcSplice.lhs +++ b/ghc/compiler/typecheck/TcSplice.lhs @@ -41,7 +41,7 @@ import NameEnv ( lookupNameEnv ) import HscTypes ( lookupType, ExternalPackageState(..), emptyModDetails ) import OccName import Var ( Id, TyVar, idType ) -import Module ( moduleUserString ) +import Module ( moduleString ) import TcRnMonad import IfaceEnv ( lookupOrig ) import Class ( Class, classExtraBigSig ) @@ -416,7 +416,7 @@ instance TH.Quasi (IOEnv (Env TcGblEnv TcLclEnv)) where qReport True msg = addErr (text msg) qReport False msg = addReport (text msg) - qCurrentModule = do { m <- getModule; return (moduleUserString m) } + qCurrentModule = do { m <- getModule; return (moduleString m) } qReify v = reify v qRecover = recoverM @@ -659,8 +659,8 @@ reifyName thing -- have free variables, we may need to generate NameL's for them. where name = getName thing - mod = moduleUserString (nameModule name) - occ_str = occNameUserString occ + mod = moduleString (nameModule name) + occ_str = occNameString occ occ = nameOccName name mk_varg | OccName.isDataOcc occ = TH.mkNameG_d | OccName.isVarOcc occ = TH.mkNameG_v diff --git a/ghc/compiler/types/TypeRep.lhs b/ghc/compiler/types/TypeRep.lhs index dc53445..7bb863a 100644 --- a/ghc/compiler/types/TypeRep.lhs +++ b/ghc/compiler/types/TypeRep.lhs @@ -33,7 +33,7 @@ import Kind import Var ( Var, Id, TyVar, tyVarKind ) import VarSet ( TyVarSet ) import Name ( Name, NamedThing(..), BuiltInSyntax(..), mkWiredInName ) -import OccName ( mkOccFS, tcName, parenSymOcc ) +import OccName ( mkOccNameFS, tcName, parenSymOcc ) import BasicTypes ( IPName, tupleParens ) import TyCon ( TyCon, mkFunTyCon, tyConArity, tupleTyConBoxity, isTupleTyCon, isRecursiveTyCon, isNewTyCon ) import Class ( Class ) @@ -273,7 +273,7 @@ funTyCon = mkFunTyCon funTyConName (mkArrowKinds [argTypeKind, openTypeKind] lif -- a prefix way, thus: (->) Int# Int#. And this is unusual. funTyConName = mkWiredInName gHC_PRIM - (mkOccFS tcName FSLIT("(->)")) + (mkOccNameFS tcName FSLIT("(->)")) funTyConKey Nothing -- No parent object (ATyCon funTyCon) -- Relevant TyCon diff --git a/ghc/compiler/utils/Binary.hs b/ghc/compiler/utils/Binary.hs index 1902ff1..7b40bd2 100644 --- a/ghc/compiler/utils/Binary.hs +++ b/ghc/compiler/utils/Binary.hs @@ -58,26 +58,7 @@ import UniqFM import FastMutInt import PackageConfig ( PackageId, packageIdFS, fsToPackageId ) -#if __GLASGOW_HASKELL__ < 503 -import DATA_IOREF -import DATA_BITS -import DATA_INT -import DATA_WORD -import Char -import Monad -import Exception -import GlaExts hiding (ByteArray, newByteArray, freezeByteArray) -import Array -import IO -import PrelIOBase ( IOError(..), IOErrorType(..) -#if __GLASGOW_HASKELL__ > 411 - , IOException(..) -#endif - ) -import PrelReal ( Ratio(..) ) -import PrelIOBase ( IO(..) ) -import IOExts ( openFileEx, IOModeEx(..) ) -#else +import Foreign import Data.Array.IO import Data.Array import Data.Bits @@ -102,44 +83,12 @@ import GHC.Handle ( openFileEx, IOModeEx(..) ) #else import System.IO ( openBinaryFile ) #endif -#endif #if __GLASGOW_HASKELL__ < 601 openBinaryFile f mode = openFileEx f (BinaryMode mode) #endif -#if __GLASGOW_HASKELL__ < 503 -type BinArray = MutableByteArray RealWorld Int -newArray_ bounds = stToIO (newCharArray bounds) -unsafeWrite arr ix e = stToIO (writeWord8Array arr ix e) -unsafeRead arr ix = stToIO (readWord8Array arr ix) -#if __GLASGOW_HASKELL__ < 411 -newByteArray# = newCharArray# -#endif -hPutArray h arr sz = hPutBufBAFull h arr sz -hGetArray h sz = hGetBufBAFull h sz - -mkIOError :: IOErrorType -> String -> Maybe Handle -> Maybe FilePath -> Exception -mkIOError t location maybe_hdl maybe_filename - = IOException (IOError maybe_hdl t location "" -#if __GLASGOW_HASKELL__ > 411 - maybe_filename -#endif - ) - -eofErrorType = EOF - -#ifndef SIZEOF_HSINT -#define SIZEOF_HSINT INT_SIZE_IN_BYTES -#endif - -#ifndef SIZEOF_HSWORD -#define SIZEOF_HSWORD WORD_SIZE_IN_BYTES -#endif - -#else type BinArray = IOUArray Int Word8 -#endif --------------------------------------------------------------- -- BinHandle @@ -741,13 +690,17 @@ constructDictionary j fm = array (0,j-1) (eltsUFM fm) -- Reading and writing FastStrings --------------------------------------------------------- -putFS bh (FastString id l ba) = do - put_ bh (I# l) - putByteArray bh ba l -putFS bh s = error ("Binary.put_(FastString): " ++ unpackFS s) - -- Note: the length of the FastString is *not* the same as - -- the size of the ByteArray: the latter is rounded up to a - -- multiple of the word size. +putFS bh (FastString id l _ buf _) = do + put_ bh l + withForeignPtr buf $ \ptr -> + let + go n | n == l = return () + | otherwise = do + b <- peekElemOff ptr n + putByte bh b + go (n+1) + in + go 0 {- -- possible faster version, not quite there yet: getFS bh@BinMem{} = do @@ -757,16 +710,24 @@ getFS bh@BinMem{} = do return $! (mkFastSubStringBA# arr off l) -} getFS bh = do - (I# l) <- get bh - (BA ba) <- getByteArray bh (I# l) - return $! (mkFastSubStringBA# ba 0# l) + l <- get bh + fp <- mallocForeignPtrBytes l + withForeignPtr fp $ \ptr -> do + let + go n | n == l = mkFastStringForeignPtr ptr fp l + | otherwise = do + b <- getByte bh + pokeElemOff ptr n b + go (n+1) + -- + go 0 instance Binary PackageId where put_ bh pid = put_ bh (packageIdFS pid) get bh = do { fs <- get bh; return (fsToPackageId fs) } instance Binary FastString where - put_ bh f@(FastString id l ba) = + put_ bh f@(FastString id l _ fp _) = case getUserData bh of { UserData { ud_next = j_r, ud_map = out_r, ud_dict = dict} -> do out <- readIORef out_r diff --git a/ghc/compiler/utils/BufWrite.hs b/ghc/compiler/utils/BufWrite.hs index 6d00e46..b15089e 100644 --- a/ghc/compiler/utils/BufWrite.hs +++ b/ghc/compiler/utils/BufWrite.hs @@ -31,17 +31,11 @@ import Char ( ord ) import Foreign import IO -#if __GLASGOW_HASKELL__ < 503 -import PrelIOBase ( IO(..) ) -import IOExts ( hPutBufFull ) -#else import GHC.IOBase ( IO(..) ) import System.IO ( hPutBuf ) -#endif - -import GLAEXTS ( touch#, byteArrayContents#, Int(..), Int#, Addr# ) +import GHC.Ptr ( Ptr(..) ) -import PrimPacked ( Ptr(..) ) +import GLAEXTS ( Int(..), Int#, Addr# ) -- ----------------------------------------------------------------------------- @@ -88,22 +82,17 @@ bPutStr b@(BufHandle buf r hdl) str = do loop cs (i+1) bPutFS :: BufHandle -> FastString -> IO () -bPutFS b@(BufHandle buf r hdl) fs@(FastString _ len# arr#) = do - let len = I# len# +bPutFS b@(BufHandle buf r hdl) fs@(FastString _ len _ fp _) = + withForeignPtr fp $ \ptr -> do i <- readFastMutInt r if (i + len) >= buf_size then do hPutBuf hdl buf i writeFastMutInt r 0 if (len >= buf_size) - then do - let a# = byteArrayContents# arr# - hPutBuf hdl (Ptr a#) len - touch fs + then hPutBuf hdl ptr len else bPutFS b fs else do - let a# = byteArrayContents# arr# - copyBytes (buf `plusPtr` i) (Ptr a#) len - touch fs + copyBytes (buf `plusPtr` i) ptr len writeFastMutInt r (i+len) bPutFS _ _ = panic "bPutFS" @@ -128,8 +117,6 @@ bFlush b@(BufHandle buf r hdl) = do free buf return () -touch r = IO $ \s -> case touch# r s of s -> (# s, () #) - #if 0 myPutBuf s hdl buf i = modifyIOError (\e -> ioeSetErrorString e (ioeGetErrorString e ++ ':':s ++ " (" ++ show buf ++ "," ++ show i ++ ")")) $ diff --git a/ghc/compiler/utils/Encoding.hs b/ghc/compiler/utils/Encoding.hs new file mode 100644 index 0000000..d15c021 --- /dev/null +++ b/ghc/compiler/utils/Encoding.hs @@ -0,0 +1,386 @@ +{-# OPTIONS_GHC -O #-} +-- ----------------------------------------------------------------------------- +-- +-- (c) The University of Glasgow, 1997-2003 +-- +-- Character encodings +-- +-- ----------------------------------------------------------------------------- + +module Encoding ( + -- * UTF-8 + utf8DecodeChar#, + utf8PrevChar, + utf8CharStart, + utf8DecodeChar, + utf8DecodeString, + utf8EncodeChar, + utf8EncodeString, + utf8EncodedLength, + countUTF8Chars, + + -- * Latin-1 + latin1DecodeChar, + latin1EncodeChar, + + -- * Z-encoding + zEncodeString, + zDecodeString + ) where + +#define COMPILING_FAST_STRING +#include "HsVersions.h" +import Foreign +import Data.Char ( ord, chr, isDigit, digitToInt, isHexDigit ) +import Numeric ( showHex ) + +import GHC.Ptr ( Ptr(..) ) +import GHC.Base + +-- ----------------------------------------------------------------------------- +-- Latin-1 + +latin1DecodeChar ptr = do + w <- peek ptr + return (unsafeChr (fromIntegral w), ptr `plusPtr` 1) + +latin1EncodeChar c ptr = do + poke ptr (fromIntegral (ord c)) + return (ptr `plusPtr` 1) + +-- ----------------------------------------------------------------------------- +-- UTF-8 + +-- We can't write the decoder as efficiently as we'd like without +-- resorting to unboxed extensions, unfortunately. I tried to write +-- an IO version of this function, but GHC can't eliminate boxed +-- results from an IO-returning function. +-- +-- We assume we can ignore overflow when parsing a multibyte character here. +-- To make this safe, we add extra sentinel bytes to unparsed UTF-8 sequences +-- before decoding them (see StringBuffer.hs). + +{-# INLINE utf8DecodeChar# #-} +utf8DecodeChar# :: Addr# -> (# Char#, Addr# #) +utf8DecodeChar# a# = + let ch0 = word2Int# (indexWord8OffAddr# a# 0#) in + case () of + _ | ch0 <=# 0x7F# -> (# chr# ch0, a# `plusAddr#` 1# #) + + | ch0 >=# 0xC0# && ch0 <=# 0xDF# -> + let ch1 = word2Int# (indexWord8OffAddr# a# 1#) in + if ch1 <# 0x80# || ch1 >=# 0xC0# then fail 1# else + (# chr# (((ch0 -# 0xC0#) `uncheckedIShiftL#` 6#) +# + (ch1 -# 0x80#)), + a# `plusAddr#` 2# #) + + | ch0 >=# 0xE0# && ch0 <=# 0xEF# -> + let ch1 = word2Int# (indexWord8OffAddr# a# 1#) in + if ch1 <# 0x80# || ch1 >=# 0xC0# then fail 1# else + let ch2 = word2Int# (indexWord8OffAddr# a# 2#) in + if ch2 <# 0x80# || ch2 >=# 0xC0# then fail 2# else + (# chr# (((ch0 -# 0xE0#) `uncheckedIShiftL#` 12#) +# + ((ch1 -# 0x80#) `uncheckedIShiftL#` 6#) +# + (ch2 -# 0x80#)), + a# `plusAddr#` 3# #) + + | ch0 >=# 0xF0# && ch0 <=# 0xF8# -> + let ch1 = word2Int# (indexWord8OffAddr# a# 1#) in + if ch1 <# 0x80# || ch1 >=# 0xC0# then fail 1# else + let ch2 = word2Int# (indexWord8OffAddr# a# 2#) in + if ch2 <# 0x80# || ch2 >=# 0xC0# then fail 2# else + let ch3 = word2Int# (indexWord8OffAddr# a# 3#) in + if ch3 <# 0x80# || ch3 >=# 0xC0# then fail 3# else + (# chr# (((ch0 -# 0xF0#) `uncheckedIShiftL#` 18#) +# + ((ch1 -# 0x80#) `uncheckedIShiftL#` 12#) +# + ((ch2 -# 0x80#) `uncheckedIShiftL#` 6#) +# + (ch3 -# 0x80#)), + a# `plusAddr#` 4# #) + + | otherwise -> fail 1# + where + -- all invalid sequences end up here: + fail n = (# '\0'#, a# `plusAddr#` n #) + -- '\xFFFD' would be the usual replacement character, but + -- that's a valid symbol in Haskell, so will result in a + -- confusing parse error later on. Instead we use '\0' which + -- will signal a lexer error immediately. + +utf8DecodeChar :: Ptr Word8 -> (Char, Ptr Word8) +utf8DecodeChar (Ptr a#) = ( C# c#, Ptr b# ) + where (# c#, b# #) = utf8DecodeChar# a# + +-- UTF-8 is cleverly designed so that we can always figure out where +-- the start of the current character is, given any position in a +-- stream. This function finds the start of the previous character, +-- assuming there *is* a previous character. +utf8PrevChar :: Ptr Word8 -> IO (Ptr Word8) +utf8PrevChar p = utf8CharStart (p `plusPtr` (-1)) + +utf8CharStart :: Ptr Word8 -> IO (Ptr Word8) +utf8CharStart p = go p + where go p = do w <- peek p + if (w .&. 0xC0) == 0x80 + then go (p `plusPtr` (-1)) + else return p + +utf8DecodeString :: Ptr Word8 -> Int -> IO [Char] +STRICT2(utf8DecodeString) +utf8DecodeString (Ptr a#) (I# len#) + = unpack a# + where + end# = addr2Int# (a# `plusAddr#` len#) + + unpack p# + | addr2Int# p# >=# end# = return [] + | otherwise = + case utf8DecodeChar# p# of + (# c#, q# #) -> do + chs <- unpack q# + return (C# c# : chs) + +countUTF8Chars :: Ptr Word8 -> Int -> IO Int +countUTF8Chars ptr bytes = go ptr 0 + where + end = ptr `plusPtr` bytes + + STRICT2(go) + go ptr n + | ptr >= end = return n + | otherwise = do + case utf8DecodeChar# (unPtr ptr) of + (# c, a #) -> go (Ptr a) (n+1) + +unPtr (Ptr a) = a + +utf8EncodeChar c ptr = + let x = ord c in + case () of + _ | x > 0 && x <= 0x007f -> do + poke ptr (fromIntegral x) + return (ptr `plusPtr` 1) + -- NB. '\0' is encoded as '\xC0\x80', not '\0'. This is so that we + -- can have 0-terminated UTF-8 strings (see GHC.Base.unpackCStringUtf8). + | x <= 0x07ff -> do + poke ptr (fromIntegral (0xC0 .|. ((x `shiftR` 6) .&. 0x1F))) + pokeElemOff ptr 1 (fromIntegral (0x80 .|. (x .&. 0x3F))) + return (ptr `plusPtr` 2) + | x <= 0xffff -> do + poke ptr (fromIntegral (0xE0 .|. (x `shiftR` 12) .&. 0x0F)) + pokeElemOff ptr 1 (fromIntegral (0x80 .|. (x `shiftR` 6) .&. 0x3F)) + pokeElemOff ptr 2 (fromIntegral (0x80 .|. (x .&. 0x3F))) + return (ptr `plusPtr` 3) + | otherwise -> do + poke ptr (fromIntegral (0xF0 .|. (x `shiftR` 18))) + pokeElemOff ptr 1 (fromIntegral (0x80 .|. ((x `shiftR` 12) .&. 0x3F))) + pokeElemOff ptr 2 (fromIntegral (0x80 .|. ((x `shiftR` 6) .&. 0x3F))) + pokeElemOff ptr 3 (fromIntegral (0x80 .|. (x .&. 0x3F))) + return (ptr `plusPtr` 4) + +utf8EncodeString :: Ptr Word8 -> String -> IO () +utf8EncodeString ptr str = go ptr str + where STRICT2(go) + go ptr [] = return () + go ptr (c:cs) = do + ptr' <- utf8EncodeChar c ptr + go ptr' cs + +utf8EncodedLength :: String -> Int +utf8EncodedLength str = go 0 str + where STRICT2(go) + go n [] = n + go n (c:cs) + | ord c > 0 && ord c <= 0x007f = go (n+1) cs + | ord c <= 0x07ff = go (n+2) cs + | ord c <= 0xffff = go (n+3) cs + | otherwise = go (n+4) cs + +-- ----------------------------------------------------------------------------- +-- The Z-encoding + +{- +This is the main name-encoding and decoding function. It encodes any +string into a string that is acceptable as a C name. This is the name +by which things are known right through the compiler. + +The basic encoding scheme is this. + +* Tuples (,,,) are coded as Z3T + +* Alphabetic characters (upper and lower) and digits + all translate to themselves; + except 'Z', which translates to 'ZZ' + and 'z', which translates to 'zz' + We need both so that we can preserve the variable/tycon distinction + +* Most other printable characters translate to 'zx' or 'Zx' for some + alphabetic character x + +* The others translate as 'znnnU' where 'nnn' is the decimal number + of the character + + Before After + -------------------------- + Trak Trak + foo_wib foozuwib + > zg + >1 zg1 + foo# foozh + foo## foozhzh + foo##1 foozhzh1 + fooZ fooZZ + :+ ZCzp + () Z0T 0-tuple + (,,,,) Z5T 5-tuple + (# #) Z1H unboxed 1-tuple (note the space) + (#,,,,#) Z5H unboxed 5-tuple + (NB: There is no Z1T nor Z0H.) +-} + +type UserString = String -- As the user typed it +type EncodedString = String -- Encoded form + + +zEncodeString :: UserString -> EncodedString +zEncodeString cs = case maybe_tuple cs of + Just n -> n -- Tuples go to Z2T etc + Nothing -> go cs + where + go [] = [] + go (c:cs) = encode_ch c ++ go cs + +unencodedChar :: Char -> Bool -- True for chars that don't need encoding +unencodedChar 'Z' = False +unencodedChar 'z' = False +unencodedChar c = c >= 'a' && c <= 'z' + || c >= 'A' && c <= 'Z' + || c >= '0' && c <= '9' + +encode_ch :: Char -> EncodedString +encode_ch c | unencodedChar c = [c] -- Common case first + +-- Constructors +encode_ch '(' = "ZL" -- Needed for things like (,), and (->) +encode_ch ')' = "ZR" -- For symmetry with ( +encode_ch '[' = "ZM" +encode_ch ']' = "ZN" +encode_ch ':' = "ZC" +encode_ch 'Z' = "ZZ" + +-- Variables +encode_ch 'z' = "zz" +encode_ch '&' = "za" +encode_ch '|' = "zb" +encode_ch '^' = "zc" +encode_ch '$' = "zd" +encode_ch '=' = "ze" +encode_ch '>' = "zg" +encode_ch '#' = "zh" +encode_ch '.' = "zi" +encode_ch '<' = "zl" +encode_ch '-' = "zm" +encode_ch '!' = "zn" +encode_ch '+' = "zp" +encode_ch '\'' = "zq" +encode_ch '\\' = "zr" +encode_ch '/' = "zs" +encode_ch '*' = "zt" +encode_ch '_' = "zu" +encode_ch '%' = "zv" +encode_ch c = 'z' : if isDigit (head hex_str) then hex_str + else '0':hex_str + where hex_str = showHex (ord c) "U" + -- ToDo: we could improve the encoding here in various ways. + -- eg. strings of unicode characters come out as 'z1234Uz5678U', we + -- could remove the 'U' in the middle (the 'z' works as a separator). + +zDecodeString :: EncodedString -> UserString +zDecodeString [] = [] +zDecodeString ('Z' : d : rest) + | isDigit d = decode_tuple d rest + | otherwise = decode_upper d : zDecodeString rest +zDecodeString ('z' : d : rest) + | isDigit d = decode_num_esc d rest + | otherwise = decode_lower d : zDecodeString rest +zDecodeString (c : rest) = c : zDecodeString rest + +decode_upper, decode_lower :: Char -> Char + +decode_upper 'L' = '(' +decode_upper 'R' = ')' +decode_upper 'M' = '[' +decode_upper 'N' = ']' +decode_upper 'C' = ':' +decode_upper 'Z' = 'Z' +decode_upper ch = {-pprTrace "decode_upper" (char ch)-} ch + +decode_lower 'z' = 'z' +decode_lower 'a' = '&' +decode_lower 'b' = '|' +decode_lower 'c' = '^' +decode_lower 'd' = '$' +decode_lower 'e' = '=' +decode_lower 'g' = '>' +decode_lower 'h' = '#' +decode_lower 'i' = '.' +decode_lower 'l' = '<' +decode_lower 'm' = '-' +decode_lower 'n' = '!' +decode_lower 'p' = '+' +decode_lower 'q' = '\'' +decode_lower 'r' = '\\' +decode_lower 's' = '/' +decode_lower 't' = '*' +decode_lower 'u' = '_' +decode_lower 'v' = '%' +decode_lower ch = {-pprTrace "decode_lower" (char ch)-} ch + +-- Characters not having a specific code are coded as z224U (in hex) +decode_num_esc d rest + = go (digitToInt d) rest + where + go n (c : rest) | isHexDigit c = go (16*n + digitToInt c) rest + go n ('U' : rest) = chr n : zDecodeString rest + go n other = error ("decode_num_esc: " ++ show n ++ ' ':other) + +decode_tuple :: Char -> EncodedString -> UserString +decode_tuple d rest + = go (digitToInt d) rest + where + -- NB. recurse back to zDecodeString after decoding the tuple, because + -- the tuple might be embedded in a longer name. + go n (c : rest) | isDigit c = go (10*n + digitToInt c) rest + go 0 ('T':rest) = "()" ++ zDecodeString rest + go n ('T':rest) = '(' : replicate (n-1) ',' ++ ")" ++ zDecodeString rest + go 1 ('H':rest) = "(# #)" ++ zDecodeString rest + go n ('H':rest) = '(' : '#' : replicate (n-1) ',' ++ "#)" ++ zDecodeString rest + go n other = error ("decode_tuple: " ++ show n ++ ' ':other) + +{- +Tuples are encoded as + Z3T or Z3H +for 3-tuples or unboxed 3-tuples respectively. No other encoding starts + Z + +* "(# #)" is the tycon for an unboxed 1-tuple (not 0-tuple) + There are no unboxed 0-tuples. + +* "()" is the tycon for a boxed 0-tuple. + There are no boxed 1-tuples. +-} + +maybe_tuple :: UserString -> Maybe EncodedString + +maybe_tuple "(# #)" = Just("Z1H") +maybe_tuple ('(' : '#' : cs) = case count_commas (0::Int) cs of + (n, '#' : ')' : cs) -> Just ('Z' : shows (n+1) "H") + other -> Nothing +maybe_tuple "()" = Just("Z0T") +maybe_tuple ('(' : cs) = case count_commas (0::Int) cs of + (n, ')' : cs) -> Just ('Z' : shows (n+1) "T") + other -> Nothing +maybe_tuple other = Nothing + +count_commas :: Int -> String -> (Int, String) +count_commas n (',' : cs) = count_commas (n+1) cs +count_commas n cs = (n,cs) diff --git a/ghc/compiler/utils/FastString.lhs b/ghc/compiler/utils/FastString.lhs index 52512d3..2558c56 100644 --- a/ghc/compiler/utils/FastString.lhs +++ b/ghc/compiler/utils/FastString.lhs @@ -1,8 +1,10 @@ % -% (c) The GRASP/AQUA Project, Glasgow University, 1997-1998 +% (c) The University of Glasgow, 1997-2006 % -\section{Fast strings} +\begin{code} +{-# OPTIONS -fglasgow-exts -O #-} +{- FastString: A compact, hash-consed, representation of character strings. Comparison is O(1), and you can get a Unique from them. Generated by the FSLIT macro @@ -15,40 +17,46 @@ LitString: Just a wrapper for the Addr# of a C string (Ptr CChar). Turn into SDoc with Outputable.ptext Use LitString unless you want the facilities of FastString - -\begin{code} +-} module FastString ( + -- * FastStrings FastString(..), -- not abstract, for now. - mkFastString, -- :: String -> FastString - mkFastStringNarrow, -- :: String -> FastString - mkFastSubString, -- :: Addr -> Int -> Int -> FastString - - mkFastString#, -- :: Addr# -> FastString - mkFastSubStringBA#, -- :: ByteArray# -> Int# -> Int# -> FastString - - mkFastStringInt, -- :: [Int] -> FastString - - uniqueOfFS, -- :: FastString -> Int# - lengthFS, -- :: FastString -> Int - nullFastString, -- :: FastString -> Bool + -- ** Construction + mkFastString, + mkFastStringBytes, + mkFastStringForeignPtr, + mkFastString#, + mkZFastString, + mkZFastStringBytes, + -- ** Deconstruction unpackFS, -- :: FastString -> String - unpackIntFS, -- :: FastString -> [Int] - appendFS, -- :: FastString -> FastString -> FastString - headFS, -- :: FastString -> Char - headIntFS, -- :: FastString -> Int - tailFS, -- :: FastString -> FastString - concatFS, -- :: [FastString] -> FastString - consFS, -- :: Char -> FastString -> FastString - indexFS, -- :: FastString -> Int -> Char - nilFS, -- :: FastString - - hPutFS, -- :: Handle -> FastString -> IO () - + bytesFS, -- :: FastString -> [Word8] + + -- ** Encoding + isZEncoded, + zEncodeFS, + + -- ** Operations + uniqueOfFS, + lengthFS, + nullFS, + appendFS, + headFS, + tailFS, + concatFS, + consFS, + nilFS, + + -- ** Outputing + hPutFS, + + -- * LitStrings LitString, - mkLitString# -- :: Addr# -> LitString + mkLitString#, + strLength ) where -- This #define suppresses the "import FastString" that @@ -56,64 +64,49 @@ module FastString #define COMPILING_FAST_STRING #include "HsVersions.h" -#if __GLASGOW_HASKELL__ < 503 -import PrelIOBase ( IO(..) ) -#else -import GHC.IOBase ( IO(..) ) -#endif +import Encoding -import PrimPacked +import Foreign +import Foreign.C import GLAEXTS import UNSAFE_IO ( unsafePerformIO ) import MONAD_ST ( stToIO ) import DATA_IOREF ( IORef, newIORef, readIORef, writeIORef ) +import System.IO ( hPutBuf ) -#if __GLASGOW_HASKELL__ < 503 -import PrelArr ( STArray(..), newSTArray ) -#else import GHC.Arr ( STArray(..), newSTArray ) -#endif - -#if __GLASGOW_HASKELL__ >= 504 -import GHC.IOBase -import GHC.Handle -import Foreign.C -#else -import IOExts ( hPutBufBAFull ) -#endif +import GHC.IOBase ( IO(..) ) import IO -import Char ( chr, ord ) #define hASH_TBL_SIZE 4091 -\end{code} -@FastString@s are packed representations of strings -with a unique id for fast comparisons. The unique id -is assigned when creating the @FastString@, using -a hash table to map from the character string representation -to the unique ID. -\begin{code} -data FastString - = FastString -- packed repr. on the heap. - Int# -- unique id - -- 0 => string literal, comparison - -- will - Int# -- length - ByteArray# -- stuff - - | UnicodeStr -- if contains characters outside '\1'..'\xFF' - Int# -- unique id - [Int] -- character numbers +{-| +A 'FastString' is an array of bytes, hashed to support fast O(1) +comparison. It is also associated with a character encoding, so that +we know how to convert a 'FastString' to the local encoding, or to the +Z-encoding used by the compiler internally. -instance Eq FastString where - -- shortcut for real FastStrings - (FastString u1 _ _) == (FastString u2 _ _) = u1 ==# u2 - a == b = case cmpFS a b of { LT -> False; EQ -> True; GT -> False } +'FastString's support a memoized conversion to the Z-encoding via zEncodeFS. +-} + +data FastString = FastString { + uniq :: {-# UNPACK #-} !Int, -- unique id + n_bytes :: {-# UNPACK #-} !Int, -- number of bytes + n_chars :: {-# UNPACK #-} !Int, -- number of chars + buf :: {-# UNPACK #-} !(ForeignPtr Word8), + enc :: FSEncoding + } + +data FSEncoding + = ZEncoded + -- including strings that don't need any encoding + | UTF8Encoded {-# UNPACK #-} !(IORef (Maybe FastString)) + -- A UTF-8 string with a memoized Z-encoding - (FastString u1 _ _) /= (FastString u2 _ _) = u1 /=# u2 - a /= b = case cmpFS a b of { LT -> True; EQ -> False; GT -> True } +instance Eq FastString where + f1 == f2 = uniq f1 == uniq f2 instance Ord FastString where -- Compares lexicographically, not by unique @@ -130,360 +123,311 @@ instance Ord FastString where instance Show FastString where show fs = show (unpackFS fs) -lengthFS :: FastString -> Int -lengthFS (FastString _ l# _) = I# l# -lengthFS (UnicodeStr _ s) = length s - -nullFastString :: FastString -> Bool -nullFastString (FastString _ l# _) = l# ==# 0# -nullFastString (UnicodeStr _ []) = True -nullFastString (UnicodeStr _ (_:_)) = False - -unpackFS :: FastString -> String -unpackFS (FastString _ l# ba#) = unpackNBytesBA (BA ba#) (I# l#) -unpackFS (UnicodeStr _ s) = map chr s - -unpackIntFS :: FastString -> [Int] -unpackIntFS (UnicodeStr _ s) = s -unpackIntFS fs = map ord (unpackFS fs) - -appendFS :: FastString -> FastString -> FastString -appendFS fs1 fs2 = mkFastStringInt (unpackIntFS fs1 ++ unpackIntFS fs2) - -concatFS :: [FastString] -> FastString -concatFS ls = mkFastStringInt (concat (map unpackIntFS ls)) -- ToDo: do better - -headFS :: FastString -> Char -headFS (FastString _ l# ba#) = - if l# ># 0# then C# (indexCharArray# ba# 0#) else error ("headFS: empty FS") -headFS (UnicodeStr _ (c:_)) = chr c -headFS (UnicodeStr _ []) = error ("headFS: empty FS") - -headIntFS :: FastString -> Int -headIntFS (UnicodeStr _ (c:_)) = c -headIntFS fs = ord (headFS fs) - -indexFS :: FastString -> Int -> Char -indexFS f i@(I# i#) = - case f of - FastString _ l# ba# - | l# ># 0# && l# ># i# -> C# (indexCharArray# ba# i#) - | otherwise -> error (msg (I# l#)) - UnicodeStr _ s -> chr (s!!i) - where - msg l = "indexFS: out of range: " ++ show (l,i) - -tailFS :: FastString -> FastString -tailFS (FastString _ l# ba#) = mkFastSubStringBA# ba# 1# (l# -# 1#) -tailFS fs = mkFastStringInt (tail (unpackIntFS fs)) - -consFS :: Char -> FastString -> FastString -consFS c fs = mkFastStringInt (ord c : unpackIntFS fs) +cmpFS :: FastString -> FastString -> Ordering +cmpFS (FastString u1 l1 _ buf1 _) (FastString u2 l2 _ buf2 _) = + if u1 == u2 then EQ else + let l = if l1 <= l2 then l1 else l2 in + inlinePerformIO $ + withForeignPtr buf1 $ \p1 -> + withForeignPtr buf2 $ \p2 -> do + res <- memcmp p1 p2 l + case () of + _ | res < 0 -> return LT + | res == 0 -> if l1 == l2 then return EQ + else if l1 < l2 then return LT + else return GT + | otherwise -> return GT -uniqueOfFS :: FastString -> Int# -uniqueOfFS (FastString u# _ _) = u# -uniqueOfFS (UnicodeStr u# _) = u# +#ifndef __HADDOCK__ +foreign import ccall unsafe "ghc_memcmp" + memcmp :: Ptr a -> Ptr b -> Int -> IO Int +#endif -nilFS = mkFastString "" -\end{code} +-- ----------------------------------------------------------------------------- +-- Construction +{- Internally, the compiler will maintain a fast string symbol table, providing sharing and fast comparison. Creation of new @FastString@s then covertly does a lookup, re-using the @FastString@ if there was a hit. +-} -Caution: mkFastStringUnicode assumes that if the string is in the -table, it sits under the UnicodeStr constructor. Other mkFastString -variants analogously assume the FastString constructor. - -\begin{code} data FastStringTable = FastStringTable - Int# + {-# UNPACK #-} !Int (MutableArray# RealWorld [FastString]) -type FastStringTableVar = IORef FastStringTable - -string_table :: FastStringTableVar +string_table :: IORef FastStringTable string_table = - unsafePerformIO ( - stToIO (newSTArray (0::Int,hASH_TBL_SIZE) []) - >>= \ (STArray _ _ arr#) -> - newIORef (FastStringTable 0# arr#)) - -lookupTbl :: FastStringTable -> Int# -> IO [FastString] -lookupTbl (FastStringTable _ arr#) i# = - IO ( \ s# -> - readArray# arr# i# s#) - -updTbl :: FastStringTableVar -> FastStringTable -> Int# -> [FastString] -> IO () -updTbl fs_table_var (FastStringTable uid# arr#) i# ls = - IO (\ s# -> case writeArray# arr# i# ls s# of { s2# -> - (# s2#, () #) }) >> - writeIORef fs_table_var (FastStringTable (uid# +# 1#) arr#) + unsafePerformIO $ do + (STArray _ _ arr#) <- stToIO (newSTArray (0::Int,hASH_TBL_SIZE) []) + newIORef (FastStringTable 0 arr#) + +lookupTbl :: FastStringTable -> Int -> IO [FastString] +lookupTbl (FastStringTable _ arr#) (I# i#) = + IO $ \ s# -> readArray# arr# i# s# + +updTbl :: IORef FastStringTable -> FastStringTable -> Int -> [FastString] -> IO () +updTbl fs_table_var (FastStringTable uid arr#) (I# i#) ls = do + (IO $ \ s# -> case writeArray# arr# i# ls s# of { s2# -> (# s2#, () #) }) + writeIORef fs_table_var (FastStringTable (uid+1) arr#) mkFastString# :: Addr# -> FastString -mkFastString# a# = - case strLength (Ptr a#) of { (I# len#) -> mkFastStringLen# a# len# } +mkFastString# a# = mkFastStringBytes ptr (strLength ptr) + where ptr = Ptr a# -mkFastStringLen# :: Addr# -> Int# -> FastString -mkFastStringLen# a# len# = - unsafePerformIO ( - readIORef string_table >>= \ ft@(FastStringTable uid# tbl#) -> +mkFastStringBytes :: Ptr Word8 -> Int -> FastString +mkFastStringBytes ptr len = unsafePerformIO $ do + ft@(FastStringTable uid tbl#) <- readIORef string_table let - h = hashStr a# len# - in --- _trace ("hashed: "++show (I# h)) $ - lookupTbl ft h >>= \ lookup_result -> + h = hashStr ptr len + add_it ls = do + fs <- copyNewFastString uid ptr len + updTbl string_table ft h (fs:ls) + {- _trace ("new: " ++ show f_str) $ -} + return fs + -- + lookup_result <- lookupTbl ft h case lookup_result of - [] -> - -- no match, add it to table by copying out the - -- the string into a ByteArray - -- _trace "empty bucket" $ - case copyPrefixStr a# (I# len#) of - BA barr# -> - let f_str = FastString uid# len# barr# in - updTbl string_table ft h [f_str] >> - ({- _trace ("new: " ++ show f_str) $ -} return f_str) - ls -> - -- non-empty `bucket', scan the list looking - -- entry with same length and compare byte by byte. - -- _trace ("non-empty bucket"++show ls) $ - case bucket_match ls len# a# of - Nothing -> - case copyPrefixStr a# (I# len#) of - BA barr# -> - let f_str = FastString uid# len# barr# in - updTbl string_table ft h (f_str:ls) >> - ( {- _trace ("new: " ++ show f_str) $ -} return f_str) - Just v -> {- _trace ("re-use: "++show v) $ -} return v) - where - bucket_match [] _ _ = Nothing - bucket_match (v@(FastString _ l# ba#):ls) len# a# = - if len# ==# l# && eqStrPrefix a# ba# l# then - Just v - else - bucket_match ls len# a# - bucket_match (UnicodeStr _ _ : ls) len# a# = - bucket_match ls len# a# - -mkFastSubStringBA# :: ByteArray# -> Int# -> Int# -> FastString -mkFastSubStringBA# barr# start# len# = - unsafePerformIO ( - readIORef string_table >>= \ ft@(FastStringTable uid# tbl#) -> + [] -> add_it [] + ls -> do + b <- bucket_match ls len ptr + case b of + Nothing -> add_it ls + Just v -> {- _trace ("re-use: "++show v) $ -} return v + +mkZFastStringBytes :: Ptr Word8 -> Int -> FastString +mkZFastStringBytes ptr len = unsafePerformIO $ do + ft@(FastStringTable uid tbl#) <- readIORef string_table let - h = hashSubStrBA barr# start# len# - in --- _trace ("hashed(b): "++show (I# h)) $ - lookupTbl ft h >>= \ lookup_result -> + h = hashStr ptr len + add_it ls = do + fs <- copyNewZFastString uid ptr len + updTbl string_table ft h (fs:ls) + {- _trace ("new: " ++ show f_str) $ -} + return fs + -- + lookup_result <- lookupTbl ft h case lookup_result of - [] -> - -- no match, add it to table by copying out the - -- the string into a ByteArray - -- _trace "empty bucket(b)" $ - case copySubStrBA (BA barr#) (I# start#) (I# len#) of - BA ba# -> - let f_str = FastString uid# len# ba# in - updTbl string_table ft h [f_str] >> - -- _trace ("new(b): " ++ show f_str) $ - return f_str - ls -> - -- non-empty `bucket', scan the list looking - -- entry with same length and compare byte by byte. - -- _trace ("non-empty bucket(b)"++show ls) $ - case bucket_match ls start# len# barr# of - Nothing -> - case copySubStrBA (BA barr#) (I# start#) (I# len#) of - BA ba# -> - let f_str = FastString uid# len# ba# in - updTbl string_table ft h (f_str:ls) >> - -- _trace ("new(b): " ++ show f_str) $ - return f_str - Just v -> - -- _trace ("re-use(b): "++show v) $ - return v - ) - where - bucket_match [] _ _ _ = Nothing - bucket_match (v:ls) start# len# ba# = - case v of - FastString _ l# barr# -> - if len# ==# l# && eqStrPrefixBA barr# ba# start# len# then - Just v - else - bucket_match ls start# len# ba# - UnicodeStr _ _ -> bucket_match ls start# len# ba# - -mkFastStringUnicode :: [Int] -> FastString -mkFastStringUnicode s = - unsafePerformIO ( - readIORef string_table >>= \ ft@(FastStringTable uid# tbl#) -> + [] -> add_it [] + ls -> do + b <- bucket_match ls len ptr + case b of + Nothing -> add_it ls + Just v -> {- _trace ("re-use: "++show v) $ -} return v + +-- | Create a 'FastString' from an existing 'ForeignPtr'; the difference +-- between this and 'mkFastStringBytes' is that we don't have to copy +-- the bytes if the string is new to the table. +mkFastStringForeignPtr :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO FastString +mkFastStringForeignPtr ptr fp len = do + ft@(FastStringTable uid tbl#) <- readIORef string_table +-- _trace ("hashed: "++show (I# h)) $ let - h = hashUnicode s 0# - in --- _trace ("hashed(b): "++show (I# h)) $ - lookupTbl ft h >>= \ lookup_result -> + h = hashStr ptr len + add_it ls = do + fs <- mkNewFastString uid ptr fp len + updTbl string_table ft h (fs:ls) + {- _trace ("new: " ++ show f_str) $ -} + return fs + -- + lookup_result <- lookupTbl ft h + case lookup_result of + [] -> add_it [] + ls -> do + b <- bucket_match ls len ptr + case b of + Nothing -> add_it ls + Just v -> {- _trace ("re-use: "++show v) $ -} return v + +mkZFastStringForeignPtr :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO FastString +mkZFastStringForeignPtr ptr fp len = do + ft@(FastStringTable uid tbl#) <- readIORef string_table +-- _trace ("hashed: "++show (I# h)) $ + let + h = hashStr ptr len + add_it ls = do + fs <- mkNewZFastString uid ptr fp len + updTbl string_table ft h (fs:ls) + {- _trace ("new: " ++ show f_str) $ -} + return fs + -- + lookup_result <- lookupTbl ft h case lookup_result of - [] -> - -- no match, add it to table by copying out the - -- the string into a [Int] - let f_str = UnicodeStr uid# s in - updTbl string_table ft h [f_str] >> - -- _trace ("new(b): " ++ show f_str) $ - return f_str - ls -> - -- non-empty `bucket', scan the list looking - -- entry with same length and compare byte by byte. - -- _trace ("non-empty bucket(b)"++show ls) $ - case bucket_match ls of - Nothing -> - let f_str = UnicodeStr uid# s in - updTbl string_table ft h (f_str:ls) >> - -- _trace ("new(b): " ++ show f_str) $ - return f_str - Just v -> - -- _trace ("re-use(b): "++show v) $ - return v - ) - where - bucket_match [] = Nothing - bucket_match (v@(UnicodeStr _ s'):ls) = - if s' == s then Just v else bucket_match ls - bucket_match (FastString _ _ _ : ls) = bucket_match ls - -mkFastStringNarrow :: String -> FastString -mkFastStringNarrow str = - case packString str of { (I# len#, BA frozen#) -> - mkFastSubStringBA# frozen# 0# len# - } - {- 0-indexed array, len# == index to one beyond end of string, - i.e., (0,1) => empty string. -} + [] -> add_it [] + ls -> do + b <- bucket_match ls len ptr + case b of + Nothing -> add_it ls + Just v -> {- _trace ("re-use: "++show v) $ -} return v -mkFastString :: String -> FastString -mkFastString str = if all good str - then mkFastStringNarrow str - else mkFastStringUnicode (map ord str) - where - good c = c >= '\1' && c <= '\xFF' - -mkFastStringInt :: [Int] -> FastString -mkFastStringInt str = if all good str - then mkFastStringNarrow (map chr str) - else mkFastStringUnicode str - where - good c = c >= 1 && c <= 0xFF - -mkFastSubString :: Addr# -> Int -> Int -> FastString -mkFastSubString a# (I# start#) (I# len#) = - mkFastStringLen# (a# `plusAddr#` start#) len# -\end{code} -\begin{code} -hashStr :: Addr# -> Int# -> Int# +-- | Creates a UTF-8 encoded 'FastString' from a 'String' +mkFastString :: String -> FastString +mkFastString str = + inlinePerformIO $ do + let l = utf8EncodedLength str + buf <- mallocForeignPtrBytes l + withForeignPtr buf $ \ptr -> do + utf8EncodeString ptr str + mkFastStringForeignPtr ptr buf l + + +-- | Creates a Z-encoded 'FastString' from a 'String' +mkZFastString :: String -> FastString +mkZFastString str = + inlinePerformIO $ do + let l = Prelude.length str + buf <- mallocForeignPtrBytes l + withForeignPtr buf $ \ptr -> do + pokeCAString (castPtr ptr) str + mkZFastStringForeignPtr ptr buf l + +bucket_match [] _ _ = return Nothing +bucket_match (v@(FastString _ l _ buf _):ls) len ptr + | len == l = do + b <- cmpStringPrefix ptr buf len + if b then return (Just v) + else bucket_match ls len ptr + | otherwise = + bucket_match ls len ptr + +mkNewFastString uid ptr fp len = do + ref <- newIORef Nothing + n_chars <- countUTF8Chars ptr len + return (FastString uid len n_chars fp (UTF8Encoded ref)) + +mkNewZFastString uid ptr fp len = do + return (FastString uid len len fp ZEncoded) + + +copyNewFastString uid ptr len = do + fp <- copyBytesToForeignPtr ptr len + ref <- newIORef Nothing + n_chars <- countUTF8Chars ptr len + return (FastString uid len n_chars fp (UTF8Encoded ref)) + +copyNewZFastString uid ptr len = do + fp <- copyBytesToForeignPtr ptr len + return (FastString uid len len fp ZEncoded) + + +copyBytesToForeignPtr ptr len = do + fp <- mallocForeignPtrBytes len + withForeignPtr fp $ \ptr' -> copyBytes ptr' ptr len + return fp + +cmpStringPrefix :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO Bool +cmpStringPrefix ptr fp len = + withForeignPtr fp $ \ptr' -> do + r <- memcmp ptr ptr' len + return (r == 0) + + +hashStr :: Ptr Word8 -> Int -> Int -- use the Addr to produce a hash value between 0 & m (inclusive) -hashStr a# len# = loop 0# 0# +hashStr (Ptr a#) (I# len#) = loop 0# 0# where - loop h n | n ==# len# = h + loop h n | n ==# len# = I# h | otherwise = loop h2 (n +# 1#) where c = ord# (indexCharOffAddr# a# n) h2 = (c +# (h *# 128#)) `remInt#` hASH_TBL_SIZE# -hashSubStrBA :: ByteArray# -> Int# -> Int# -> Int# - -- use the byte array to produce a hash value between 0 & m (inclusive) -hashSubStrBA ba# start# len# = loop 0# 0# - where - loop h n | n ==# len# = h - | otherwise = loop h2 (n +# 1#) - where c = ord# (indexCharArray# ba# (start# +# n)) - h2 = (c +# (h *# 128#)) `remInt#` hASH_TBL_SIZE# +-- ----------------------------------------------------------------------------- +-- Operations -hashUnicode :: [Int] -> Int# -> Int# -hashUnicode [] h = h -hashUnicode (I# c : cs) h = hashUnicode cs ((c +# (h *# 128#)) `remInt#` hASH_TBL_SIZE#) -\end{code} +-- | Returns the length of the 'FastString' in characters +lengthFS :: FastString -> Int +lengthFS f = n_chars f -\begin{code} -cmpFS :: FastString -> FastString -> Ordering -cmpFS (UnicodeStr u1# s1) (UnicodeStr u2# s2) = if u1# ==# u2# then EQ - else compare s1 s2 -cmpFS (UnicodeStr _ s1) s2 = compare s1 (unpackIntFS s2) -cmpFS s1 (UnicodeStr _ s2) = compare (unpackIntFS s1) s2 -cmpFS (FastString u1# l1# b1#) (FastString u2# l2# b2#) = - if u1# ==# u2# then EQ else - let l# = if l1# <=# l2# then l1# else l2# in - unsafePerformIO ( - memcmp b1# b2# l# >>= \ (I# res) -> - return ( - if res <# 0# then LT - else if res ==# 0# then - if l1# ==# l2# then EQ - else if l1# <# l2# then LT else GT - else GT - )) +-- | Returns 'True' if the 'FastString' is Z-encoded +isZEncoded :: FastString -> Bool +isZEncoded fs | ZEncoded <- enc fs = True + | otherwise = False -#ifndef __HADDOCK__ -foreign import ccall unsafe "ghc_memcmp" - memcmp :: ByteArray# -> ByteArray# -> Int# -> IO Int -#endif +-- | Returns 'True' if the 'FastString' is empty +nullFS :: FastString -> Bool +nullFS f = n_bytes f == 0 + +-- | unpacks and decodes the FastString +unpackFS :: FastString -> String +unpackFS (FastString _ n_bytes _ buf enc) = + inlinePerformIO $ withForeignPtr buf $ \ptr -> + case enc of + ZEncoded -> peekCAStringLen (castPtr ptr,n_bytes) + UTF8Encoded _ -> utf8DecodeString ptr n_bytes + +bytesFS :: FastString -> [Word8] +bytesFS (FastString _ n_bytes _ buf enc) = + inlinePerformIO $ withForeignPtr buf $ \ptr -> + peekArray n_bytes ptr + +-- | returns a Z-encoded version of a 'FastString'. This might be the +-- original, if it was already Z-encoded. The first time this +-- function is applied to a particular 'FastString', the results are +-- memoized. +-- +zEncodeFS :: FastString -> FastString +zEncodeFS fs@(FastString uid n_bytes _ fp enc) = + case enc of + ZEncoded -> fs + UTF8Encoded ref -> + inlinePerformIO $ do + m <- readIORef ref + case m of + Just fs -> return fs + Nothing -> do + let efs = mkZFastString (zEncodeString (unpackFS fs)) + writeIORef ref (Just efs) + return efs + +appendFS :: FastString -> FastString -> FastString +appendFS fs1 fs2 = mkFastString (unpackFS fs1 ++ unpackFS fs2) + +concatFS :: [FastString] -> FastString +concatFS ls = mkFastString (Prelude.concat (map unpackFS ls)) -- ToDo: do better + +headFS :: FastString -> Char +headFS (FastString _ n_bytes _ buf enc) = + inlinePerformIO $ withForeignPtr buf $ \ptr -> do + case enc of + ZEncoded -> do + w <- peek (castPtr ptr) + return (castCCharToChar w) + UTF8Encoded _ -> + return (fst (utf8DecodeChar ptr)) + +tailFS :: FastString -> FastString +tailFS (FastString _ n_bytes _ buf enc) = + inlinePerformIO $ withForeignPtr buf $ \ptr -> do + case enc of + ZEncoded -> do + return $! mkZFastStringBytes (ptr `plusPtr` 1) (n_bytes - 1) + UTF8Encoded _ -> do + let (_,ptr') = utf8DecodeChar ptr + let off = ptr' `minusPtr` ptr + return $! mkFastStringBytes (ptr `plusPtr` off) (n_bytes - off) + +consFS :: Char -> FastString -> FastString +consFS c fs = mkFastString (c : unpackFS fs) + +uniqueOfFS :: FastString -> Int# +uniqueOfFS (FastString (I# u#) _ _ _ _) = u# + +nilFS = mkFastString "" -- ----------------------------------------------------------------------------- -- Outputting 'FastString's -#if __GLASGOW_HASKELL__ >= 504 - --- this is our own version of hPutBuf for FastStrings, because in --- 5.04+ we don't have mutable byte arrays and therefore hPutBufBA. --- The closest is hPutArray in Data.Array.IO, but that does some extra --- range checks that we want to avoid here. - -foreign import ccall unsafe "__hscore_memcpy_dst_off" - memcpy_baoff_ba :: RawBuffer -> Int -> RawBuffer -> CSize -> IO (Ptr ()) - -hPutFS handle (FastString _ l# ba#) - | l# ==# 0# = return () - | otherwise - = do wantWritableHandle "hPutFS" handle $ - \ handle_@Handle__{ haFD=fd, haBuffer=ref, haIsStream=stream } -> do - - old_buf@Buffer{ bufBuf=old_raw, bufRPtr=r, bufWPtr=w, bufSize=size } - <- readIORef ref - - let count = I# l# - raw = unsafeCoerce# ba# :: MutableByteArray# RealWorld - - -- enough room in handle buffer? - if (size - w > count) - -- There's enough room in the buffer: - -- just copy the data in and update bufWPtr. - then do memcpy_baoff_ba old_raw w raw (fromIntegral count) - writeIORef ref old_buf{ bufWPtr = w + count } - return () - - -- else, we have to flush - else do flushed_buf <- flushWriteBuffer fd stream old_buf - writeIORef ref flushed_buf - let this_buf = - Buffer{ bufBuf=raw, bufState=WriteBuffer, - bufRPtr=0, bufWPtr=count, bufSize=count } - flushWriteBuffer fd stream this_buf - return () - -#else - -hPutFS :: Handle -> FastString -> IO () -hPutFS handle (FastString _ l# ba#) - | l# ==# 0# = return () - | otherwise = do mba <- stToIO $ unsafeThawByteArray (ByteArray (bot::Int) bot ba#) - hPutBufBAFull handle mba (I# l#) - where - bot = error "hPutFS.ba" - -#endif +-- |Outputs a 'FastString' with /no decoding at all/, that is, you +-- get the actual bytes in the 'FastString' written to the 'Handle'. +hPutFS handle (FastString _ len _ fp _) + | len == 0 = return () + | otherwise = do withForeignPtr fp $ \ptr -> hPutBuf handle ptr len --- ONLY here for debugging the NCG (so -ddump-stix works for string --- literals); no idea if this is really necessary. JRS, 010131 -hPutFS handle (UnicodeStr _ is) - = hPutStr handle ("(UnicodeStr " ++ show is ++ ")") +-- ToDo: we'll probably want an hPutFSLocal, or something, to output +-- in the current locale's encoding (for error messages and suchlike). -- ----------------------------------------------------------------------------- -- LitStrings, here for convenience only. @@ -492,4 +436,24 @@ type LitString = Ptr () mkLitString# :: Addr# -> LitString mkLitString# a# = Ptr a# + +foreign import ccall unsafe "ghc_strlen" + strLength :: Ptr () -> Int + +-- ----------------------------------------------------------------------------- +-- under the carpet + +-- Just like unsafePerformIO, but we inline it. +{-# INLINE inlinePerformIO #-} +inlinePerformIO :: IO a -> a +inlinePerformIO (IO m) = case m realWorld# of (# _, r #) -> r + +pokeCAString :: Ptr CChar -> String -> IO () +pokeCAString ptr str = + let + go [] n = pokeElemOff ptr n 0 + go (c:cs) n = do pokeElemOff ptr n (castCharToCChar c); go cs (n+1) + in + go str 0 + \end{code} diff --git a/ghc/compiler/utils/FastTypes.lhs b/ghc/compiler/utils/FastTypes.lhs index 9f9d903..bb92c8c 100644 --- a/ghc/compiler/utils/FastTypes.lhs +++ b/ghc/compiler/utils/FastTypes.lhs @@ -9,7 +9,7 @@ module FastTypes ( (+#), (-#), (*#), quotFastInt, negateFastInt, (==#), (<#), (<=#), (>=#), (>#), - FastBool, fastBool, isFastTrue, fastOr + FastBool, fastBool, isFastTrue, fastOr, fastAnd ) where #include "HsVersions.h" diff --git a/ghc/compiler/utils/Pretty.lhs b/ghc/compiler/utils/Pretty.lhs index 916755e..ec8f1e7 100644 --- a/ghc/compiler/utils/Pretty.lhs +++ b/ghc/compiler/utils/Pretty.lhs @@ -179,26 +179,16 @@ module Pretty ( import BufWrite import FastString -import PrimPacked ( strLength ) import GLAEXTS import Numeric (fromRat) import IO -#if __GLASGOW_HASKELL__ < 503 -import IOExts ( hPutBufFull ) -#else import System.IO ( hPutBuf ) -#endif -#if __GLASGOW_HASKELL__ < 503 -import PrelBase ( unpackCString# ) -#else import GHC.Base ( unpackCString# ) -#endif - -import PrimPacked ( Ptr(..) ) +import GHC.Ptr ( Ptr(..) ) -- Don't import Util( assertPanic ) because it makes a loop in the module structure diff --git a/ghc/compiler/utils/PrimPacked.lhs b/ghc/compiler/utils/PrimPacked.lhs deleted file mode 100644 index f2d034d..0000000 --- a/ghc/compiler/utils/PrimPacked.lhs +++ /dev/null @@ -1,265 +0,0 @@ -% -% (c) The GRASP/AQUA Project, Glasgow University, 1997-1998 -% -\section{Basic ops on packed representations} - -Some basic operations for working on packed representations of series -of bytes (character strings). Used by the interface lexer input -subsystem, mostly. - -\begin{code} -{-# OPTIONS -optc-DNON_POSIX_SOURCE #-} - -module PrimPacked ( - Ptr(..), nullPtr, plusAddr#, - BA(..), - packString, -- :: String -> (Int, BA) - unpackNBytesBA, -- :: BA -> Int -> [Char] - strLength, -- :: Ptr CChar -> Int - copyPrefixStr, -- :: Addr# -> Int -> BA - copySubStrBA, -- :: BA -> Int -> Int -> BA - eqStrPrefix, -- :: Addr# -> ByteArray# -> Int# -> Bool - eqStrPrefixBA, -- :: ByteArray# -> ByteArray# -> Int# -> Int# -> Bool - ) where - --- This #define suppresses the "import FastString" that --- HsVersions otherwise produces -#define COMPILING_FAST_STRING -#include "HsVersions.h" - -import GLAEXTS -import UNSAFE_IO ( unsafePerformIO ) - -import MONAD_ST -import Foreign - -#if __GLASGOW_HASKELL__ < 503 -import PrelST -#else -import GHC.ST -#endif - -#if __GLASGOW_HASKELL__ >= 504 -import GHC.Ptr ( Ptr(..) ) -#elif __GLASGOW_HASKELL__ >= 500 -import Ptr ( Ptr(..) ) -#endif - -#if __GLASGOW_HASKELL__ < 504 -import PrelIOBase ( IO(..) ) -#else -import GHC.IOBase ( IO(..) ) -#endif -\end{code} - -Compatibility: 4.08 didn't have the Ptr type. - -\begin{code} -#if __GLASGOW_HASKELL__ <= 408 -data Ptr a = Ptr Addr# deriving (Eq, Ord) - -nullPtr :: Ptr a -nullPtr = Ptr (int2Addr# 0#) -#endif - -#if __GLASGOW_HASKELL__ <= 500 --- plusAddr# is a primop in GHC > 5.00 -plusAddr# :: Addr# -> Int# -> Addr# -plusAddr# a# i# = int2Addr# (addr2Int# a# +# i#) -#endif -\end{code} - -Wrapper types for bytearrays - -\begin{code} -data BA = BA ByteArray# -data MBA s = MBA (MutableByteArray# s) -\end{code} - -\begin{code} -packString :: String -> (Int, BA) -packString str = (l, arr) - where - l@(I# length#) = length str - - arr = runST (do - ch_array <- new_ps_array length# - -- fill in packed string from "str" - fill_in ch_array 0# str - -- freeze the puppy: - freeze_ps_array ch_array length# - ) - - fill_in :: MBA s -> Int# -> [Char] -> ST s () - fill_in arr_in# idx [] = - return () - fill_in arr_in# idx (C# c : cs) = - write_ps_array arr_in# idx c >> - fill_in arr_in# (idx +# 1#) cs -\end{code} - -Unpacking a string - -\begin{code} -unpackNBytesBA :: BA -> Int -> [Char] -unpackNBytesBA (BA bytes) (I# len) - = unpack 0# - where - unpack nh - | nh >=# len = [] - | otherwise = C# ch : unpack (nh +# 1#) - where - ch = indexCharArray# bytes nh -\end{code} - -Copying a char string prefix into a byte array. - -\begin{code} -copyPrefixStr :: Addr# -> Int -> BA -copyPrefixStr a# len@(I# length#) = copy' length# - where - copy' length# = runST (do - {- allocate an array that will hold the string - -} - ch_array <- new_ps_array length# - {- Revert back to Haskell-only solution for the moment. - _ccall_ memcpy ch_array (A# a) len >>= \ () -> - write_ps_array ch_array length# (chr# 0#) >> - -} - -- fill in packed string from "addr" - fill_in ch_array 0# - -- freeze the puppy: - freeze_ps_array ch_array length# - ) - - fill_in :: MBA s -> Int# -> ST s () - fill_in arr_in# idx - | idx ==# length# - = return () - | otherwise - = case (indexCharOffAddr# a# idx) of { ch -> - write_ps_array arr_in# idx ch >> - fill_in arr_in# (idx +# 1#) } -\end{code} - -Copying out a substring, assume a 0-indexed string: -(and positive lengths, thank you). - -\begin{code} -#ifdef UNUSED -copySubStr :: Addr# -> Int -> Int -> BA -copySubStr a# (I# start#) length = - copyPrefixStr (a# `plusAddr#` start#) length -#endif - -copySubStrBA :: BA -> Int -> Int -> BA -copySubStrBA (BA barr#) (I# start#) len@(I# length#) = ba - where - ba = runST (do - -- allocate an array that will hold the string - ch_array <- new_ps_array length# - -- fill in packed string from "addr" - fill_in ch_array 0# - -- freeze the puppy: - freeze_ps_array ch_array length# - ) - - fill_in :: MBA s -> Int# -> ST s () - fill_in arr_in# idx - | idx ==# length# - = return () - | otherwise - = case (indexCharArray# barr# (start# +# idx)) of { ch -> - write_ps_array arr_in# idx ch >> - fill_in arr_in# (idx +# 1#) } -\end{code} - -(Very :-) ``Specialised'' versions of some CharArray things... -[Copied from PackBase; no real reason -- UGH] - -\begin{code} -new_ps_array :: Int# -> ST s (MBA s) -write_ps_array :: MBA s -> Int# -> Char# -> ST s () -freeze_ps_array :: MBA s -> Int# -> ST s BA - -#if __GLASGOW_HASKELL__ < 411 -#define NEW_BYTE_ARRAY newCharArray# -#else -#define NEW_BYTE_ARRAY newPinnedByteArray# -#endif - -new_ps_array size = ST $ \ s -> - case (NEW_BYTE_ARRAY size s) of { (# s2#, barr# #) -> - (# s2#, MBA barr# #) } - -write_ps_array (MBA barr#) n ch = ST $ \ s# -> - case writeCharArray# barr# n ch s# of { s2# -> - (# s2#, () #) } - --- same as unsafeFreezeByteArray -freeze_ps_array (MBA arr#) len# = ST $ \ s# -> - case unsafeFreezeByteArray# arr# s# of { (# s2#, frozen# #) -> - (# s2#, BA frozen# #) } -\end{code} - - -Compare two equal-length strings for equality: - -\begin{code} -eqStrPrefix :: Addr# -> ByteArray# -> Int# -> Bool -eqStrPrefix a# barr# len# = - inlinePerformIO $ do - x <- memcmp_ba a# barr# (I# len#) - return (x == 0) - -#ifdef UNUSED -eqCharStrPrefix :: Addr# -> Addr# -> Int# -> Bool -eqCharStrPrefix a1# a2# len# = - inlinePerformIO $ do - x <- memcmp a1# a2# (I# len#) - return (x == 0) -#endif - -eqStrPrefixBA :: ByteArray# -> ByteArray# -> Int# -> Int# -> Bool -eqStrPrefixBA b1# b2# start# len# = - inlinePerformIO $ do - x <- memcmp_baoff_ba b2# (I# start#) b1# (I# len#) - return (x == 0) - -#ifdef UNUSED -eqCharStrPrefixBA :: Addr# -> ByteArray# -> Int# -> Int# -> Bool -eqCharStrPrefixBA a# b2# start# len# = - inlinePerformIO $ do - x <- memcmp_baoff b2# (I# start#) a# (I# len#) - return (x == 0) -#endif -\end{code} - -\begin{code} --- Just like unsafePerformIO, but we inline it. This is safe when --- there are no side effects, and improves performance. -{-# INLINE inlinePerformIO #-} -inlinePerformIO :: IO a -> a -inlinePerformIO (IO m) = case m realWorld# of (# _, r #) -> r - -#if __GLASGOW_HASKELL__ <= 408 -strLength (Ptr a#) = ghc_strlen a# -foreign import ccall unsafe "ghc_strlen" - ghc_strlen :: Addr# -> Int -#else -foreign import ccall unsafe "ghc_strlen" - strLength :: Ptr () -> Int -#endif - -foreign import ccall unsafe "ghc_memcmp" - memcmp :: Addr# -> Addr# -> Int -> IO Int - -foreign import ccall unsafe "ghc_memcmp" - memcmp_ba :: Addr# -> ByteArray# -> Int -> IO Int - -foreign import ccall unsafe "ghc_memcmp_off" - memcmp_baoff :: ByteArray# -> Int -> Addr# -> Int -> IO Int - -foreign import ccall unsafe "ghc_memcmp_off" - memcmp_baoff_ba :: ByteArray# -> Int -> ByteArray# -> Int -> IO Int -\end{code} diff --git a/ghc/compiler/utils/StringBuffer.lhs b/ghc/compiler/utils/StringBuffer.lhs index e53dbc8..e2eed88 100644 --- a/ghc/compiler/utils/StringBuffer.lhs +++ b/ghc/compiler/utils/StringBuffer.lhs @@ -6,27 +6,32 @@ Buffers for scanning string input stored in external arrays. \begin{code} +{-# OPTIONS_GHC -O #-} +-- always optimise this module, it's critical + module StringBuffer ( StringBuffer(..), -- non-abstract for vs\/HaskellService -- * Creation\/destruction - hGetStringBuffer, -- :: FilePath -> IO StringBuffer - stringToStringBuffer, -- :: String -> IO StringBuffer + hGetStringBuffer, + stringToStringBuffer, - -- * Lookup - currentChar, -- :: StringBuffer -> Char - prevChar, -- :: StringBuffer -> Char -> Char - lookAhead, -- :: StringBuffer -> Int -> Char - atEnd, -- :: StringBuffer -> Bool + -- * Inspection + nextChar, + currentChar, + prevChar, + atEnd, - -- * Moving - stepOn, stepOnBy, + -- * Moving and comparison + stepOn, + offsetBytes, + byteDiff, -- * Conversion - lexemeToString, -- :: StringBuffer -> Int -> String - lexemeToFastString, -- :: StringBuffer -> Int -> FastString + lexemeToString, + lexemeToFastString, -- * Parsing integers parseInteger, @@ -34,22 +39,19 @@ module StringBuffer #include "HsVersions.h" -import FastString -import Panic +import Encoding +import FastString (FastString,mkFastString,mkFastStringBytes) import GLAEXTS import Foreign -#if __GLASGOW_HASKELL__ < 503 -import PrelIOBase -import PrelHandle -#else -import GHC.IOBase -import GHC.IO ( slurpFile ) -#endif +import GHC.IOBase ( IO(..) ) +import GHC.Base ( unsafeChr ) + +import System.IO ( hGetBuf ) -import IO ( openFile, hFileSize, IOMode(ReadMode), +import IO ( hFileSize, IOMode(ReadMode), hClose ) #if __GLASGOW_HASKELL__ >= 601 import System.IO ( openBinaryFile ) @@ -57,37 +59,35 @@ import System.IO ( openBinaryFile ) import IOExts ( openFileEx, IOModeEx(..) ) #endif -#if __GLASGOW_HASKELL__ < 503 -import IArray ( listArray ) -import ArrayBase ( UArray(..) ) -import MutableArray -import IOExts ( hGetBufBA ) -#else -import Data.Array.IArray ( listArray ) -import Data.Array.MArray ( unsafeFreeze, newArray_ ) -import Data.Array.Base ( UArray(..) ) -import Data.Array.IO ( IOArray, hGetArray ) -#endif - -import Char ( ord ) - #if __GLASGOW_HASKELL__ < 601 openBinaryFile fp mode = openFileEx fp (BinaryMode mode) #endif + -- ----------------------------------------------------------------------------- -- The StringBuffer type --- A StringBuffer is a ByteArray# with a pointer into it. We also cache --- the length of the ByteArray# for speed. - +-- |A StringBuffer is an internal pointer to a sized chunk of bytes. +-- The bytes are intended to be *immutable*. There are pure +-- operations to read the contents of a StringBuffer. +-- +-- A StringBuffer may have a finalizer, depending on how it was +-- obtained. +-- data StringBuffer - = StringBuffer - ByteArray# - Int# -- length - Int# -- current pos + = StringBuffer { + buf :: {-# UNPACK #-} !(ForeignPtr Word8), + len :: {-# UNPACK #-} !Int, -- length + cur :: {-# UNPACK #-} !Int -- current pos + } + -- The buffer is assumed to be UTF-8 encoded, and furthermore + -- we add three '\0' bytes to the end as sentinels so that the + -- decoder doesn't have to check for overflow at every single byte + -- of a multibyte sequence. instance Show StringBuffer where - showsPrec _ s = showString "" + showsPrec _ s = showString "" -- ----------------------------------------------------------------------------- -- Creation / Destruction @@ -95,97 +95,108 @@ instance Show StringBuffer where hGetStringBuffer :: FilePath -> IO StringBuffer hGetStringBuffer fname = do h <- openBinaryFile fname ReadMode - size <- hFileSize h - let size_i@(I# sz#) = fromIntegral size -#if __GLASGOW_HASKELL__ < 503 - arr <- stToIO (newCharArray (0,size_i-1)) - r <- hGetBufBA h arr size_i -#else - arr <- newArray_ (0,size_i-1) - r <- if size_i == 0 then return 0 else hGetArray h arr size_i -#endif - hClose h - if (r /= size_i) + size_i <- hFileSize h + let size = fromIntegral size_i + buf <- mallocForeignPtrArray (size+3) + withForeignPtr buf $ \ptr -> do + r <- if size == 0 then return 0 else hGetBuf h ptr size + hClose h + if (r /= size) then ioError (userError "short read of file") else do -#if __GLASGOW_HASKELL__ < 503 - frozen <- stToIO (unsafeFreezeByteArray arr) - case frozen of - ByteArray _ _ bytearr# -> return (StringBuffer bytearr# sz# 0#) -#else - frozen <- unsafeFreeze arr - case frozen of - UArray _ _ bytearr# -> return (StringBuffer bytearr# sz# 0#) -#endif + pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0] + -- sentinels for UTF-8 decoding + return (StringBuffer buf size 0) -#if __GLASGOW_HASKELL__ >= 502 +stringToStringBuffer :: String -> IO StringBuffer stringToStringBuffer str = do - let size@(I# sz#) = length str - arr = listArray (0,size-1) (map (fromIntegral.ord) str) - :: UArray Int Word8 - case arr of - UArray _ _ bytearr# -> return (StringBuffer bytearr# sz# 0#) -#else -stringToStringBuffer = panic "stringToStringBuffer: not implemented" -#endif + let size = utf8EncodedLength str + buf <- mallocForeignPtrArray (size+3) + withForeignPtr buf $ \ptr -> do + utf8EncodeString ptr str + pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0] + -- sentinels for UTF-8 decoding + return (StringBuffer buf size 0) -- ----------------------------------------------------------------------------- --- Lookup - -currentChar :: StringBuffer -> Char -currentChar (StringBuffer arr# l# current#) = - ASSERT(current# <# l#) - C# (indexCharArray# arr# current#) +-- Grab a character + +-- Getting our fingers dirty a little here, but this is performance-critical +{-# INLINE nextChar #-} +nextChar :: StringBuffer -> (Char,StringBuffer) +nextChar (StringBuffer buf len (I# cur#)) = + inlinePerformIO $ do + withForeignPtr buf $ \(Ptr a#) -> do + case utf8DecodeChar# (a# `plusAddr#` cur#) of + (# c#, b# #) -> + let cur' = I# (b# `minusAddr#` a#) in + return (C# c#, StringBuffer buf len cur') + +currentChar :: StringBuffer -> Char +currentChar = fst . nextChar prevChar :: StringBuffer -> Char -> Char -prevChar (StringBuffer _ _ 0#) deflt = deflt -prevChar s deflt = lookAhead s (-1) - -lookAhead :: StringBuffer -> Int -> Char -lookAhead (StringBuffer arr# l# c#) (I# i#) = - ASSERT(off <# l# && off >=# 0#) - C# (indexCharArray# arr# off) - where - off = c# +# i# +prevChar (StringBuffer buf len 0) deflt = deflt +prevChar (StringBuffer buf len cur) deflt = + inlinePerformIO $ do + withForeignPtr buf $ \p -> do + p' <- utf8PrevChar (p `plusPtr` cur) + return (fst (utf8DecodeChar p')) -- ----------------------------------------------------------------------------- -- Moving stepOn :: StringBuffer -> StringBuffer -stepOn s = stepOnBy 1 s +stepOn s = snd (nextChar s) + +offsetBytes :: Int -> StringBuffer -> StringBuffer +offsetBytes i s = s { cur = cur s + i } -stepOnBy :: Int -> StringBuffer -> StringBuffer -stepOnBy (I# i#) (StringBuffer fo# l# c#) = StringBuffer fo# l# (c# +# i#) +byteDiff :: StringBuffer -> StringBuffer -> Int +byteDiff s1 s2 = cur s2 - cur s1 atEnd :: StringBuffer -> Bool -atEnd (StringBuffer _ l# c#) = l# ==# c# +atEnd (StringBuffer _ l c) = l == c -- ----------------------------------------------------------------------------- -- Conversion -lexemeToString :: StringBuffer -> Int -> String +lexemeToString :: StringBuffer -> Int {-bytes-} -> String lexemeToString _ 0 = "" -lexemeToString (StringBuffer arr# _ current#) (I# len#) = unpack current# - where - end = current# +# len# +lexemeToString (StringBuffer buf _ cur) bytes = + inlinePerformIO $ + withForeignPtr buf $ \ptr -> + utf8DecodeString (ptr `plusPtr` cur) bytes - unpack nh - | nh >=# end = [] - | otherwise = C# ch : unpack (nh +# 1#) - where - ch = indexCharArray# arr# nh - -lexemeToFastString :: StringBuffer -> Int -> FastString +lexemeToFastString :: StringBuffer -> Int {-bytes-} -> FastString lexemeToFastString _ 0 = mkFastString "" -lexemeToFastString (StringBuffer fo _ current#) (I# len) = - mkFastSubStringBA# fo current# len +lexemeToFastString (StringBuffer buf _ cur) len = + inlinePerformIO $ + withForeignPtr buf $ \ptr -> + return $! mkFastStringBytes (ptr `plusPtr` cur) len -- ----------------------------------------------------------------------------- -- Parsing integer strings in various bases +byteOff :: StringBuffer -> Int -> Char +byteOff (StringBuffer buf _ cur) i = + inlinePerformIO $ withForeignPtr buf $ \ptr -> do + w <- peek (ptr `plusPtr` (cur+i)) + return (unsafeChr (fromIntegral (w::Word8))) + +-- | XXX assumes ASCII digits only parseInteger :: StringBuffer -> Int -> Integer -> (Char->Int) -> Integer parseInteger buf len radix to_int = go 0 0 where go i x | i == len = x - | otherwise = go (i+1) (x * radix + toInteger (to_int (lookAhead buf i))) + | otherwise = go (i+1) (x * radix + toInteger (to_int (byteOff buf i))) + +-- ----------------------------------------------------------------------------- +-- under the carpet + +-- Just like unsafePerformIO, but we inline it. +{-# INLINE inlinePerformIO #-} +inlinePerformIO :: IO a -> a +inlinePerformIO (IO m) = case m realWorld# of (# _, r #) -> r + \end{code} diff --git a/ghc/compiler/utils/UnicodeUtil.lhs b/ghc/compiler/utils/UnicodeUtil.lhs deleted file mode 100644 index 56e95a5..0000000 --- a/ghc/compiler/utils/UnicodeUtil.lhs +++ /dev/null @@ -1,36 +0,0 @@ -Various Unicode-related utilities. - -\begin{code} -module UnicodeUtil( - stringToUtf8, intsToUtf8 - ) where - -#include "HsVersions.h" - -import Panic ( panic ) -import Char ( chr, ord ) -\end{code} - -\begin{code} -stringToUtf8 :: String -> String -stringToUtf8 s = intsToUtf8 (map ord s) - -intsToUtf8 :: [Int] -> String -intsToUtf8 [] = "" -intsToUtf8 (c:s) - | c >= 1 && c <= 0x7F = chr c : intsToUtf8 s - | c < 0 = panic ("charToUtf8 ("++show c++")") - | c <= 0x7FF = chr (0xC0 + c `div` 0x40 ) : - chr (0x80 + c `mod` 0x40) : - intsToUtf8 s - | c <= 0xFFFF = chr (0xE0 + c `div` 0x1000 ) : - chr (0x80 + c `div` 0x40 `mod` 0x40) : - chr (0x80 + c `mod` 0x40) : - intsToUtf8 s - | c <= 0x10FFFF = chr (0xF0 + c `div` 0x40000 ) : - chr (0x80 + c `div` 0x1000 `mod` 0x40) : - chr (0x80 + c `div` 0x40 `mod` 0x40) : - chr (0x80 + c `mod` 0x40) : - intsToUtf8 s - | otherwise = panic ("charToUtf8 "++show c) -\end{code} -- 1.7.10.4