ghc/compiler/reader/Lex.lhs

   1 %
   2 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1996
   3 %
   4 \section[Lexical analysis]{Lexical analysis}
   5
   6 \begin{code}
   7 #include "HsVersions.h"
   8
   9 module Lex (
  10
  11         isLexCon, isLexVar, isLexId, isLexSym,
  12         isLexConId, isLexConSym, isLexVarId, isLexVarSym,
  13         mkTupNameStr, ifaceParseErr,
  14
  15         -- Monad for parser
  16         IfaceToken(..), lexIface, SYN_IE(IfM), thenIf, returnIf, happyError,
  17         StringBuffer
  18
  19     ) where
  20
  21
  22 IMPORT_1_3(Char(isDigit, isAlpha, isAlphanum, isUpper,isLower, isSpace, ord))
  23
  24 #if defined(__GLASGOW_HASKELL__) && __GLASGOW_HASKELL__ <= 201
  25 IMPORT_DELOOPER(Ubiq)
  26 IMPORT_DELOOPER(IdLoop)    -- get the CostCentre type&constructors from here
  27 #else
  28 import {-# SOURCE #-} CostCentre
  29 # if __GLASGOW_HASKELL__ == 202
  30 import PrelBase ( Char(..) )
  31 # endif
  32 #endif
  33
  34 import CmdLineOpts      ( opt_IgnoreIfacePragmas )
  35 import Demand           ( Demand(..) {- instance Read -} )
  36 import UniqFM           ( UniqFM, listToUFM, lookupUFM)
  37 import BasicTypes       ( NewOrData(..), IfaceFlavour(..) )
  38
  39 #if __GLASGOW_HASKELL__ >= 202
  40 import Maybes           ( MaybeErr(..) )
  41 #else
  42 import Maybes           ( Maybe(..), MaybeErr(..) )
  43 #endif
  44 import Pretty
  45
  46
  47
  48 import ErrUtils         ( Error(..) )
  49 import Outputable       ( Outputable(..), PprStyle(..) )
  50 import Util             ( nOfThem, panic )
  51
  52 import FastString
  53 import StringBuffer
  54
  55 #if __GLASGOW_HASKELL__ <= 201
  56 import PreludeGlaST
  57 #else
  58 import GlaExts
  59 #endif
  60 \end{code}
  61
  62 %************************************************************************
  63 %*                                                                      *
  64 \subsection{Lexical categories}
  65 %*                                                                      *
  66 %************************************************************************
  67
  68 These functions test strings to see if they fit the lexical categories
  69 defined in the Haskell report.  Normally applied as in e.g. @isCon
  70 (getLocalName foo)@.
  71
  72 \begin{code}
  73 isLexCon, isLexVar, isLexId, isLexSym, isLexConId, isLexConSym,
  74  isLexVarId, isLexVarSym  :: FAST_STRING -> Bool
  75
  76 isLexCon cs = isLexConId  cs || isLexConSym cs
  77 isLexVar cs = isLexVarId  cs || isLexVarSym cs
  78
  79 isLexId  cs = isLexConId  cs || isLexVarId  cs
  80 isLexSym cs = isLexConSym cs || isLexVarSym cs
  81
  82 -------------
  83
  84 isLexConId cs
  85   | _NULL_ cs        = False
  86   | cs == SLIT("[]") = True
  87   | c  == '('        = True     -- (), (,), (,,), ...
  88   | otherwise        = isUpper c || isUpperISO c
  89   where
  90     c = _HEAD_ cs
  91
  92 isLexVarId cs
  93   | _NULL_ cs    = False
  94   | otherwise    = isLower c || isLowerISO c
  95   where
  96     c = _HEAD_ cs
  97
  98 isLexConSym cs
  99   | _NULL_ cs   = False
 100   | otherwise   = c  == ':'
 101                || cs == SLIT("->")
 102   where
 103     c = _HEAD_ cs
 104
 105 isLexVarSym cs
 106   | _NULL_ cs = False
 107   | otherwise = isSymbolASCII c
 108              || isSymbolISO c
 109   where
 110     c = _HEAD_ cs
 111
 112 -------------
 113 isSymbolASCII c = c `elem` "!#$%&*+./<=>?@\\^|~-"
 114 isSymbolISO   c = ord c `elem` (0xd7 : 0xf7 : [0xa1 .. 0xbf])
 115 isUpperISO    (C# c#) = c# `geChar#` '\xc0'# && c# `leChar#` '\xde'# && c# `neChar#` '\xd7'#
 116 --0xc0 <= oc && oc <= 0xde && oc /= 0xd7 where oc = ord c
 117 isLowerISO    (C# c#) = c# `geChar#` '\xdf'# && c# `leChar#` '\xff'# && c# `neChar#` '\xf7'#
 118 --0xdf <= oc && oc <= 0xff && oc /= 0xf7 where oc = ord c
 119 \end{code}
 120
 121
 122 %************************************************************************
 123 %*                                                                      *
 124 \subsection{Tuple strings -- ugh!}
 125 %*                                                                      *
 126 %************************************************************************
 127
 128 \begin{code}
 129 mkTupNameStr 0 = SLIT("()")
 130 mkTupNameStr 1 = panic "Name.mkTupNameStr: 1 ???"
 131 mkTupNameStr 2 = _PK_ "(,)"   -- not strictly necessary
 132 mkTupNameStr 3 = _PK_ "(,,)"  -- ditto
 133 mkTupNameStr 4 = _PK_ "(,,,)" -- ditto
 134 mkTupNameStr n = _PK_ ("(" ++ nOfThem (n-1) ',' ++ ")")
 135 \end{code}
 136
 137
 138
 139 %************************************************************************
 140 %*                                                                      *
 141 \subsection{Data types}
 142 %*                                                                      *
 143 %************************************************************************
 144
 145 The token data type, fairly un-interesting except from two constructors,
 146 @ITidinfo@ and @ITtype@, which are used to lazily lex id info (arity,
 147 strictness, unfolding etc) and types for id decls.
 148
 149 The Idea/Observation here is that the renamer needs to scan through
 150 all of an interface file before it can continue. But only a fraction
 151 of the information contained in the file turns out to be useful, so
 152 delaying as much as possible of the scanning and parsing of an
 153 interface file Makes Sense (Heap profiles of the compiler
 154 show at a reduction in heap usage by at least a factor of two,
 155 post-renamer).
 156
 157 Hence, the interface file lexer spots when value declarations are
 158 being scanned and return the @ITidinfo@ and @ITtype@ constructors
 159 for the type and any other id info for that binding (unfolding, strictness
 160 etc). These constructors are applied to the result of lexing these sub-chunks.
 161
 162 The lexing of the type and id info is all done lazily, of course, so
 163 the scanning (and subsequent parsing) will be done *only* on the ids the
 164 renamer finds out that it is interested in. The rest will just be junked.
 165 Laziness, you know it makes sense :-)
 166
 167 \begin{code}
 168 data IfaceToken
 169   = ITinterface         -- keywords
 170   | ITusages
 171   | ITversions
 172   | ITexports
 173   | ITinstance_modules
 174   | ITinstances
 175   | ITfixities
 176   | ITdeclarations
 177   | ITpragmas
 178   | ITdata
 179   | ITtype
 180   | ITnewtype
 181   | ITderiving
 182   | ITclass
 183   | ITwhere
 184   | ITinstance
 185   | ITinfixl
 186   | ITinfixr
 187   | ITinfix
 188   | ITforall
 189   | ITbang              -- magic symbols
 190   | ITvbar
 191   | ITdcolon
 192   | ITcomma
 193   | ITdarrow
 194   | ITdotdot
 195   | ITequal
 196   | ITocurly
 197   | ITobrack
 198   | IToparen
 199   | ITrarrow
 200   | ITccurly
 201   | ITcbrack
 202   | ITcparen
 203   | ITsemi
 204   | ITvarid   FAST_STRING
 205   | ITconid   FAST_STRING
 206   | ITvarsym  FAST_STRING
 207   | ITconsym  FAST_STRING
 208   | ITqvarid  (FAST_STRING,FAST_STRING,IfaceFlavour)
 209   | ITqconid  (FAST_STRING,FAST_STRING,IfaceFlavour)
 210   | ITqvarsym (FAST_STRING,FAST_STRING,IfaceFlavour)
 211   | ITqconsym (FAST_STRING,FAST_STRING,IfaceFlavour)
 212
 213   | ITtysig StringBuffer (Maybe StringBuffer)
 214                            -- lazily return the stream of tokens for
 215                            -- the info attached to an id.
 216         -- Stuff for reading unfoldings
 217   | ITarity
 218   | ITunfold Bool               -- True <=> there's an INLINE pragma on this Id
 219   | ITstrict [Demand] | ITbottom
 220   | ITlam | ITbiglam | ITcase | ITprim_case | ITlet | ITletrec | ITin | ITof
 221   | ITcoerce_in | ITcoerce_out | ITatsign
 222   | ITccall (Bool,Bool)         -- (is_casm, may_gc)
 223   | ITscc CostCentre
 224   | ITchar Char | ITstring FAST_STRING
 225   | ITinteger Integer | ITdouble Double
 226   | ITinteger_lit | ITfloat_lit | ITrational_lit | ITaddr_lit | ITlit_lit | ITstring_lit
 227   | ITunknown String            -- Used when the lexer can't make sense of it
 228   | ITeof                               -- end of file token
 229   deriving Text -- debugging
 230
 231 instance Text CostCentre -- cheat!
 232
 233 \end{code}
 234
 235 %************************************************************************
 236 %*                                                                      *
 237 \subsection{The lexical analyser}
 238 %*                                                                      *
 239 %************************************************************************
 240
 241 \begin{code}
 242 lexIface :: (IfaceToken -> IfM a) -> IfM a
 243 lexIface cont buf =
 244  _scc_ "Lexer"
 245 -- if bufferExhausted buf then
 246 --  []
 247 -- else
 248 --  _trace ("Lexer: "++[C# (currentChar# buf)]) $
 249   case currentChar# buf of
 250       -- whitespace and comments, ignore.
 251     ' '#  -> lexIface cont (stepOn buf)
 252     '\t'# -> lexIface cont (stepOn buf)
 253     '\n'# -> \line -> lexIface cont (stepOn buf) (line+1)
 254
 255 -- Numbers and comments
 256     '-'#  ->
 257       case lookAhead# buf 1# of
 258         '-'# -> lex_comment cont (stepOnBy# buf 2#)
 259         c    ->
 260           if isDigit (C# c)
 261           then lex_num cont (negate) (ord# c -# ord# '0'#) (incLexeme (incLexeme buf))
 262           else lex_id cont buf
 263
 264 -- Leave out nested comments for now; danger of finding { and - juxtaposed by mistake?
 265 --    '{' : '-' : cs -> lex_nested_comment 1{-one seen-} cs
 266
 267     '('# ->
 268          case prefixMatch (stepOn buf) "..)" of
 269            Just buf' ->  cont ITdotdot (stepOverLexeme buf')
 270            Nothing ->
 271             case lookAhead# buf 1# of
 272               ','# -> lex_tuple cont Nothing  (stepOnBy# buf 2#)
 273               ')'# -> cont (ITconid SLIT("()")) (stepOnBy# buf 2#)
 274               _    -> cont IToparen (stepOn buf)
 275
 276     '{'# -> cont ITocurly (stepOn buf)
 277     '}'# -> cont ITccurly (stepOn buf)
 278     ')'# -> cont ITcparen (stepOn buf)
 279     '['# ->
 280       case lookAhead# buf 1# of
 281         ']'# -> cont (ITconid SLIT("[]")) (stepOnBy# buf 2#)
 282         _    -> cont ITobrack (stepOn buf)
 283     ']'# -> cont ITcbrack (stepOn buf)
 284     ','# -> cont ITcomma  (stepOn buf)
 285     ':'# -> case lookAhead# buf 1# of
 286               ':'# -> cont ITdcolon (stepOnBy# buf 2#)
 287               _    -> lex_id cont (incLexeme buf)
 288     ';'#  -> cont ITsemi (stepOn buf)
 289     '\"'# -> case untilEndOfString# (stepOn buf) of
 290               buf' ->
 291                   -- the string literal does *not* include the dquotes
 292                 case lexemeToFastString buf' of
 293                  v -> cont (ITstring v) (stepOn (stepOverLexeme buf'))
 294
 295     '\''# -> --
 296              -- untilEndOfChar# extends the current lexeme until
 297              -- it hits a non-escaped single quote. The lexeme of the
 298              -- StringBuffer returned does *not* include the closing quote,
 299              -- hence we augment the lexeme and make sure to add the
 300              -- starting quote, before `read'ing the string.
 301              --
 302              case untilEndOfChar# (stepOn buf) of
 303                buf' -> case reads ('\'':lexemeToString (incLexeme buf')) of
 304                         [  (ch, rest)] -> cont (ITchar ch) (stepOverLexeme (incLexeme buf'))
 305
 306 -- ``thingy'' form for casm
 307     '`'# ->
 308             case lookAhead# buf 1# of
 309               '`'# -> lex_cstring cont (stepOnBy# buf 2#) -- remove the `` and go.
 310               _    -> lex_id cont (incLexeme buf)         -- add ` to lexeme and assume
 311                                                      -- scanning an id of some sort.
 312 -- Keywords
 313     '_'# ->
 314          case lookAhead# buf 1# of
 315            'S'# -> case lookAhead# buf 2# of
 316                     '_'# ->
 317                             lex_demand cont (stepOnUntil (not . isSpace)
 318                                             (stepOnBy# buf 3#)) -- past _S_
 319            's'# -> case prefixMatch (stepOnBy# buf 2#) "cc_" of
 320                      Just buf' -> lex_scc cont (stepOnUntil (not . isSpace) (stepOverLexeme buf'))
 321                      Nothing   -> lex_keyword cont (stepOnBy# buf 1#) -- drop the '_' and assume
 322                                                                  -- it is a keyword.
 323            _    -> lex_keyword cont (stepOn buf)
 324
 325     '\NUL'# ->
 326             if bufferExhausted (stepOn buf) then
 327                cont ITeof buf
 328             else
 329                lex_id cont buf
 330     c ->
 331         if isDigit (C# c) then
 332            lex_num cont (id) (ord# c -# ord# '0'#) (incLexeme buf)
 333         else
 334            lex_id cont buf
 335 --  where
 336 lex_comment cont buf =
 337 --   _trace ("comment: "++[C# (currentChar# buf)]) $
 338    case untilChar# buf '\n'# of {buf' -> lexIface cont (stepOverLexeme buf')}
 339
 340 ------------------
 341 lex_demand cont buf =
 342 -- _trace ("demand: "++[C# (currentChar# buf)]) $
 343  case read_em [] buf of { (ls,buf') -> cont (ITstrict ls) (stepOverLexeme buf')}
 344  where
 345    -- code snatched from Demand.lhs
 346   read_em acc buf =
 347 --   _trace ("read_em: "++[C# (currentChar# buf)]) $
 348    case currentChar# buf of
 349     'L'# -> read_em (WwLazy False : acc) (stepOn buf)
 350     'A'# -> read_em (WwLazy True  : acc) (stepOn buf)
 351     'S'# -> read_em (WwStrict     : acc) (stepOn buf)
 352     'P'# -> read_em (WwPrim       : acc) (stepOn buf)
 353     'E'# -> read_em (WwEnum       : acc) (stepOn buf)
 354     ')'# -> (reverse acc, stepOn buf)
 355     'U'# -> do_unpack DataType True  acc (stepOnBy# buf 2#)
 356     'u'# -> do_unpack DataType False acc (stepOnBy# buf 2#)
 357     'N'# -> do_unpack NewType True  acc (stepOnBy# buf 2#)
 358     'n'# -> do_unpack NewType False acc (stepOnBy# buf 2#)
 359     _    -> (reverse acc, buf)
 360
 361   do_unpack new_or_data wrapper_unpacks acc buf
 362    = case read_em [] buf of
 363       (stuff, rest) -> read_em (WwUnpack new_or_data wrapper_unpacks stuff : acc) rest
 364
 365 ------------------
 366 lex_scc cont buf =
 367 -- _trace ("scc: "++[C# (currentChar# buf)]) $
 368  case currentChar# buf of
 369   '"'# ->
 370       -- YUCK^2
 371      case prefixMatch (stepOn buf) "NO_CC\"" of
 372       Just buf' -> cont (ITscc noCostCentre) (stepOverLexeme buf')
 373       Nothing ->
 374        case prefixMatch (stepOn buf) "CURRENT_CC\"" of
 375         Just buf' -> cont (ITscc useCurrentCostCentre) (stepOverLexeme buf')
 376         Nothing   ->
 377          case prefixMatch (stepOn buf) "OVERHEAD\"" of
 378          Just buf' -> cont (ITscc overheadCostCentre) (stepOverLexeme buf')
 379          Nothing   ->
 380           case prefixMatch (stepOn buf) "DONT_CARE\"" of
 381            Just buf' -> cont (ITscc dontCareCostCentre) (stepOverLexeme buf')
 382            Nothing   ->
 383             case prefixMatch (stepOn buf) "SUBSUMED\"" of
 384              Just buf' -> cont (ITscc subsumedCosts) (stepOverLexeme buf')
 385              Nothing ->
 386               case prefixMatch (stepOn buf) "CAFs_in_...\"" of
 387                Just buf' -> cont (ITscc preludeCafsCostCentre) (stepOverLexeme buf')
 388                Nothing ->
 389                 case prefixMatch (stepOn buf) "CC_CAFs_in_..." of
 390                  Just buf' ->
 391                   case untilChar# (stepOverLexeme buf') '\"'# of
 392                    buf'' -> cont (ITscc (mkAllCafsCC ({-module-}lexemeToFastString buf'') _NIL_)) (stepOn (stepOverLexeme buf''))
 393                  Nothing ->
 394                   case prefixMatch (stepOn buf) "DICTs_in_...\"" of
 395                    Just buf' -> cont (ITscc (preludeDictsCostCentre True)) (stepOverLexeme buf')
 396                    Nothing ->
 397                     case prefixMatch (stepOn buf) "CC_DICTs_in_..." of
 398                      Just buf' ->
 399                       case untilChar# (stepOverLexeme buf') '\"'# of
 400                        buf'' -> cont (ITscc (mkAllDictsCC (lexemeToFastString buf'') _NIL_ True))
 401                                 (stepOn (stepOverLexeme buf''))
 402                      Nothing ->
 403                       let
 404                        match_user_cc buf =
 405                         case untilChar# buf '/'# of
 406                          buf' ->
 407                           let mod_name = lexemeToFastString buf' in
 408 --                        case untilChar# (stepOn (stepOverLexeme buf')) '/'# of
 409 --                         buf'' ->
 410 --                            let grp_name = lexemeToFastString buf'' in
 411                             case untilChar# (stepOn (stepOverLexeme buf')) '\"'# of
 412                              buf'' ->
 413                                -- The label may contain arbitrary characters, so it
 414                                -- may have been escaped etc., hence we `read' it in to get
 415                                -- rid of these meta-chars in the string and then pack it (again.)
 416                                -- ToDo: do the same for module name (single quotes allowed in m-names).
 417                                -- BTW, the code in this module is totally gruesome..
 418                                let upk_label = _UNPK_ (lexemeToFastString buf'') in
 419                                case reads ('"':upk_label++"\"") of
 420                                 ((cc_label,_):_) ->
 421                                     let cc_name = _PK_ cc_label in
 422                                     (mkUserCC cc_name mod_name _NIL_{-grp_name-},
 423                                      stepOn (stepOverLexeme buf''))
 424                                 _ ->
 425                                   trace ("trouble lexing scc label: " ++ upk_label ++ " , ignoring")
 426                                   (mkUserCC _NIL_ mod_name _NIL_{-grp_name-},
 427                                    stepOn (stepOverLexeme buf''))
 428                       in
 429                       case prefixMatch (stepOn buf) "CAF:" of
 430                        Just buf' ->
 431                          case match_user_cc (stepOverLexeme buf') of
 432                           (cc, buf'') -> cont (ITscc (cafifyCC cc)) buf''
 433                        Nothing ->
 434                          case match_user_cc (stepOn buf) of
 435                           (cc, buf'') -> cont (ITscc cc) buf''
 436   c -> cont (ITunknown [C# c]) (stepOn buf)
 437
 438
 439 -----------
 440 lex_num :: (IfaceToken -> IfM a) ->
 441         (Int -> Int) -> Int# -> IfM a
 442 lex_num cont minus acc# buf =
 443 -- _trace ("lex_num: "++[C# (currentChar# buf)]) $
 444  case scanNumLit (I# acc#) buf of
 445      (acc',buf') ->
 446        case currentChar# buf' of
 447          '.'# ->
 448              -- this case is not optimised at all, as the
 449              -- presence of floating point numbers in interface
 450              -- files is not that common. (ToDo)
 451             case expandWhile (isDigit) (incLexeme buf') of
 452               buf'' -> -- points to first non digit char
 453                 case reads (lexemeToString buf'') of
 454                   [(v,_)] -> cont (ITdouble v) (stepOverLexeme buf'')
 455          _ -> cont (ITinteger (fromInt (minus acc'))) (stepOverLexeme buf')
 456
 457 --         case reads (lexemeToString buf') of
 458 --           [(i,_)] -> cont (ITinteger i) (stepOverLexeme buf')
 459
 460 ------------
 461 lex_keyword cont buf =
 462 -- _trace ("lex_keyword: "++[C# (currentChar# buf)]) $
 463  case currentChar# buf of
 464   ':'# -> case lookAhead# buf 1# of
 465             '_'# -> -- a binding, type (and other id-info) follows,
 466                     -- to make the parser ever so slightly, we push
 467                     --
 468                 lex_decl cont (stepOnBy# buf 2#)
 469             v# -> cont (ITunknown (['_',':',C# v#])) (stepOnBy# buf 2#)
 470   _ ->
 471     case expandWhile (is_kwd_char) buf of
 472      buf' ->
 473       let kw = lexemeToFastString buf' in
 474 --    _trace ("kw: "++lexemeToString buf') $
 475       case lookupUFM ifaceKeywordsFM kw of
 476        Nothing -> cont (ITunknown (_UNPK_ kw)) -- (minor) sigh
 477                   (stepOverLexeme buf')
 478        Just xx -> cont xx (stepOverLexeme buf')
 479
 480 lex_decl cont buf =
 481  case doDiscard False buf of -- spin until ;; is found
 482    buf' ->
 483       {- _trace (show (lexemeToString buf')) $ -}
 484       case currentChar# buf' of
 485        '\n'# -> -- newline, no id info.
 486            cont (ITtysig (lexemeToBuffer (decLexeme buf')) Nothing)
 487                 (stepOverLexeme buf')
 488        '\r'# -> -- just to be sure for those Win* boxes..
 489            cont (ITtysig (lexemeToBuffer (decLexeme buf')) Nothing)
 490                 (stepOverLexeme buf')
 491        '\NUL'# ->
 492            cont (ITtysig (lexemeToBuffer (decLexeme buf')) Nothing)
 493                 (stepOverLexeme buf')
 494        c     -> -- run all over the id info
 495          case doDiscard False (stepOverLexeme buf') of -- spin until ;; is found (outside a string!)
 496            buf'' ->
 497                     --_trace ((C# c):show (lexemeToString (decLexeme buf')))  $
 498                     --_trace (show (lexemeToString (decLexeme buf''))) $
 499                     let idinfo =
 500                             if opt_IgnoreIfacePragmas then
 501                                 Nothing
 502                             else
 503                                 Just (lexemeToBuffer (decLexeme buf''))
 504                         --_trace (show is) $
 505                     in
 506                     cont (ITtysig (lexemeToBuffer (decLexeme buf')) idinfo)
 507                         (stepOverLexeme buf'')
 508
 509 -- ToDo: hammer!
 510 is_kwd_char c@(C# c#) =
 511  isAlphanum c || -- OLD: c `elem` "_@/\\"
 512  (case c# of
 513    '_'#  -> True
 514    '@'#  -> True
 515    '/'#  -> True
 516    '\\'# -> True
 517    _     -> False)
 518
 519
 520
 521 -----------
 522 lex_cstring cont buf =
 523 -- _trace ("lex_cstring: "++[C# (currentChar# buf)]) $
 524  case expandUntilMatch buf "\'\'" of
 525    buf' -> cont (ITstring (lexemeToFastString (setCurrentPos# buf' (negateInt# 2#))))
 526            (stepOverLexeme buf')
 527
 528 -----------
 529 lex_tuple cont module_dot buf =
 530 -- _trace ("lex_tuple: "++[C# (currentChar# buf)]) $
 531   go 2 buf
 532   where
 533    go n buf =
 534     case currentChar# buf of
 535       ','# -> go (n+1) (stepOn buf)
 536       ')'# -> end_lex_id cont module_dot (ITconid (mkTupNameStr n)) (stepOn buf)
 537       _    -> cont (ITunknown ("tuple " ++ show n)) buf
 538
 539 -- Similarly ' itself is ok inside an identifier, but not at the start
 540
 541 id_arr :: _ByteArray Int
 542 id_arr =
 543  unsafePerformPrimIO (
 544   newCharArray (0,255) `thenPrimIO` \ barr ->
 545   let
 546    loop 256# = returnPrimIO ()
 547    loop i# =
 548     if isAlphanum (C# (chr# i#)) || is_sym (chr# i#) then
 549        writeCharArray barr (I# i#) '\1' `seqPrimIO`
 550        loop (i# +# 1#)
 551     else
 552        writeCharArray barr (I# i#) '\0' `seqPrimIO`
 553        loop (i# +# 1#)
 554   in
 555   loop 0#                    `seqPrimIO`
 556   unsafeFreezeByteArray barr)
 557
 558 is_id_char (C# c#) =
 559  let
 560   _ByteArray _ arr# = id_arr
 561  in
 562  case ord# (indexCharArray# arr# (ord# c#)) of
 563   0# -> False
 564   1# -> True
 565
 566 --is_id_char c@(C# c#)  = isAlphanum c || is_sym c#
 567
 568 is_sym c#=
 569  case c# of {
 570    ':'# -> True; '_'#  -> True; '\''# -> True; '!'# -> True;
 571    '#'# -> True; '$'#  -> True; ':'#  -> True; '%'# -> True;
 572    '&'# -> True; '*'#  -> True; '+'#  -> True; '.'# -> True;
 573    '/'# -> True; '<'#  -> True; '='#  -> True; '>'# -> True;
 574    '?'# -> True; '\\'# -> True; '^'#  -> True; '|'# -> True;
 575    '-'# -> True; '~'#  -> True; '@'#  -> True; _    -> False }
 576
 577 --isAlphanum c || c `elem` ":_'!#$%&*+./<=>?@\\^|-~" -- ToDo: add ISOgraphic
 578
 579
 580 mod_arr :: _ByteArray Int
 581 mod_arr =
 582  unsafePerformPrimIO (
 583   newCharArray (0,255) `thenPrimIO` \ barr ->
 584   let
 585    loop 256# = returnPrimIO ()
 586    loop i# =
 587     if isAlphanum (C# (chr# i#)) || i# ==# (ord# '_'#) || i# ==# (ord# '\''#) then
 588        writeCharArray barr (I# i#) '\1' `seqPrimIO`
 589        loop (i# +# 1#)
 590     else
 591        writeCharArray barr (I# i#) '\0' `seqPrimIO`
 592        loop (i# +# 1#)
 593   in
 594   loop 0#                    `seqPrimIO`
 595   unsafeFreezeByteArray barr)
 596
 597
 598 is_mod_char (C# c#) =
 599  let
 600   _ByteArray _ arr# = mod_arr
 601  in
 602  case ord# (indexCharArray# arr# (ord# c#)) of
 603   0# -> False
 604   1# -> True
 605
 606 --isAlphanum c || c == '_' || c== '\'' --`elem` "_'"
 607
 608 {-
 609 lex_id cs =
 610  case _scc_ "lex_id.span" my_span' (is_mod_char) cs of
 611    (xs, len, cs') ->
 612     case cs' of
 613      [] -> case xs of
 614             [] -> lex_id2 Nothing cs
 615             _  -> lex_id3 Nothing len xs cs
 616
 617      '.':cs'' ->
 618         case xs of
 619           [] -> lex_id2 Nothing cs
 620           _  ->
 621            let
 622             pk_str = _PK_ (xs::String)
 623             len = lengthPS pk_str
 624            in
 625            if len==len+1 then
 626               error "Well, I never!"
 627            else
 628               lex_id2 (Just pk_str) cs''
 629      _ -> case xs of
 630             [] -> lex_id2 Nothing cs
 631             _  -> lex_id3 Nothing len xs cs'
 632
 633 -}
 634
 635 lex_id cont buf =
 636 -- _trace ("lex_id: "++[C# (currentChar# buf)]) $
 637  case expandWhile (is_mod_char) buf of
 638    buf' ->
 639     case currentChar# buf' of
 640      '.'# -> munch buf' HiFile
 641      '!'# -> munch buf' HiBootFile
 642      _    -> lex_id2 cont Nothing buf'
 643    where
 644     munch buf' hif =
 645         if not (emptyLexeme buf') then
 646 --         _trace ("lex_id: "++(C# (currentChar# (stepOverLexeme buf'))):show (lexemeToFastString buf')) $
 647            case lexemeToFastString buf' of
 648              l@(FastString u# l# ba#) -> lex_id2 cont (Just (FastString u# l# ba#, hif))
 649                                                  (stepOn (stepOverLexeme buf'))
 650         else
 651            lex_id2 cont Nothing buf'
 652
 653
 654 -- Dealt with the Module.part
 655 lex_id2 cont module_dot buf =
 656 -- _trace ("lex_id2: "++[C# (currentChar# buf)]) $
 657  case currentChar# buf of
 658   '['# ->
 659     case lookAhead# buf 1# of
 660      ']'# -> end_lex_id cont module_dot (ITconid SLIT("[]")) (stepOnBy# buf 2#)
 661      _    -> lex_id3 cont module_dot buf
 662   '('# ->
 663     case lookAhead# buf 1# of
 664      ')'# -> end_lex_id cont module_dot (ITconid SLIT("()")) (stepOnBy# buf 2#)
 665      ','# -> lex_tuple cont module_dot (stepOnBy# buf 2#)
 666      _    -> lex_id3 cont module_dot buf
 667   ':'# -> lex_id3 cont module_dot (incLexeme buf)
 668   _    -> lex_id3 cont module_dot buf
 669
 670
 671
 672 -- Dealt with [], (), : special cases
 673
 674 lex_id3 cont module_dot buf =
 675 -- _trace ("lex_id3: "++[C# (currentChar# buf)]) $
 676  case expandWhile (is_id_char) buf of
 677   buf' ->
 678     case module_dot of
 679      Just _ ->
 680        end_lex_id cont module_dot (mk_var_token lexeme) (stepOverLexeme buf')
 681      Nothing ->
 682        case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of
 683          Just kwd_token -> cont kwd_token new_buf
 684          Nothing        -> cont (mk_var_token lexeme) new_buf
 685     where
 686      lexeme  = lexemeToFastString buf'
 687      new_buf = stepOverLexeme buf'
 688
 689
 690 {- OLD:
 691 lex_id2 module_dot [] ('[': ']': cs) = end_lex_id module_dot (ITconid SLIT("[]")) cs
 692 lex_id2 module_dot [] ('(': ')': cs) = end_lex_id module_dot (ITconid SLIT("()")) cs
 693 lex_id2 module_dot [] ('(': ',': cs) = lex_tuple module_dot cs
 694 lex_id2 module_dot [] (':' : cs)     = lex_id3 module_dot [':'] cs
 695 lex_id2 module_dot xs cs             = lex_id3 module_dot xs cs
 696 -}
 697
 698 -- Dealt with [], (), : special cases
 699
 700 {-
 701 lex_id3 module_dot len_xs xs cs =
 702  case my_span' (is_id_char) cs of
 703    (xs1,len_xs1,rest) ->
 704     case module_dot of
 705      Just m  -> end_lex_id (Just m) (mk_var_token rxs) rest --OLD:_PK_ (reverse xs))) rest
 706      Nothing ->
 707       case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM rxs of
 708        Just kwd_token -> kwd_token          : lexIface rest
 709        other          -> token : lexIface cs end_lex_id Nothing (mk_var_token rxs) rest
 710     where
 711      rxs = packNChars (len_xs+len_xs1) (xs++xs1) -- OLD: _PK_ (reverse xs)
 712 -}
 713 mk_var_token pk_str =
 714      let
 715       f = _HEAD_ pk_str
 716      in
 717      --
 718      -- These tests assume a non- ISO-8859-1 impl of isUpper&isLower,
 719      -- remove the second half of disjunction when using a 1.3 prelude.
 720      --
 721      if      isUpper f    then ITconid pk_str
 722      else if isLower f    then ITvarid pk_str
 723      else if f == ':'     then ITconsym pk_str
 724      else if isLowerISO f then ITvarid pk_str
 725      else if isUpperISO f then ITconid pk_str
 726      else ITvarsym pk_str
 727
 728 {-
 729     mk_var_token xs@(f:_) | isUpper f || isUpperISO f = ITconid n
 730                           | f == ':'              = ITconsym n
 731                           | isAlpha f             = ITvarid n
 732                           | otherwise             = ITvarsym n
 733                 where
 734                       n = _PK_ xs
 735 -}
 736
 737 end_lex_id cont Nothing token buf  = cont token buf
 738 end_lex_id cont (Just (m,hif)) token buf =
 739  case token of
 740    ITconid n  -> cont (ITqconid  (m,n,hif))         buf
 741    ITvarid n  -> cont (ITqvarid  (m,n,hif))         buf
 742    ITconsym n -> cont (ITqconsym (m,n,hif))         buf
 743    ITvarsym n -> cont (ITqvarsym (m,n,hif))         buf
 744    ITbang     -> cont (ITqvarsym (m,SLIT("!"),hif)) buf
 745    _          -> cont (ITunknown (show token))      buf
 746
 747 ------------
 748 ifaceKeywordsFM :: UniqFM IfaceToken
 749 ifaceKeywordsFM = listToUFM $
 750       map (\ (x,y) -> (_PK_ x,y))
 751        [("/\\_",                ITbiglam)
 752        ,("@_",                  ITatsign)
 753        ,("letrec_",             ITletrec)
 754        ,("interface_",          ITinterface)
 755        ,("usages_",             ITusages)
 756        ,("versions_",           ITversions)
 757        ,("exports_",            ITexports)
 758        ,("instance_modules_",   ITinstance_modules)
 759        ,("instances_",          ITinstances)
 760        ,("fixities_",           ITfixities)
 761        ,("declarations_",       ITdeclarations)
 762        ,("pragmas_",            ITpragmas)
 763        ,("forall_",             ITforall)
 764        ,("U_",                  ITunfold False)
 765        ,("U!_",                 ITunfold True)
 766        ,("A_",                  ITarity)
 767        ,("coerce_in_",          ITcoerce_in)
 768        ,("coerce_out_",         ITcoerce_out)
 769        ,("bot_",                ITbottom)
 770        ,("integer_",            ITinteger_lit)
 771        ,("rational_",           ITrational_lit)
 772        ,("addr_",               ITaddr_lit)
 773        ,("float_",              ITfloat_lit)
 774        ,("string_",             ITstring_lit)
 775        ,("litlit_",             ITlit_lit)
 776        ,("ccall_",              ITccall (False, False))
 777        ,("ccall_GC_",           ITccall (False, True))
 778        ,("casm_",               ITccall (True,  False))
 779        ,("casm_GC_",            ITccall (True,  True))
 780        ]
 781
 782 haskellKeywordsFM = listToUFM $
 783       map (\ (x,y) -> (_PK_ x,y))
 784       [ ("data",                ITdata)
 785        ,("type",                ITtype)
 786        ,("newtype",             ITnewtype)
 787        ,("class",               ITclass)
 788        ,("where",               ITwhere)
 789        ,("instance",            ITinstance)
 790        ,("infixl",              ITinfixl)
 791        ,("infixr",              ITinfixr)
 792        ,("infix",               ITinfix)
 793        ,("case",                ITcase)
 794        ,("case#",               ITprim_case)
 795        ,("of",                  ITof)
 796        ,("in",                  ITin)
 797        ,("let",                 ITlet)
 798        ,("deriving",            ITderiving)
 799
 800        ,("->",                  ITrarrow)
 801        ,("\\",                  ITlam)
 802        ,("|",                   ITvbar)
 803        ,("!",                   ITbang)
 804        ,("=>",                  ITdarrow)
 805        ,("=",                   ITequal)
 806        ]
 807
 808
 809 -- doDiscard rips along really fast looking for a double semicolon,
 810 -- indicating the end of the pragma we're skipping
 811 doDiscard inStr buf =
 812 -- _trace (show (C# (currentChar# buf))) $
 813  case currentChar# buf of
 814    ';'# ->
 815      if not inStr then
 816        case lookAhead# buf 1# of
 817         ';'# -> incLexeme (incLexeme buf)
 818         _    -> doDiscard inStr (incLexeme buf)
 819      else
 820        doDiscard inStr (incLexeme buf)
 821    '"'# ->
 822        let
 823         odd_slashes buf flg i# =
 824           case lookAhead# buf i# of
 825            '\\'# -> odd_slashes buf (not flg) (i# -# 1#)
 826            _     -> flg
 827        in
 828        case lookAhead# buf (negateInt# 1#) of --backwards, actually
 829          '\\'# -> -- escaping something..
 830            if odd_slashes buf True (negateInt# 2#) then
 831                -- odd number of slashes, " is escaped.
 832               doDiscard inStr (incLexeme buf)
 833            else
 834                -- even number of slashes, \ is escaped.
 835               doDiscard (not inStr) (incLexeme buf)
 836          _ -> case inStr of -- forced to avoid build-up
 837                True  -> doDiscard False (incLexeme buf)
 838                False -> doDiscard True  (incLexeme buf)
 839    _ -> doDiscard inStr (incLexeme buf)
 840
 841 \end{code}
 842
 843 begin{code}
 844 my_span :: (a -> Bool) -> [a] -> ([a],[a])
 845 my_span p xs = go [] xs
 846   where
 847     go so_far (x:xs') | p x = go (x:so_far) xs'
 848     go so_far xs            = (reverse so_far, xs)
 849
 850 my_span' :: (a -> Bool) -> [a] -> ([a],Int,[a])
 851 my_span' p xs = go [] 0 xs
 852   where
 853     go so_far n (x:xs') | p x = go (x:so_far) (n+1) xs'
 854     go so_far n xs            = (reverse so_far,n, xs)
 855 end{code}
 856
 857
 858 %************************************************************************
 859 %*                                                                      *
 860 \subsection{Other utility functions
 861 %*                                                                      *
 862 %************************************************************************
 863
 864 \begin{code}
 865 type IfM a = StringBuffer -> Int -> MaybeErr a Error
 866
 867 returnIf   :: a -> IfM a
 868 returnIf a s l = Succeeded a
 869
 870 thenIf     :: IfM a -> (a -> IfM b) -> IfM b
 871 m `thenIf` k = \s l ->
 872         case m s l of
 873                 Succeeded a -> k a s l
 874                 Failed err  -> Failed err
 875
 876 happyError :: IfM a
 877 happyError s l = Failed (ifaceParseErr l ([]::[IfaceToken]){-Todo-})
 878
 879 -----------------------------------------------------------------
 880
 881 ifaceParseErr l toks sty
 882   = hsep [ptext SLIT("Interface-file parse error: line"), int l, ptext SLIT("toks="), text (show (take 10 toks))]
 883 \end{code}