ghc/compiler/reader/Lex.lhs

   1 %
   2 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1996
   3 %
   4 \section[Lexical analysis]{Lexical analysis}
   5
   6 \begin{code}
   7 #include "HsVersions.h"
   8
   9 module Lex (
  10
  11         isLexCon, isLexVar, isLexId, isLexSym,
  12         isLexConId, isLexConSym, isLexVarId, isLexVarSym,
  13         mkTupNameStr, ifaceParseErr,
  14
  15         -- Monad for parser
  16         IfaceToken(..), lexIface, SYN_IE(IfM), thenIf, returnIf, happyError,
  17         StringBuffer
  18
  19     ) where
  20
  21
  22 IMPORT_1_3(Char(isDigit, isAlpha, isAlphanum, isUpper,isLower, isSpace, ord))
  23
  24 #if defined(__GLASGOW_HASKELL__) && __GLASGOW_HASKELL__ <= 201
  25 IMPORT_DELOOPER(Ubiq)
  26 IMPORT_DELOOPER(IdLoop)    -- get the CostCentre type&constructors from here
  27 #else
  28 import {-# SOURCE #-} CostCentre
  29 # if __GLASGOW_HASKELL__ == 202
  30 import PrelBase ( Char(..) )
  31 # endif
  32 # if __GLASGOW_HASKELL__ >= 209
  33 import Addr ( Addr(..) )
  34 import ST   ( runST )
  35 # endif
  36 #endif
  37
  38 import CmdLineOpts      ( opt_IgnoreIfacePragmas )
  39 import Demand           ( Demand(..) {- instance Read -} )
  40 import UniqFM           ( UniqFM, listToUFM, lookupUFM)
  41 import BasicTypes       ( NewOrData(..), IfaceFlavour(..) )
  42
  43 #if __GLASGOW_HASKELL__ >= 202
  44 import Maybes           ( MaybeErr(..) )
  45 #else
  46 import Maybes           ( Maybe(..), MaybeErr(..) )
  47 #endif
  48 import Pretty
  49
  50
  51
  52 import ErrUtils         ( Error(..) )
  53 import Outputable       ( Outputable(..), PprStyle(..) )
  54 import Util             ( nOfThem, panic )
  55
  56 import FastString
  57 import StringBuffer
  58
  59 #if __GLASGOW_HASKELL__ <= 201
  60 import PreludeGlaST
  61 #else
  62 import GlaExts
  63 #endif
  64 \end{code}
  65
  66 %************************************************************************
  67 %*                                                                      *
  68 \subsection{Lexical categories}
  69 %*                                                                      *
  70 %************************************************************************
  71
  72 These functions test strings to see if they fit the lexical categories
  73 defined in the Haskell report.  Normally applied as in e.g. @isCon
  74 (getLocalName foo)@.
  75
  76 \begin{code}
  77 isLexCon, isLexVar, isLexId, isLexSym, isLexConId, isLexConSym,
  78  isLexVarId, isLexVarSym  :: FAST_STRING -> Bool
  79
  80 isLexCon cs = isLexConId  cs || isLexConSym cs
  81 isLexVar cs = isLexVarId  cs || isLexVarSym cs
  82
  83 isLexId  cs = isLexConId  cs || isLexVarId  cs
  84 isLexSym cs = isLexConSym cs || isLexVarSym cs
  85
  86 -------------
  87
  88 isLexConId cs
  89   | _NULL_ cs        = False
  90   | cs == SLIT("[]") = True
  91   | c  == '('        = True     -- (), (,), (,,), ...
  92   | otherwise        = isUpper c || isUpperISO c
  93   where
  94     c = _HEAD_ cs
  95
  96 isLexVarId cs
  97   | _NULL_ cs    = False
  98   | otherwise    = isLower c || isLowerISO c
  99   where
 100     c = _HEAD_ cs
 101
 102 isLexConSym cs
 103   | _NULL_ cs   = False
 104   | otherwise   = c  == ':'
 105                || cs == SLIT("->")
 106   where
 107     c = _HEAD_ cs
 108
 109 isLexVarSym cs
 110   | _NULL_ cs = False
 111   | otherwise = isSymbolASCII c
 112              || isSymbolISO c
 113   where
 114     c = _HEAD_ cs
 115
 116 -------------
 117 isSymbolASCII c = c `elem` "!#$%&*+./<=>?@\\^|~-"
 118 isSymbolISO   c = ord c `elem` (0xd7 : 0xf7 : [0xa1 .. 0xbf])
 119 isUpperISO    (C# c#) = c# `geChar#` '\xc0'# && c# `leChar#` '\xde'# && c# `neChar#` '\xd7'#
 120 --0xc0 <= oc && oc <= 0xde && oc /= 0xd7 where oc = ord c
 121 isLowerISO    (C# c#) = c# `geChar#` '\xdf'# && c# `leChar#` '\xff'# && c# `neChar#` '\xf7'#
 122 --0xdf <= oc && oc <= 0xff && oc /= 0xf7 where oc = ord c
 123 \end{code}
 124
 125
 126 %************************************************************************
 127 %*                                                                      *
 128 \subsection{Tuple strings -- ugh!}
 129 %*                                                                      *
 130 %************************************************************************
 131
 132 \begin{code}
 133 mkTupNameStr 0 = SLIT("()")
 134 mkTupNameStr 1 = panic "Name.mkTupNameStr: 1 ???"
 135 mkTupNameStr 2 = _PK_ "(,)"   -- not strictly necessary
 136 mkTupNameStr 3 = _PK_ "(,,)"  -- ditto
 137 mkTupNameStr 4 = _PK_ "(,,,)" -- ditto
 138 mkTupNameStr n = _PK_ ("(" ++ nOfThem (n-1) ',' ++ ")")
 139 \end{code}
 140
 141
 142
 143 %************************************************************************
 144 %*                                                                      *
 145 \subsection{Data types}
 146 %*                                                                      *
 147 %************************************************************************
 148
 149 The token data type, fairly un-interesting except from two constructors,
 150 @ITidinfo@ and @ITtype@, which are used to lazily lex id info (arity,
 151 strictness, unfolding etc) and types for id decls.
 152
 153 The Idea/Observation here is that the renamer needs to scan through
 154 all of an interface file before it can continue. But only a fraction
 155 of the information contained in the file turns out to be useful, so
 156 delaying as much as possible of the scanning and parsing of an
 157 interface file Makes Sense (Heap profiles of the compiler
 158 show at a reduction in heap usage by at least a factor of two,
 159 post-renamer).
 160
 161 Hence, the interface file lexer spots when value declarations are
 162 being scanned and return the @ITidinfo@ and @ITtype@ constructors
 163 for the type and any other id info for that binding (unfolding, strictness
 164 etc). These constructors are applied to the result of lexing these sub-chunks.
 165
 166 The lexing of the type and id info is all done lazily, of course, so
 167 the scanning (and subsequent parsing) will be done *only* on the ids the
 168 renamer finds out that it is interested in. The rest will just be junked.
 169 Laziness, you know it makes sense :-)
 170
 171 \begin{code}
 172 data IfaceToken
 173   = ITinterface         -- keywords
 174   | ITusages
 175   | ITversions
 176   | ITexports
 177   | ITinstance_modules
 178   | ITinstances
 179   | ITfixities
 180   | ITdeclarations
 181   | ITpragmas
 182   | ITdata
 183   | ITtype
 184   | ITnewtype
 185   | ITderiving
 186   | ITclass
 187   | ITwhere
 188   | ITinstance
 189   | ITinfixl
 190   | ITinfixr
 191   | ITinfix
 192   | ITforall
 193   | ITbang              -- magic symbols
 194   | ITvbar
 195   | ITdcolon
 196   | ITcomma
 197   | ITdarrow
 198   | ITdotdot
 199   | ITequal
 200   | ITocurly
 201   | ITobrack
 202   | IToparen
 203   | ITrarrow
 204   | ITccurly
 205   | ITcbrack
 206   | ITcparen
 207   | ITsemi
 208   | ITvarid   FAST_STRING
 209   | ITconid   FAST_STRING
 210   | ITvarsym  FAST_STRING
 211   | ITconsym  FAST_STRING
 212   | ITqvarid  (FAST_STRING,FAST_STRING,IfaceFlavour)
 213   | ITqconid  (FAST_STRING,FAST_STRING,IfaceFlavour)
 214   | ITqvarsym (FAST_STRING,FAST_STRING,IfaceFlavour)
 215   | ITqconsym (FAST_STRING,FAST_STRING,IfaceFlavour)
 216
 217   | ITtysig StringBuffer (Maybe StringBuffer)
 218                            -- lazily return the stream of tokens for
 219                            -- the info attached to an id.
 220         -- Stuff for reading unfoldings
 221   | ITarity
 222   | ITunfold Bool               -- True <=> there's an INLINE pragma on this Id
 223   | ITstrict [Demand] | ITbottom
 224   | ITlam | ITbiglam | ITcase | ITprim_case | ITlet | ITletrec | ITin | ITof
 225   | ITcoerce_in | ITcoerce_out | ITatsign
 226   | ITccall (Bool,Bool)         -- (is_casm, may_gc)
 227   | ITscc CostCentre
 228   | ITchar Char | ITstring FAST_STRING
 229   | ITinteger Integer | ITdouble Double
 230   | ITinteger_lit | ITfloat_lit | ITrational_lit | ITaddr_lit | ITlit_lit | ITstring_lit
 231   | ITunknown String            -- Used when the lexer can't make sense of it
 232   | ITeof                               -- end of file token
 233   deriving Text -- debugging
 234
 235 instance Text CostCentre -- cheat!
 236
 237 \end{code}
 238
 239 %************************************************************************
 240 %*                                                                      *
 241 \subsection{The lexical analyser}
 242 %*                                                                      *
 243 %************************************************************************
 244
 245 \begin{code}
 246 lexIface :: (IfaceToken -> IfM a) -> IfM a
 247 lexIface cont buf =
 248  _scc_ "Lexer"
 249 -- if bufferExhausted buf then
 250 --  []
 251 -- else
 252 --  _trace ("Lexer: "++[C# (currentChar# buf)]) $
 253   case currentChar# buf of
 254       -- whitespace and comments, ignore.
 255     ' '#  -> lexIface cont (stepOn buf)
 256     '\t'# -> lexIface cont (stepOn buf)
 257     '\n'# -> \line -> lexIface cont (stepOn buf) (line+1)
 258
 259 -- Numbers and comments
 260     '-'#  ->
 261       case lookAhead# buf 1# of
 262         '-'# -> lex_comment cont (stepOnBy# buf 2#)
 263         c    ->
 264           if isDigit (C# c)
 265           then lex_num cont (negate) (ord# c -# ord# '0'#) (incLexeme (incLexeme buf))
 266           else lex_id cont buf
 267
 268 -- Leave out nested comments for now; danger of finding { and - juxtaposed by mistake?
 269 --    '{' : '-' : cs -> lex_nested_comment 1{-one seen-} cs
 270
 271     '('# ->
 272          case prefixMatch (stepOn buf) "..)" of
 273            Just buf' ->  cont ITdotdot (stepOverLexeme buf')
 274            Nothing ->
 275             case lookAhead# buf 1# of
 276               ','# -> lex_tuple cont Nothing  (stepOnBy# buf 2#)
 277               ')'# -> cont (ITconid SLIT("()")) (stepOnBy# buf 2#)
 278               _    -> cont IToparen (stepOn buf)
 279
 280     '{'# -> cont ITocurly (stepOn buf)
 281     '}'# -> cont ITccurly (stepOn buf)
 282     ')'# -> cont ITcparen (stepOn buf)
 283     '['# ->
 284       case lookAhead# buf 1# of
 285         ']'# -> cont (ITconid SLIT("[]")) (stepOnBy# buf 2#)
 286         _    -> cont ITobrack (stepOn buf)
 287     ']'# -> cont ITcbrack (stepOn buf)
 288     ','# -> cont ITcomma  (stepOn buf)
 289     ':'# -> case lookAhead# buf 1# of
 290               ':'# -> cont ITdcolon (stepOnBy# buf 2#)
 291               _    -> lex_id cont (incLexeme buf)
 292     ';'#  -> cont ITsemi (stepOn buf)
 293     '\"'# -> case untilEndOfString# (stepOn buf) of
 294               buf' ->
 295                   -- the string literal does *not* include the dquotes
 296                 case lexemeToFastString buf' of
 297                  v -> cont (ITstring v) (stepOn (stepOverLexeme buf'))
 298
 299     '\''# -> --
 300              -- untilEndOfChar# extends the current lexeme until
 301              -- it hits a non-escaped single quote. The lexeme of the
 302              -- StringBuffer returned does *not* include the closing quote,
 303              -- hence we augment the lexeme and make sure to add the
 304              -- starting quote, before `read'ing the string.
 305              --
 306              case untilEndOfChar# (stepOn buf) of
 307                buf' -> case reads ('\'':lexemeToString (incLexeme buf')) of
 308                         [  (ch, rest)] -> cont (ITchar ch) (stepOverLexeme (incLexeme buf'))
 309
 310 -- ``thingy'' form for casm
 311     '`'# ->
 312             case lookAhead# buf 1# of
 313               '`'# -> lex_cstring cont (stepOnBy# buf 2#) -- remove the `` and go.
 314               _    -> lex_id cont (incLexeme buf)         -- add ` to lexeme and assume
 315                                                      -- scanning an id of some sort.
 316 -- Keywords
 317     '_'# ->
 318          case lookAhead# buf 1# of
 319            'S'# -> case lookAhead# buf 2# of
 320                     '_'# ->
 321                             lex_demand cont (stepOnUntil (not . isSpace)
 322                                             (stepOnBy# buf 3#)) -- past _S_
 323            's'# -> case prefixMatch (stepOnBy# buf 2#) "cc_" of
 324                      Just buf' -> lex_scc cont (stepOnUntil (not . isSpace) (stepOverLexeme buf'))
 325                      Nothing   -> lex_keyword cont (stepOnBy# buf 1#) -- drop the '_' and assume
 326                                                                  -- it is a keyword.
 327            _    -> lex_keyword cont (stepOn buf)
 328
 329     '\NUL'# ->
 330             if bufferExhausted (stepOn buf) then
 331                cont ITeof buf
 332             else
 333                lex_id cont buf
 334     c ->
 335         if isDigit (C# c) then
 336            lex_num cont (id) (ord# c -# ord# '0'#) (incLexeme buf)
 337         else
 338            lex_id cont buf
 339 --  where
 340 lex_comment cont buf =
 341 --   _trace ("comment: "++[C# (currentChar# buf)]) $
 342    case untilChar# buf '\n'# of {buf' -> lexIface cont (stepOverLexeme buf')}
 343
 344 ------------------
 345 lex_demand cont buf =
 346 -- _trace ("demand: "++[C# (currentChar# buf)]) $
 347  case read_em [] buf of { (ls,buf') -> cont (ITstrict ls) (stepOverLexeme buf')}
 348  where
 349    -- code snatched from Demand.lhs
 350   read_em acc buf =
 351 --   _trace ("read_em: "++[C# (currentChar# buf)]) $
 352    case currentChar# buf of
 353     'L'# -> read_em (WwLazy False : acc) (stepOn buf)
 354     'A'# -> read_em (WwLazy True  : acc) (stepOn buf)
 355     'S'# -> read_em (WwStrict     : acc) (stepOn buf)
 356     'P'# -> read_em (WwPrim       : acc) (stepOn buf)
 357     'E'# -> read_em (WwEnum       : acc) (stepOn buf)
 358     ')'# -> (reverse acc, stepOn buf)
 359     'U'# -> do_unpack DataType True  acc (stepOnBy# buf 2#)
 360     'u'# -> do_unpack DataType False acc (stepOnBy# buf 2#)
 361     'N'# -> do_unpack NewType True  acc (stepOnBy# buf 2#)
 362     'n'# -> do_unpack NewType False acc (stepOnBy# buf 2#)
 363     _    -> (reverse acc, buf)
 364
 365   do_unpack new_or_data wrapper_unpacks acc buf
 366    = case read_em [] buf of
 367       (stuff, rest) -> read_em (WwUnpack new_or_data wrapper_unpacks stuff : acc) rest
 368
 369 ------------------
 370 lex_scc cont buf =
 371 -- _trace ("scc: "++[C# (currentChar# buf)]) $
 372  case currentChar# buf of
 373   '"'# ->
 374       -- YUCK^2
 375      case prefixMatch (stepOn buf) "NO_CC\"" of
 376       Just buf' -> cont (ITscc noCostCentre) (stepOverLexeme buf')
 377       Nothing ->
 378        case prefixMatch (stepOn buf) "CURRENT_CC\"" of
 379         Just buf' -> cont (ITscc useCurrentCostCentre) (stepOverLexeme buf')
 380         Nothing   ->
 381          case prefixMatch (stepOn buf) "OVERHEAD\"" of
 382          Just buf' -> cont (ITscc overheadCostCentre) (stepOverLexeme buf')
 383          Nothing   ->
 384           case prefixMatch (stepOn buf) "DONT_CARE\"" of
 385            Just buf' -> cont (ITscc dontCareCostCentre) (stepOverLexeme buf')
 386            Nothing   ->
 387             case prefixMatch (stepOn buf) "SUBSUMED\"" of
 388              Just buf' -> cont (ITscc subsumedCosts) (stepOverLexeme buf')
 389              Nothing ->
 390               case prefixMatch (stepOn buf) "CAFs_in_...\"" of
 391                Just buf' -> cont (ITscc preludeCafsCostCentre) (stepOverLexeme buf')
 392                Nothing ->
 393                 case prefixMatch (stepOn buf) "CC_CAFs_in_..." of
 394                  Just buf' ->
 395                   case untilChar# (stepOverLexeme buf') '\"'# of
 396                    buf'' -> cont (ITscc (mkAllCafsCC ({-module-}lexemeToFastString buf'') _NIL_)) (stepOn (stepOverLexeme buf''))
 397                  Nothing ->
 398                   case prefixMatch (stepOn buf) "DICTs_in_...\"" of
 399                    Just buf' -> cont (ITscc (preludeDictsCostCentre True)) (stepOverLexeme buf')
 400                    Nothing ->
 401                     case prefixMatch (stepOn buf) "CC_DICTs_in_..." of
 402                      Just buf' ->
 403                       case untilChar# (stepOverLexeme buf') '\"'# of
 404                        buf'' -> cont (ITscc (mkAllDictsCC (lexemeToFastString buf'') _NIL_ True))
 405                                 (stepOn (stepOverLexeme buf''))
 406                      Nothing ->
 407                       let
 408                        match_user_cc buf =
 409                         case untilChar# buf '/'# of
 410                          buf' ->
 411                           let mod_name = lexemeToFastString buf' in
 412 --                        case untilChar# (stepOn (stepOverLexeme buf')) '/'# of
 413 --                         buf'' ->
 414 --                            let grp_name = lexemeToFastString buf'' in
 415                             case untilChar# (stepOn (stepOverLexeme buf')) '\"'# of
 416                              buf'' ->
 417                                -- The label may contain arbitrary characters, so it
 418                                -- may have been escaped etc., hence we `read' it in to get
 419                                -- rid of these meta-chars in the string and then pack it (again.)
 420                                -- ToDo: do the same for module name (single quotes allowed in m-names).
 421                                -- BTW, the code in this module is totally gruesome..
 422                                let upk_label = _UNPK_ (lexemeToFastString buf'') in
 423                                case reads ('"':upk_label++"\"") of
 424                                 ((cc_label,_):_) ->
 425                                     let cc_name = _PK_ cc_label in
 426                                     (mkUserCC cc_name mod_name _NIL_{-grp_name-},
 427                                      stepOn (stepOverLexeme buf''))
 428                                 _ ->
 429                                   trace ("trouble lexing scc label: " ++ upk_label ++ " , ignoring")
 430                                   (mkUserCC _NIL_ mod_name _NIL_{-grp_name-},
 431                                    stepOn (stepOverLexeme buf''))
 432                       in
 433                       case prefixMatch (stepOn buf) "CAF:" of
 434                        Just buf' ->
 435                          case match_user_cc (stepOverLexeme buf') of
 436                           (cc, buf'') -> cont (ITscc (cafifyCC cc)) buf''
 437                        Nothing ->
 438                          case match_user_cc (stepOn buf) of
 439                           (cc, buf'') -> cont (ITscc cc) buf''
 440   c -> cont (ITunknown [C# c]) (stepOn buf)
 441
 442
 443 -----------
 444 lex_num :: (IfaceToken -> IfM a) ->
 445         (Int -> Int) -> Int# -> IfM a
 446 lex_num cont minus acc# buf =
 447 -- _trace ("lex_num: "++[C# (currentChar# buf)]) $
 448  case scanNumLit (I# acc#) buf of
 449      (acc',buf') ->
 450        case currentChar# buf' of
 451          '.'# ->
 452              -- this case is not optimised at all, as the
 453              -- presence of floating point numbers in interface
 454              -- files is not that common. (ToDo)
 455             case expandWhile (isDigit) (incLexeme buf') of
 456               buf'' -> -- points to first non digit char
 457                 case reads (lexemeToString buf'') of
 458                   [(v,_)] -> cont (ITdouble v) (stepOverLexeme buf'')
 459          _ -> cont (ITinteger (fromInt (minus acc'))) (stepOverLexeme buf')
 460
 461 --         case reads (lexemeToString buf') of
 462 --           [(i,_)] -> cont (ITinteger i) (stepOverLexeme buf')
 463
 464 ------------
 465 lex_keyword cont buf =
 466 -- _trace ("lex_keyword: "++[C# (currentChar# buf)]) $
 467  case currentChar# buf of
 468   ':'# -> case lookAhead# buf 1# of
 469             '_'# -> -- a binding, type (and other id-info) follows,
 470                     -- to make the parser ever so slightly, we push
 471                     --
 472                 lex_decl cont (stepOnBy# buf 2#)
 473             v# -> cont (ITunknown (['_',':',C# v#])) (stepOnBy# buf 2#)
 474   _ ->
 475     case expandWhile (is_kwd_char) buf of
 476      buf' ->
 477       let kw = lexemeToFastString buf' in
 478 --    _trace ("kw: "++lexemeToString buf') $
 479       case lookupUFM ifaceKeywordsFM kw of
 480        Nothing -> cont (ITunknown (_UNPK_ kw)) -- (minor) sigh
 481                   (stepOverLexeme buf')
 482        Just xx -> cont xx (stepOverLexeme buf')
 483
 484 lex_decl cont buf =
 485  case doDiscard False buf of -- spin until ;; is found
 486    buf' ->
 487       {- _trace (show (lexemeToString buf')) $ -}
 488       case currentChar# buf' of
 489        '\n'# -> -- newline, no id info.
 490            cont (ITtysig (lexemeToBuffer (decLexeme buf')) Nothing)
 491                 (stepOverLexeme buf')
 492        '\r'# -> -- just to be sure for those Win* boxes..
 493            cont (ITtysig (lexemeToBuffer (decLexeme buf')) Nothing)
 494                 (stepOverLexeme buf')
 495        '\NUL'# ->
 496            cont (ITtysig (lexemeToBuffer (decLexeme buf')) Nothing)
 497                 (stepOverLexeme buf')
 498        c     -> -- run all over the id info
 499          case doDiscard False (stepOverLexeme buf') of -- spin until ;; is found (outside a string!)
 500            buf'' ->
 501                     --_trace ((C# c):show (lexemeToString (decLexeme buf')))  $
 502                     --_trace (show (lexemeToString (decLexeme buf''))) $
 503                     let idinfo =
 504                             if opt_IgnoreIfacePragmas then
 505                                 Nothing
 506                             else
 507                                 Just (lexemeToBuffer (decLexeme buf''))
 508                         --_trace (show is) $
 509                     in
 510                     cont (ITtysig (lexemeToBuffer (decLexeme buf')) idinfo)
 511                         (stepOverLexeme buf'')
 512
 513 -- ToDo: hammer!
 514 is_kwd_char c@(C# c#) =
 515  isAlphanum c || -- OLD: c `elem` "_@/\\"
 516  (case c# of
 517    '_'#  -> True
 518    '@'#  -> True
 519    '/'#  -> True
 520    '\\'# -> True
 521    _     -> False)
 522
 523
 524
 525 -----------
 526 lex_cstring cont buf =
 527 -- _trace ("lex_cstring: "++[C# (currentChar# buf)]) $
 528  case expandUntilMatch buf "\'\'" of
 529    buf' -> cont (ITstring (lexemeToFastString (setCurrentPos# buf' (negateInt# 2#))))
 530            (stepOverLexeme buf')
 531
 532 -----------
 533 lex_tuple cont module_dot buf =
 534 -- _trace ("lex_tuple: "++[C# (currentChar# buf)]) $
 535   go 2 buf
 536   where
 537    go n buf =
 538     case currentChar# buf of
 539       ','# -> go (n+1) (stepOn buf)
 540       ')'# -> end_lex_id cont module_dot (ITconid (mkTupNameStr n)) (stepOn buf)
 541       _    -> cont (ITunknown ("tuple " ++ show n)) buf
 542
 543 -- Similarly ' itself is ok inside an identifier, but not at the start
 544
 545 id_arr :: _ByteArray Int
 546 id_arr =
 547  unsafePerformST (
 548   newCharArray (0,255) `thenStrictlyST` \ barr ->
 549   let
 550    loop 256# = returnStrictlyST ()
 551    loop i# =
 552     if isAlphanum (C# (chr# i#)) || is_sym (chr# i#) then
 553        writeCharArray barr (I# i#) '\1' `seqStrictlyST`
 554        loop (i# +# 1#)
 555     else
 556        writeCharArray barr (I# i#) '\0' `seqStrictlyST`
 557        loop (i# +# 1#)
 558   in
 559   loop 0#                    `seqStrictlyST`
 560   unsafeFreezeByteArray barr)
 561
 562 is_id_char (C# c#) =
 563  let
 564   _ByteArray _ arr# = id_arr
 565  in
 566  case ord# (indexCharArray# arr# (ord# c#)) of
 567   0# -> False
 568   1# -> True
 569
 570 --OLD: is_id_char c@(C# c#)  = isAlphanum c || is_sym c#
 571
 572 is_sym c#=
 573  case c# of {
 574    ':'# -> True; '_'#  -> True; '\''# -> True; '!'# -> True;
 575    '#'# -> True; '$'#  -> True; ':'#  -> True; '%'# -> True;
 576    '&'# -> True; '*'#  -> True; '+'#  -> True; '.'# -> True;
 577    '/'# -> True; '<'#  -> True; '='#  -> True; '>'# -> True;
 578    '?'# -> True; '\\'# -> True; '^'#  -> True; '|'# -> True;
 579    '-'# -> True; '~'#  -> True; '@'#  -> True; _    -> False }
 580
 581 --isAlphanum c || c `elem` ":_'!#$%&*+./<=>?@\\^|-~" -- ToDo: add ISOgraphic
 582
 583
 584 mod_arr :: _ByteArray Int
 585 mod_arr =
 586  unsafePerformST (
 587   newCharArray (0,255) `thenStrictlyST` \ barr ->
 588   let
 589    loop 256# = returnStrictlyST ()
 590    loop i# =
 591     if isAlphanum (C# (chr# i#)) || i# ==# (ord# '_'#) || i# ==# (ord# '\''#) then
 592        writeCharArray barr (I# i#) '\1' `seqStrictlyST`
 593        loop (i# +# 1#)
 594     else
 595        writeCharArray barr (I# i#) '\0' `seqStrictlyST`
 596        loop (i# +# 1#)
 597   in
 598   loop 0#                    `seqStrictlyST`
 599   unsafeFreezeByteArray barr)
 600
 601
 602 is_mod_char (C# c#) =
 603  let
 604   _ByteArray _ arr# = mod_arr
 605  in
 606  case ord# (indexCharArray# arr# (ord# c#)) of
 607   0# -> False
 608   1# -> True
 609
 610 --isAlphanum c || c == '_' || c== '\'' --`elem` "_'"
 611
 612 {-
 613 lex_id cs =
 614  case _scc_ "lex_id.span" my_span' (is_mod_char) cs of
 615    (xs, len, cs') ->
 616     case cs' of
 617      [] -> case xs of
 618             [] -> lex_id2 Nothing cs
 619             _  -> lex_id3 Nothing len xs cs
 620
 621      '.':cs'' ->
 622         case xs of
 623           [] -> lex_id2 Nothing cs
 624           _  ->
 625            let
 626             pk_str = _PK_ (xs::String)
 627             len = lengthPS pk_str
 628            in
 629            if len==len+1 then
 630               error "Well, I never!"
 631            else
 632               lex_id2 (Just pk_str) cs''
 633      _ -> case xs of
 634             [] -> lex_id2 Nothing cs
 635             _  -> lex_id3 Nothing len xs cs'
 636
 637 -}
 638
 639 lex_id cont buf =
 640 -- _trace ("lex_id: "++[C# (currentChar# buf)]) $
 641  case expandWhile (is_mod_char) buf of
 642    buf' ->
 643     case currentChar# buf' of
 644      '.'# -> munch buf' HiFile
 645      '!'# -> munch buf' HiBootFile
 646      _    -> lex_id2 cont Nothing buf'
 647    where
 648     munch buf' hif =
 649         if not (emptyLexeme buf') then
 650 --         _trace ("lex_id: "++(C# (currentChar# (stepOverLexeme buf'))):show (lexemeToFastString buf')) $
 651            case lexemeToFastString buf' of
 652              l@(FastString u# l# ba#) -> lex_id2 cont (Just (FastString u# l# ba#, hif))
 653                                                  (stepOn (stepOverLexeme buf'))
 654         else
 655            lex_id2 cont Nothing buf'
 656
 657
 658 -- Dealt with the Module.part
 659 lex_id2 cont module_dot buf =
 660 -- _trace ("lex_id2: "++[C# (currentChar# buf)]) $
 661  case currentChar# buf of
 662
 663   '['# ->       -- Special case for []
 664     case lookAhead# buf 1# of
 665      ']'# -> end_lex_id cont module_dot (ITconid SLIT("[]")) (stepOnBy# buf 2#)
 666      _    -> lex_id3 cont module_dot buf
 667
 668   '('# ->       -- Special case for (,,,)
 669     case lookAhead# buf 1# of
 670      ')'# -> end_lex_id cont module_dot (ITconid SLIT("()")) (stepOnBy# buf 2#)
 671      ','# -> lex_tuple cont module_dot (stepOnBy# buf 2#)
 672      _    -> lex_id3 cont module_dot buf
 673   ':'# -> lex_id3 cont module_dot (incLexeme buf)
 674   '-'# ->
 675      case module_dot of
 676        Nothing  -> lex_id3 cont module_dot buf
 677        Just ghc -> -- this should be "GHC" (current home of (->))
 678          case lookAhead# buf 1# of
 679           '>'# -> end_lex_id cont module_dot (ITconid SLIT("->"))
 680                         (stepOnBy# buf 2#)
 681           _    -> lex_id3 cont module_dot buf
 682   _    -> lex_id3 cont module_dot buf
 683
 684
 685
 686 -- Dealt with [], (), : special cases
 687
 688 lex_id3 cont module_dot buf =
 689 -- _trace ("lex_id3: "++[C# (currentChar# buf)]) $
 690  case expandWhile (is_id_char) buf of
 691   buf' ->
 692     case module_dot of
 693      Just _ ->
 694        end_lex_id cont module_dot (mk_var_token lexeme) new_buf
 695      Nothing ->
 696        case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of
 697          Just kwd_token -> cont kwd_token new_buf
 698          Nothing        -> cont (mk_var_token lexeme) new_buf
 699     where
 700      lexeme  = lexemeToFastString buf'
 701      new_buf = stepOverLexeme buf'
 702
 703
 704 {- OLD:
 705 lex_id2 module_dot [] ('[': ']': cs) = end_lex_id module_dot (ITconid SLIT("[]")) cs
 706 lex_id2 module_dot [] ('(': ')': cs) = end_lex_id module_dot (ITconid SLIT("()")) cs
 707 lex_id2 module_dot [] ('(': ',': cs) = lex_tuple module_dot cs
 708 lex_id2 module_dot [] (':' : cs)     = lex_id3 module_dot [':'] cs
 709 lex_id2 module_dot xs cs             = lex_id3 module_dot xs cs
 710 -}
 711
 712 -- Dealt with [], (), : special cases
 713
 714 {-
 715 lex_id3 module_dot len_xs xs cs =
 716  case my_span' (is_id_char) cs of
 717    (xs1,len_xs1,rest) ->
 718     case module_dot of
 719      Just m  -> end_lex_id (Just m) (mk_var_token rxs) rest --OLD:_PK_ (reverse xs))) rest
 720      Nothing ->
 721       case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM rxs of
 722        Just kwd_token -> kwd_token          : lexIface rest
 723        other          -> token : lexIface cs end_lex_id Nothing (mk_var_token rxs) rest
 724     where
 725      rxs = packNChars (len_xs+len_xs1) (xs++xs1) -- OLD: _PK_ (reverse xs)
 726 -}
 727 mk_var_token pk_str =
 728      let
 729       f = _HEAD_ pk_str
 730      in
 731      --
 732      -- These tests assume a non- ISO-8859-1 impl of isUpper&isLower,
 733      -- remove the second half of disjunction when using a 1.3 prelude.
 734      --
 735      if      isUpper f    then ITconid pk_str
 736      else if isLower f    then ITvarid pk_str
 737      else if f == ':'     then ITconsym pk_str
 738      else if isLowerISO f then ITvarid pk_str
 739      else if isUpperISO f then ITconid pk_str
 740      else ITvarsym pk_str
 741
 742 {-
 743     mk_var_token xs@(f:_) | isUpper f || isUpperISO f = ITconid n
 744                           | f == ':'              = ITconsym n
 745                           | isAlpha f             = ITvarid n
 746                           | otherwise             = ITvarsym n
 747                 where
 748                       n = _PK_ xs
 749 -}
 750
 751 end_lex_id cont Nothing token buf  = cont token buf
 752 end_lex_id cont (Just (m,hif)) token buf =
 753  case token of
 754    ITconid n  -> cont (ITqconid  (m,n,hif))         buf
 755    ITvarid n  -> cont (ITqvarid  (m,n,hif))         buf
 756    ITconsym n -> cont (ITqconsym (m,n,hif))         buf
 757
 758         -- Special case for ->
 759         -- "->" by itself is a special token (ITrarrow),
 760         -- but M.-> is a ITqconid
 761    ITvarsym n |  n == SLIT("->")
 762               -> cont (ITqconsym (m,n,hif))         buf
 763
 764    ITvarsym n -> cont (ITqvarsym (m,n,hif))         buf
 765
 766 -- ITbang can't happen here I think
 767 --   ITbang     -> cont (ITqvarsym (m,SLIT("!"),hif)) buf
 768
 769    _          -> cont (ITunknown (show token))      buf
 770
 771 ------------
 772 ifaceKeywordsFM :: UniqFM IfaceToken
 773 ifaceKeywordsFM = listToUFM $
 774       map (\ (x,y) -> (_PK_ x,y))
 775        [("/\\_",                ITbiglam)
 776        ,("@_",                  ITatsign)
 777        ,("letrec_",             ITletrec)
 778        ,("interface_",          ITinterface)
 779        ,("usages_",             ITusages)
 780        ,("versions_",           ITversions)
 781        ,("exports_",            ITexports)
 782        ,("instance_modules_",   ITinstance_modules)
 783        ,("instances_",          ITinstances)
 784        ,("fixities_",           ITfixities)
 785        ,("declarations_",       ITdeclarations)
 786        ,("pragmas_",            ITpragmas)
 787        ,("forall_",             ITforall)
 788        ,("U_",                  ITunfold False)
 789        ,("U!_",                 ITunfold True)
 790        ,("A_",                  ITarity)
 791        ,("coerce_in_",          ITcoerce_in)
 792        ,("coerce_out_",         ITcoerce_out)
 793        ,("bot_",                ITbottom)
 794        ,("integer_",            ITinteger_lit)
 795        ,("rational_",           ITrational_lit)
 796        ,("addr_",               ITaddr_lit)
 797        ,("float_",              ITfloat_lit)
 798        ,("string_",             ITstring_lit)
 799        ,("litlit_",             ITlit_lit)
 800        ,("ccall_",              ITccall (False, False))
 801        ,("ccall_GC_",           ITccall (False, True))
 802        ,("casm_",               ITccall (True,  False))
 803        ,("casm_GC_",            ITccall (True,  True))
 804        ]
 805
 806 haskellKeywordsFM = listToUFM $
 807       map (\ (x,y) -> (_PK_ x,y))
 808       [ ("data",                ITdata)
 809        ,("type",                ITtype)
 810        ,("newtype",             ITnewtype)
 811        ,("class",               ITclass)
 812        ,("where",               ITwhere)
 813        ,("instance",            ITinstance)
 814        ,("infixl",              ITinfixl)
 815        ,("infixr",              ITinfixr)
 816        ,("infix",               ITinfix)
 817        ,("case",                ITcase)
 818        ,("case#",               ITprim_case)
 819        ,("of",                  ITof)
 820        ,("in",                  ITin)
 821        ,("let",                 ITlet)
 822        ,("deriving",            ITderiving)
 823
 824        ,("->",                  ITrarrow)
 825        ,("\\",                  ITlam)
 826        ,("|",                   ITvbar)
 827        ,("!",                   ITbang)
 828        ,("=>",                  ITdarrow)
 829        ,("=",                   ITequal)
 830        ]
 831
 832
 833 -- doDiscard rips along really fast, looking for a double semicolon,
 834 -- indicating the end of the pragma we're skipping
 835 doDiscard inStr buf =
 836 -- _trace (show (C# (currentChar# buf))) $
 837  case currentChar# buf of
 838    ';'# ->
 839      if not inStr then
 840        case lookAhead# buf 1# of
 841         ';'# -> incLexeme (incLexeme buf)
 842         _    -> doDiscard inStr (incLexeme buf)
 843      else
 844        doDiscard inStr (incLexeme buf)
 845    '"'# ->
 846        let
 847         odd_slashes buf flg i# =
 848           case lookAhead# buf i# of
 849            '\\'# -> odd_slashes buf (not flg) (i# -# 1#)
 850            _     -> flg
 851        in
 852        case lookAhead# buf (negateInt# 1#) of --backwards, actually
 853          '\\'# -> -- escaping something..
 854            if odd_slashes buf True (negateInt# 2#) then
 855                -- odd number of slashes, " is escaped.
 856               doDiscard inStr (incLexeme buf)
 857            else
 858                -- even number of slashes, \ is escaped.
 859               doDiscard (not inStr) (incLexeme buf)
 860          _ -> case inStr of -- forced to avoid build-up
 861                True  -> doDiscard False (incLexeme buf)
 862                False -> doDiscard True  (incLexeme buf)
 863    _ -> doDiscard inStr (incLexeme buf)
 864
 865 \end{code}
 866
 867 begin{code}
 868 my_span :: (a -> Bool) -> [a] -> ([a],[a])
 869 my_span p xs = go [] xs
 870   where
 871     go so_far (x:xs') | p x = go (x:so_far) xs'
 872     go so_far xs            = (reverse so_far, xs)
 873
 874 my_span' :: (a -> Bool) -> [a] -> ([a],Int,[a])
 875 my_span' p xs = go [] 0 xs
 876   where
 877     go so_far n (x:xs') | p x = go (x:so_far) (n+1) xs'
 878     go so_far n xs            = (reverse so_far,n, xs)
 879 end{code}
 880
 881
 882 %************************************************************************
 883 %*                                                                      *
 884 \subsection{Other utility functions
 885 %*                                                                      *
 886 %************************************************************************
 887
 888 \begin{code}
 889 type IfM a = StringBuffer -> Int -> MaybeErr a Error
 890
 891 returnIf   :: a -> IfM a
 892 returnIf a s l = Succeeded a
 893
 894 thenIf     :: IfM a -> (a -> IfM b) -> IfM b
 895 m `thenIf` k = \s l ->
 896         case m s l of
 897                 Succeeded a -> k a s l
 898                 Failed err  -> Failed err
 899
 900 happyError :: IfM a
 901 happyError s l = Failed (ifaceParseErr l ([]::[IfaceToken]){-Todo-})
 902
 903 -----------------------------------------------------------------
 904
 905 ifaceParseErr l toks sty
 906   = hsep [ptext SLIT("Interface-file parse error: line"), int l, ptext SLIT("toks="), text (show (take 10 toks))]
 907 \end{code}