ghc/compiler/reader/Lex.lhs

   1 %
   2 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1996
   3 %
   4 \section[Lexical analysis]{Lexical analysis}
   5
   6 \begin{code}
   7 #include "HsVersions.h"
   8
   9 module Lex (
  10
  11         isLexCon, isLexVar, isLexId, isLexSym,
  12         isLexConId, isLexConSym, isLexVarId, isLexVarSym,
  13         mkTupNameStr, ifaceParseErr,
  14
  15         -- Monad for parser
  16         IfaceToken(..), lexIface, SYN_IE(IfM), thenIf, returnIf, happyError,
  17         StringBuffer
  18
  19     ) where
  20
  21
  22 IMPORT_1_3(Char(isDigit, isAlpha, isAlphanum, isUpper,isLower, isSpace, ord))
  23
  24 #if defined(__GLASGOW_HASKELL__) && __GLASGOW_HASKELL__ <= 201
  25 IMPORT_DELOOPER(Ubiq)
  26 IMPORT_DELOOPER(IdLoop)    -- get the CostCentre type&constructors from here
  27 #else
  28 import {-# SOURCE #-} CostCentre
  29 # if __GLASGOW_HASKELL__ == 202
  30 import PrelBase ( Char(..) )
  31 # endif
  32 # if __GLASGOW_HASKELL__ >= 209
  33 import Addr ( Addr(..) )
  34 import ST   ( runST )
  35 # endif
  36 #endif
  37
  38 import CmdLineOpts      ( opt_IgnoreIfacePragmas )
  39 import Demand           ( Demand(..) {- instance Read -} )
  40 import UniqFM           ( UniqFM, listToUFM, lookupUFM)
  41 import BasicTypes       ( NewOrData(..), IfaceFlavour(..) )
  42
  43 #if __GLASGOW_HASKELL__ >= 202
  44 import Maybes           ( MaybeErr(..) )
  45 #else
  46 import Maybes           ( Maybe(..), MaybeErr(..) )
  47 #endif
  48 import Pretty
  49
  50
  51
  52 import ErrUtils         ( Error(..) )
  53 import Outputable       ( Outputable(..), PprStyle(..) )
  54 import Util             ( nOfThem, panic )
  55
  56 import FastString
  57 import StringBuffer
  58
  59 #if __GLASGOW_HASKELL__ <= 201
  60 import PreludeGlaST
  61 #else
  62 import GlaExts
  63 #if __GLASGOW_HASKELL__ <= 209
  64 import ST ( thenST, seqST )
  65 #endif
  66 #endif
  67 \end{code}
  68
  69 %************************************************************************
  70 %*                                                                      *
  71 \subsection{Lexical categories}
  72 %*                                                                      *
  73 %************************************************************************
  74
  75 These functions test strings to see if they fit the lexical categories
  76 defined in the Haskell report.  Normally applied as in e.g. @isCon
  77 (getLocalName foo)@.
  78
  79 \begin{code}
  80 isLexCon, isLexVar, isLexId, isLexSym, isLexConId, isLexConSym,
  81  isLexVarId, isLexVarSym  :: FAST_STRING -> Bool
  82
  83 isLexCon cs = isLexConId  cs || isLexConSym cs
  84 isLexVar cs = isLexVarId  cs || isLexVarSym cs
  85
  86 isLexId  cs = isLexConId  cs || isLexVarId  cs
  87 isLexSym cs = isLexConSym cs || isLexVarSym cs
  88
  89 -------------
  90
  91 isLexConId cs
  92   | _NULL_ cs        = False
  93   | cs == SLIT("[]") = True
  94   | c  == '('        = True     -- (), (,), (,,), ...
  95   | otherwise        = isUpper c || isUpperISO c
  96   where
  97     c = _HEAD_ cs
  98
  99 isLexVarId cs
 100   | _NULL_ cs    = False
 101   | otherwise    = isLower c || isLowerISO c
 102   where
 103     c = _HEAD_ cs
 104
 105 isLexConSym cs
 106   | _NULL_ cs   = False
 107   | otherwise   = c  == ':'
 108                || cs == SLIT("->")
 109   where
 110     c = _HEAD_ cs
 111
 112 isLexVarSym cs
 113   | _NULL_ cs = False
 114   | otherwise = isSymbolASCII c
 115              || isSymbolISO c
 116   where
 117     c = _HEAD_ cs
 118
 119 -------------
 120 isSymbolASCII c = c `elem` "!#$%&*+./<=>?@\\^|~-"
 121 isSymbolISO   c = ord c `elem` (0xd7 : 0xf7 : [0xa1 .. 0xbf])
 122 isUpperISO    (C# c#) = c# `geChar#` '\xc0'# && c# `leChar#` '\xde'# && c# `neChar#` '\xd7'#
 123 --0xc0 <= oc && oc <= 0xde && oc /= 0xd7 where oc = ord c
 124 isLowerISO    (C# c#) = c# `geChar#` '\xdf'# && c# `leChar#` '\xff'# && c# `neChar#` '\xf7'#
 125 --0xdf <= oc && oc <= 0xff && oc /= 0xf7 where oc = ord c
 126 \end{code}
 127
 128
 129 %************************************************************************
 130 %*                                                                      *
 131 \subsection{Tuple strings -- ugh!}
 132 %*                                                                      *
 133 %************************************************************************
 134
 135 \begin{code}
 136 mkTupNameStr 0 = SLIT("()")
 137 mkTupNameStr 1 = panic "Name.mkTupNameStr: 1 ???"
 138 mkTupNameStr 2 = _PK_ "(,)"   -- not strictly necessary
 139 mkTupNameStr 3 = _PK_ "(,,)"  -- ditto
 140 mkTupNameStr 4 = _PK_ "(,,,)" -- ditto
 141 mkTupNameStr n = _PK_ ("(" ++ nOfThem (n-1) ',' ++ ")")
 142 \end{code}
 143
 144
 145
 146 %************************************************************************
 147 %*                                                                      *
 148 \subsection{Data types}
 149 %*                                                                      *
 150 %************************************************************************
 151
 152 The token data type, fairly un-interesting except from two constructors,
 153 @ITidinfo@ and @ITtype@, which are used to lazily lex id info (arity,
 154 strictness, unfolding etc) and types for id decls.
 155
 156 The Idea/Observation here is that the renamer needs to scan through
 157 all of an interface file before it can continue. But only a fraction
 158 of the information contained in the file turns out to be useful, so
 159 delaying as much as possible of the scanning and parsing of an
 160 interface file Makes Sense (Heap profiles of the compiler
 161 show at a reduction in heap usage by at least a factor of two,
 162 post-renamer).
 163
 164 Hence, the interface file lexer spots when value declarations are
 165 being scanned and return the @ITidinfo@ and @ITtype@ constructors
 166 for the type and any other id info for that binding (unfolding, strictness
 167 etc). These constructors are applied to the result of lexing these sub-chunks.
 168
 169 The lexing of the type and id info is all done lazily, of course, so
 170 the scanning (and subsequent parsing) will be done *only* on the ids the
 171 renamer finds out that it is interested in. The rest will just be junked.
 172 Laziness, you know it makes sense :-)
 173
 174 \begin{code}
 175 data IfaceToken
 176   = ITinterface         -- keywords
 177   | ITusages
 178   | ITversions
 179   | ITexports
 180   | ITinstance_modules
 181   | ITinstances
 182   | ITfixities
 183   | ITdeclarations
 184   | ITpragmas
 185   | ITdata
 186   | ITtype
 187   | ITnewtype
 188   | ITderiving
 189   | ITclass
 190   | ITwhere
 191   | ITinstance
 192   | ITinfixl
 193   | ITinfixr
 194   | ITinfix
 195   | ITforall
 196   | ITbang              -- magic symbols
 197   | ITvbar
 198   | ITdcolon
 199   | ITcomma
 200   | ITdarrow
 201   | ITdotdot
 202   | ITequal
 203   | ITocurly
 204   | ITobrack
 205   | IToparen
 206   | ITrarrow
 207   | ITccurly
 208   | ITcbrack
 209   | ITcparen
 210   | ITsemi
 211   | ITvarid   FAST_STRING
 212   | ITconid   FAST_STRING
 213   | ITvarsym  FAST_STRING
 214   | ITconsym  FAST_STRING
 215   | ITqvarid  (FAST_STRING,FAST_STRING,IfaceFlavour)
 216   | ITqconid  (FAST_STRING,FAST_STRING,IfaceFlavour)
 217   | ITqvarsym (FAST_STRING,FAST_STRING,IfaceFlavour)
 218   | ITqconsym (FAST_STRING,FAST_STRING,IfaceFlavour)
 219
 220   | ITtysig StringBuffer (Maybe StringBuffer)
 221                            -- lazily return the stream of tokens for
 222                            -- the info attached to an id.
 223         -- Stuff for reading unfoldings
 224   | ITarity
 225   | ITunfold Bool               -- True <=> there's an INLINE pragma on this Id
 226   | ITstrict [Demand] | ITbottom
 227   | ITlam | ITbiglam | ITcase | ITprim_case | ITlet | ITletrec | ITin | ITof
 228   | ITcoerce_in | ITcoerce_out | ITatsign
 229   | ITccall (Bool,Bool)         -- (is_casm, may_gc)
 230   | ITscc CostCentre
 231   | ITchar Char | ITstring FAST_STRING
 232   | ITinteger Integer | ITdouble Double
 233   | ITinteger_lit | ITfloat_lit | ITrational_lit | ITaddr_lit | ITlit_lit | ITstring_lit
 234   | ITunknown String            -- Used when the lexer can't make sense of it
 235   | ITeof                               -- end of file token
 236   deriving Text -- debugging
 237
 238 instance Text CostCentre -- cheat!
 239
 240 \end{code}
 241
 242 %************************************************************************
 243 %*                                                                      *
 244 \subsection{The lexical analyser}
 245 %*                                                                      *
 246 %************************************************************************
 247
 248 \begin{code}
 249 lexIface :: (IfaceToken -> IfM a) -> IfM a
 250 lexIface cont buf =
 251  _scc_ "Lexer"
 252 -- if bufferExhausted buf then
 253 --  []
 254 -- else
 255 --  _trace ("Lexer: "++[C# (currentChar# buf)]) $
 256   case currentChar# buf of
 257       -- whitespace and comments, ignore.
 258     ' '#  -> lexIface cont (stepOn buf)
 259     '\t'# -> lexIface cont (stepOn buf)
 260     '\n'# -> \line -> lexIface cont (stepOn buf) (line+1)
 261
 262 -- Numbers and comments
 263     '-'#  ->
 264       case lookAhead# buf 1# of
 265         '-'# -> lex_comment cont (stepOnBy# buf 2#)
 266         c    ->
 267           if isDigit (C# c)
 268           then lex_num cont (negate) (ord# c -# ord# '0'#) (incLexeme (incLexeme buf))
 269           else lex_id cont buf
 270
 271 -- Leave out nested comments for now; danger of finding { and - juxtaposed by mistake?
 272 --    '{' : '-' : cs -> lex_nested_comment 1{-one seen-} cs
 273
 274     '('# ->
 275          case prefixMatch (stepOn buf) "..)" of
 276            Just buf' ->  cont ITdotdot (stepOverLexeme buf')
 277            Nothing ->
 278             case lookAhead# buf 1# of
 279               ','# -> lex_tuple cont Nothing  (stepOnBy# buf 2#)
 280               ')'# -> cont (ITconid SLIT("()")) (stepOnBy# buf 2#)
 281               _    -> cont IToparen (stepOn buf)
 282
 283     '{'# -> cont ITocurly (stepOn buf)
 284     '}'# -> cont ITccurly (stepOn buf)
 285     ')'# -> cont ITcparen (stepOn buf)
 286     '['# ->
 287       case lookAhead# buf 1# of
 288         ']'# -> cont (ITconid SLIT("[]")) (stepOnBy# buf 2#)
 289         _    -> cont ITobrack (stepOn buf)
 290     ']'# -> cont ITcbrack (stepOn buf)
 291     ','# -> cont ITcomma  (stepOn buf)
 292     ':'# -> case lookAhead# buf 1# of
 293               ':'# -> cont ITdcolon (stepOnBy# buf 2#)
 294               _    -> lex_id cont (incLexeme buf)
 295     ';'#  -> cont ITsemi (stepOn buf)
 296     '\"'# -> case untilEndOfString# (stepOn buf) of
 297               buf' ->
 298                   -- the string literal does *not* include the dquotes
 299                 case lexemeToFastString buf' of
 300                  v -> cont (ITstring v) (stepOn (stepOverLexeme buf'))
 301
 302     '\''# -> --
 303              -- untilEndOfChar# extends the current lexeme until
 304              -- it hits a non-escaped single quote. The lexeme of the
 305              -- StringBuffer returned does *not* include the closing quote,
 306              -- hence we augment the lexeme and make sure to add the
 307              -- starting quote, before `read'ing the string.
 308              --
 309              case untilEndOfChar# (stepOn buf) of
 310                buf' -> case reads ('\'':lexemeToString (incLexeme buf')) of
 311                         [  (ch, rest)] -> cont (ITchar ch) (stepOverLexeme (incLexeme buf'))
 312
 313 -- ``thingy'' form for casm
 314     '`'# ->
 315             case lookAhead# buf 1# of
 316               '`'# -> lex_cstring cont (stepOnBy# buf 2#) -- remove the `` and go.
 317               _    -> lex_id cont (incLexeme buf)         -- add ` to lexeme and assume
 318                                                      -- scanning an id of some sort.
 319 -- Keywords
 320     '_'# ->
 321          case lookAhead# buf 1# of
 322            'S'# -> case lookAhead# buf 2# of
 323                     '_'# ->
 324                             lex_demand cont (stepOnUntil (not . isSpace)
 325                                             (stepOnBy# buf 3#)) -- past _S_
 326            's'# -> case prefixMatch (stepOnBy# buf 2#) "cc_" of
 327                      Just buf' -> lex_scc cont (stepOnUntil (not . isSpace) (stepOverLexeme buf'))
 328                      Nothing   -> lex_keyword cont (stepOnBy# buf 1#) -- drop the '_' and assume
 329                                                                  -- it is a keyword.
 330            _    -> lex_keyword cont (stepOn buf)
 331
 332     '\NUL'# ->
 333             if bufferExhausted (stepOn buf) then
 334                cont ITeof buf
 335             else
 336                lex_id cont buf
 337     c ->
 338         if isDigit (C# c) then
 339            lex_num cont (id) (ord# c -# ord# '0'#) (incLexeme buf)
 340         else
 341            lex_id cont buf
 342 --  where
 343 lex_comment cont buf =
 344 --   _trace ("comment: "++[C# (currentChar# buf)]) $
 345    case untilChar# buf '\n'# of {buf' -> lexIface cont (stepOverLexeme buf')}
 346
 347 ------------------
 348 lex_demand cont buf =
 349 -- _trace ("demand: "++[C# (currentChar# buf)]) $
 350  case read_em [] buf of { (ls,buf') -> cont (ITstrict ls) (stepOverLexeme buf')}
 351  where
 352    -- code snatched from Demand.lhs
 353   read_em acc buf =
 354 --   _trace ("read_em: "++[C# (currentChar# buf)]) $
 355    case currentChar# buf of
 356     'L'# -> read_em (WwLazy False : acc) (stepOn buf)
 357     'A'# -> read_em (WwLazy True  : acc) (stepOn buf)
 358     'S'# -> read_em (WwStrict     : acc) (stepOn buf)
 359     'P'# -> read_em (WwPrim       : acc) (stepOn buf)
 360     'E'# -> read_em (WwEnum       : acc) (stepOn buf)
 361     ')'# -> (reverse acc, stepOn buf)
 362     'U'# -> do_unpack DataType True  acc (stepOnBy# buf 2#)
 363     'u'# -> do_unpack DataType False acc (stepOnBy# buf 2#)
 364     'N'# -> do_unpack NewType True  acc (stepOnBy# buf 2#)
 365     'n'# -> do_unpack NewType False acc (stepOnBy# buf 2#)
 366     _    -> (reverse acc, buf)
 367
 368   do_unpack new_or_data wrapper_unpacks acc buf
 369    = case read_em [] buf of
 370       (stuff, rest) -> read_em (WwUnpack new_or_data wrapper_unpacks stuff : acc) rest
 371
 372 ------------------
 373 lex_scc cont buf =
 374 -- _trace ("scc: "++[C# (currentChar# buf)]) $
 375  case currentChar# buf of
 376   '"'# ->
 377       -- YUCK^2
 378      case prefixMatch (stepOn buf) "NO_CC\"" of
 379       Just buf' -> cont (ITscc noCostCentre) (stepOverLexeme buf')
 380       Nothing ->
 381        case prefixMatch (stepOn buf) "CURRENT_CC\"" of
 382         Just buf' -> cont (ITscc useCurrentCostCentre) (stepOverLexeme buf')
 383         Nothing   ->
 384          case prefixMatch (stepOn buf) "OVERHEAD\"" of
 385          Just buf' -> cont (ITscc overheadCostCentre) (stepOverLexeme buf')
 386          Nothing   ->
 387           case prefixMatch (stepOn buf) "DONT_CARE\"" of
 388            Just buf' -> cont (ITscc dontCareCostCentre) (stepOverLexeme buf')
 389            Nothing   ->
 390             case prefixMatch (stepOn buf) "SUBSUMED\"" of
 391              Just buf' -> cont (ITscc subsumedCosts) (stepOverLexeme buf')
 392              Nothing ->
 393               case prefixMatch (stepOn buf) "CAFs_in_...\"" of
 394                Just buf' -> cont (ITscc preludeCafsCostCentre) (stepOverLexeme buf')
 395                Nothing ->
 396                 case prefixMatch (stepOn buf) "CC_CAFs_in_..." of
 397                  Just buf' ->
 398                   case untilChar# (stepOverLexeme buf') '\"'# of
 399                    buf'' -> cont (ITscc (mkAllCafsCC ({-module-}lexemeToFastString buf'') _NIL_)) (stepOn (stepOverLexeme buf''))
 400                  Nothing ->
 401                   case prefixMatch (stepOn buf) "DICTs_in_...\"" of
 402                    Just buf' -> cont (ITscc (preludeDictsCostCentre True)) (stepOverLexeme buf')
 403                    Nothing ->
 404                     case prefixMatch (stepOn buf) "CC_DICTs_in_..." of
 405                      Just buf' ->
 406                       case untilChar# (stepOverLexeme buf') '\"'# of
 407                        buf'' -> cont (ITscc (mkAllDictsCC (lexemeToFastString buf'') _NIL_ True))
 408                                 (stepOn (stepOverLexeme buf''))
 409                      Nothing ->
 410                       let
 411                        match_user_cc buf =
 412                         case untilChar# buf '/'# of
 413                          buf' ->
 414                           let mod_name = lexemeToFastString buf' in
 415 --                        case untilChar# (stepOn (stepOverLexeme buf')) '/'# of
 416 --                         buf'' ->
 417 --                            let grp_name = lexemeToFastString buf'' in
 418                             case untilChar# (stepOn (stepOverLexeme buf')) '\"'# of
 419                              buf'' ->
 420                                -- The label may contain arbitrary characters, so it
 421                                -- may have been escaped etc., hence we `read' it in to get
 422                                -- rid of these meta-chars in the string and then pack it (again.)
 423                                -- ToDo: do the same for module name (single quotes allowed in m-names).
 424                                -- BTW, the code in this module is totally gruesome..
 425                                let upk_label = _UNPK_ (lexemeToFastString buf'') in
 426                                case reads ('"':upk_label++"\"") of
 427                                 ((cc_label,_):_) ->
 428                                     let cc_name = _PK_ cc_label in
 429                                     (mkUserCC cc_name mod_name _NIL_{-grp_name-},
 430                                      stepOn (stepOverLexeme buf''))
 431                                 _ ->
 432                                   trace ("trouble lexing scc label: " ++ upk_label ++ " , ignoring")
 433                                   (mkUserCC _NIL_ mod_name _NIL_{-grp_name-},
 434                                    stepOn (stepOverLexeme buf''))
 435                       in
 436                       case prefixMatch (stepOn buf) "CAF:" of
 437                        Just buf' ->
 438                          case match_user_cc (stepOverLexeme buf') of
 439                           (cc, buf'') -> cont (ITscc (cafifyCC cc)) buf''
 440                        Nothing ->
 441                          case match_user_cc (stepOn buf) of
 442                           (cc, buf'') -> cont (ITscc cc) buf''
 443   c -> cont (ITunknown [C# c]) (stepOn buf)
 444
 445
 446 -----------
 447 lex_num :: (IfaceToken -> IfM a) ->
 448         (Int -> Int) -> Int# -> IfM a
 449 lex_num cont minus acc# buf =
 450 -- _trace ("lex_num: "++[C# (currentChar# buf)]) $
 451  case scanNumLit (I# acc#) buf of
 452      (acc',buf') ->
 453        case currentChar# buf' of
 454          '.'# ->
 455              -- this case is not optimised at all, as the
 456              -- presence of floating point numbers in interface
 457              -- files is not that common. (ToDo)
 458             case expandWhile (isDigit) (incLexeme buf') of
 459               buf'' -> -- points to first non digit char
 460                 case reads (lexemeToString buf'') of
 461                   [(v,_)] -> cont (ITdouble v) (stepOverLexeme buf'')
 462          _ -> cont (ITinteger (fromInt (minus acc'))) (stepOverLexeme buf')
 463
 464 --         case reads (lexemeToString buf') of
 465 --           [(i,_)] -> cont (ITinteger i) (stepOverLexeme buf')
 466
 467 ------------
 468 lex_keyword cont buf =
 469 -- _trace ("lex_keyword: "++[C# (currentChar# buf)]) $
 470  case currentChar# buf of
 471   ':'# -> case lookAhead# buf 1# of
 472             '_'# -> -- a binding, type (and other id-info) follows,
 473                     -- to make the parser ever so slightly, we push
 474                     --
 475                 lex_decl cont (stepOnBy# buf 2#)
 476             v# -> cont (ITunknown (['_',':',C# v#])) (stepOnBy# buf 2#)
 477   _ ->
 478     case expandWhile (is_kwd_char) buf of
 479      buf' ->
 480       let kw = lexemeToFastString buf' in
 481 --    _trace ("kw: "++lexemeToString buf') $
 482       case lookupUFM ifaceKeywordsFM kw of
 483        Nothing -> cont (ITunknown (_UNPK_ kw)) -- (minor) sigh
 484                   (stepOverLexeme buf')
 485        Just xx -> cont xx (stepOverLexeme buf')
 486
 487 lex_decl cont buf =
 488  case doDiscard False buf of -- spin until ;; is found
 489    buf' ->
 490       {- _trace (show (lexemeToString buf')) $ -}
 491       case currentChar# buf' of
 492        '\n'# -> -- newline, no id info.
 493            cont (ITtysig (lexemeToBuffer (decLexeme buf')) Nothing)
 494                 (stepOverLexeme buf')
 495        '\r'# -> -- just to be sure for those Win* boxes..
 496            cont (ITtysig (lexemeToBuffer (decLexeme buf')) Nothing)
 497                 (stepOverLexeme buf')
 498        '\NUL'# ->
 499            cont (ITtysig (lexemeToBuffer (decLexeme buf')) Nothing)
 500                 (stepOverLexeme buf')
 501        c     -> -- run all over the id info
 502          case doDiscard False (stepOverLexeme buf') of -- spin until ;; is found (outside a string!)
 503            buf'' ->
 504                     --_trace ((C# c):show (lexemeToString (decLexeme buf')))  $
 505                     --_trace (show (lexemeToString (decLexeme buf''))) $
 506                     let idinfo =
 507                             if opt_IgnoreIfacePragmas then
 508                                 Nothing
 509                             else
 510                                 Just (lexemeToBuffer (decLexeme buf''))
 511                         --_trace (show is) $
 512                     in
 513                     cont (ITtysig (lexemeToBuffer (decLexeme buf')) idinfo)
 514                         (stepOverLexeme buf'')
 515
 516 -- ToDo: hammer!
 517 is_kwd_char c@(C# c#) =
 518  isAlphanum c || -- OLD: c `elem` "_@/\\"
 519  (case c# of
 520    '_'#  -> True
 521    '@'#  -> True
 522    '/'#  -> True
 523    '\\'# -> True
 524    _     -> False)
 525
 526
 527
 528 -----------
 529 lex_cstring cont buf =
 530 -- _trace ("lex_cstring: "++[C# (currentChar# buf)]) $
 531  case expandUntilMatch buf "\'\'" of
 532    buf' -> cont (ITstring (lexemeToFastString (setCurrentPos# buf' (negateInt# 2#))))
 533            (stepOverLexeme buf')
 534
 535 -----------
 536 lex_tuple cont module_dot buf =
 537 -- _trace ("lex_tuple: "++[C# (currentChar# buf)]) $
 538   go 2 buf
 539   where
 540    go n buf =
 541     case currentChar# buf of
 542       ','# -> go (n+1) (stepOn buf)
 543       ')'# -> end_lex_id cont module_dot (ITconid (mkTupNameStr n)) (stepOn buf)
 544       _    -> cont (ITunknown ("tuple " ++ show n)) buf
 545
 546 -- Similarly ' itself is ok inside an identifier, but not at the start
 547
 548 id_arr :: _ByteArray Int
 549 id_arr =
 550  unsafePerformST (
 551   newCharArray (0,255) `thenStrictlyST` \ barr ->
 552   let
 553    loop 256# = returnStrictlyST ()
 554    loop i# =
 555     if isAlphanum (C# (chr# i#)) || is_sym (chr# i#) then
 556        writeCharArray barr (I# i#) '\1' `seqStrictlyST`
 557        loop (i# +# 1#)
 558     else
 559        writeCharArray barr (I# i#) '\0' `seqStrictlyST`
 560        loop (i# +# 1#)
 561   in
 562   loop 0#                    `seqStrictlyST`
 563   unsafeFreezeByteArray barr)
 564
 565 is_id_char (C# c#) =
 566  let
 567   _ByteArray _ arr# = id_arr
 568  in
 569  case ord# (indexCharArray# arr# (ord# c#)) of
 570   0# -> False
 571   1# -> True
 572
 573 --OLD: is_id_char c@(C# c#)  = isAlphanum c || is_sym c#
 574
 575 is_sym c#=
 576  case c# of {
 577    ':'# -> True; '_'#  -> True; '\''# -> True; '!'# -> True;
 578    '#'# -> True; '$'#  -> True; ':'#  -> True; '%'# -> True;
 579    '&'# -> True; '*'#  -> True; '+'#  -> True; '.'# -> True;
 580    '/'# -> True; '<'#  -> True; '='#  -> True; '>'# -> True;
 581    '?'# -> True; '\\'# -> True; '^'#  -> True; '|'# -> True;
 582    '-'# -> True; '~'#  -> True; '@'#  -> True; _    -> False }
 583
 584 --isAlphanum c || c `elem` ":_'!#$%&*+./<=>?@\\^|-~" -- ToDo: add ISOgraphic
 585
 586
 587 mod_arr :: _ByteArray Int
 588 mod_arr =
 589  unsafePerformST (
 590   newCharArray (0,255) `thenStrictlyST` \ barr ->
 591   let
 592    loop 256# = returnStrictlyST ()
 593    loop i# =
 594     if isAlphanum (C# (chr# i#)) || i# ==# (ord# '_'#) || i# ==# (ord# '\''#) then
 595        writeCharArray barr (I# i#) '\1' `seqStrictlyST`
 596        loop (i# +# 1#)
 597     else
 598        writeCharArray barr (I# i#) '\0' `seqStrictlyST`
 599        loop (i# +# 1#)
 600   in
 601   loop 0#                    `seqStrictlyST`
 602   unsafeFreezeByteArray barr)
 603
 604
 605 is_mod_char (C# c#) =
 606  let
 607   _ByteArray _ arr# = mod_arr
 608  in
 609  case ord# (indexCharArray# arr# (ord# c#)) of
 610   0# -> False
 611   1# -> True
 612
 613 --isAlphanum c || c == '_' || c== '\'' --`elem` "_'"
 614
 615 {-
 616 lex_id cs =
 617  case _scc_ "lex_id.span" my_span' (is_mod_char) cs of
 618    (xs, len, cs') ->
 619     case cs' of
 620      [] -> case xs of
 621             [] -> lex_id2 Nothing cs
 622             _  -> lex_id3 Nothing len xs cs
 623
 624      '.':cs'' ->
 625         case xs of
 626           [] -> lex_id2 Nothing cs
 627           _  ->
 628            let
 629             pk_str = _PK_ (xs::String)
 630             len = lengthPS pk_str
 631            in
 632            if len==len+1 then
 633               error "Well, I never!"
 634            else
 635               lex_id2 (Just pk_str) cs''
 636      _ -> case xs of
 637             [] -> lex_id2 Nothing cs
 638             _  -> lex_id3 Nothing len xs cs'
 639
 640 -}
 641
 642 lex_id cont buf =
 643 -- _trace ("lex_id: "++[C# (currentChar# buf)]) $
 644  case expandWhile (is_mod_char) buf of
 645    buf' ->
 646     case currentChar# buf' of
 647      '.'# -> munch buf' HiFile
 648      '!'# -> munch buf' HiBootFile
 649      _    -> lex_id2 cont Nothing buf'
 650    where
 651     munch buf' hif =
 652         if not (emptyLexeme buf') then
 653 --         _trace ("lex_id: "++(C# (currentChar# (stepOverLexeme buf'))):show (lexemeToFastString buf')) $
 654            case lexemeToFastString buf' of
 655              l@(FastString u# l# ba#) -> lex_id2 cont (Just (FastString u# l# ba#, hif))
 656                                                  (stepOn (stepOverLexeme buf'))
 657         else
 658            lex_id2 cont Nothing buf'
 659
 660
 661 -- Dealt with the Module.part
 662 lex_id2 cont module_dot buf =
 663 -- _trace ("lex_id2: "++[C# (currentChar# buf)]) $
 664  case currentChar# buf of
 665
 666   '['# ->       -- Special case for []
 667     case lookAhead# buf 1# of
 668      ']'# -> end_lex_id cont module_dot (ITconid SLIT("[]")) (stepOnBy# buf 2#)
 669      _    -> lex_id3 cont module_dot buf
 670
 671   '('# ->       -- Special case for (,,,)
 672     case lookAhead# buf 1# of
 673      ')'# -> end_lex_id cont module_dot (ITconid SLIT("()")) (stepOnBy# buf 2#)
 674      ','# -> lex_tuple cont module_dot (stepOnBy# buf 2#)
 675      _    -> lex_id3 cont module_dot buf
 676   ':'# -> lex_id3 cont module_dot (incLexeme buf)
 677   '-'# ->
 678      case module_dot of
 679        Nothing  -> lex_id3 cont module_dot buf
 680        Just ghc -> -- this should be "GHC" (current home of (->))
 681          case lookAhead# buf 1# of
 682           '>'# -> end_lex_id cont module_dot (ITconid SLIT("->"))
 683                         (stepOnBy# buf 2#)
 684           _    -> lex_id3 cont module_dot buf
 685   _    -> lex_id3 cont module_dot buf
 686
 687
 688
 689 -- Dealt with [], (), : special cases
 690
 691 lex_id3 cont module_dot buf =
 692 -- _trace ("lex_id3: "++[C# (currentChar# buf)]) $
 693  case expandWhile (is_id_char) buf of
 694   buf' ->
 695     case module_dot of
 696      Just _ ->
 697        end_lex_id cont module_dot (mk_var_token lexeme) new_buf
 698      Nothing ->
 699        case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of
 700          Just kwd_token -> cont kwd_token new_buf
 701          Nothing        -> cont (mk_var_token lexeme) new_buf
 702     where
 703      lexeme  = lexemeToFastString buf'
 704      new_buf = stepOverLexeme buf'
 705
 706
 707 {- OLD:
 708 lex_id2 module_dot [] ('[': ']': cs) = end_lex_id module_dot (ITconid SLIT("[]")) cs
 709 lex_id2 module_dot [] ('(': ')': cs) = end_lex_id module_dot (ITconid SLIT("()")) cs
 710 lex_id2 module_dot [] ('(': ',': cs) = lex_tuple module_dot cs
 711 lex_id2 module_dot [] (':' : cs)     = lex_id3 module_dot [':'] cs
 712 lex_id2 module_dot xs cs             = lex_id3 module_dot xs cs
 713 -}
 714
 715 -- Dealt with [], (), : special cases
 716
 717 {-
 718 lex_id3 module_dot len_xs xs cs =
 719  case my_span' (is_id_char) cs of
 720    (xs1,len_xs1,rest) ->
 721     case module_dot of
 722      Just m  -> end_lex_id (Just m) (mk_var_token rxs) rest --OLD:_PK_ (reverse xs))) rest
 723      Nothing ->
 724       case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM rxs of
 725        Just kwd_token -> kwd_token          : lexIface rest
 726        other          -> token : lexIface cs end_lex_id Nothing (mk_var_token rxs) rest
 727     where
 728      rxs = packNChars (len_xs+len_xs1) (xs++xs1) -- OLD: _PK_ (reverse xs)
 729 -}
 730 mk_var_token pk_str =
 731      let
 732       f = _HEAD_ pk_str
 733      in
 734      --
 735      -- These tests assume a non- ISO-8859-1 impl of isUpper&isLower,
 736      -- remove the second half of disjunction when using a 1.3 prelude.
 737      --
 738      if      isUpper f    then ITconid pk_str
 739      else if isLower f    then ITvarid pk_str
 740      else if f == ':'     then ITconsym pk_str
 741      else if isLowerISO f then ITvarid pk_str
 742      else if isUpperISO f then ITconid pk_str
 743      else ITvarsym pk_str
 744
 745 {-
 746     mk_var_token xs@(f:_) | isUpper f || isUpperISO f = ITconid n
 747                           | f == ':'              = ITconsym n
 748                           | isAlpha f             = ITvarid n
 749                           | otherwise             = ITvarsym n
 750                 where
 751                       n = _PK_ xs
 752 -}
 753
 754 end_lex_id cont Nothing token buf  = cont token buf
 755 end_lex_id cont (Just (m,hif)) token buf =
 756  case token of
 757    ITconid n  -> cont (ITqconid  (m,n,hif))         buf
 758    ITvarid n  -> cont (ITqvarid  (m,n,hif))         buf
 759    ITconsym n -> cont (ITqconsym (m,n,hif))         buf
 760
 761         -- Special case for ->
 762         -- "->" by itself is a special token (ITrarrow),
 763         -- but M.-> is a ITqconid
 764    ITvarsym n |  n == SLIT("->")
 765               -> cont (ITqconsym (m,n,hif))         buf
 766
 767    ITvarsym n -> cont (ITqvarsym (m,n,hif))         buf
 768
 769 -- ITbang can't happen here I think
 770 --   ITbang     -> cont (ITqvarsym (m,SLIT("!"),hif)) buf
 771
 772    _          -> cont (ITunknown (show token))      buf
 773
 774 ------------
 775 ifaceKeywordsFM :: UniqFM IfaceToken
 776 ifaceKeywordsFM = listToUFM $
 777       map (\ (x,y) -> (_PK_ x,y))
 778        [("/\\_",                ITbiglam)
 779        ,("@_",                  ITatsign)
 780        ,("letrec_",             ITletrec)
 781        ,("interface_",          ITinterface)
 782        ,("usages_",             ITusages)
 783        ,("versions_",           ITversions)
 784        ,("exports_",            ITexports)
 785        ,("instance_modules_",   ITinstance_modules)
 786        ,("instances_",          ITinstances)
 787        ,("fixities_",           ITfixities)
 788        ,("declarations_",       ITdeclarations)
 789        ,("pragmas_",            ITpragmas)
 790        ,("forall_",             ITforall)
 791        ,("U_",                  ITunfold False)
 792        ,("U!_",                 ITunfold True)
 793        ,("A_",                  ITarity)
 794        ,("coerce_in_",          ITcoerce_in)
 795        ,("coerce_out_",         ITcoerce_out)
 796        ,("bot_",                ITbottom)
 797        ,("integer_",            ITinteger_lit)
 798        ,("rational_",           ITrational_lit)
 799        ,("addr_",               ITaddr_lit)
 800        ,("float_",              ITfloat_lit)
 801        ,("string_",             ITstring_lit)
 802        ,("litlit_",             ITlit_lit)
 803        ,("ccall_",              ITccall (False, False))
 804        ,("ccall_GC_",           ITccall (False, True))
 805        ,("casm_",               ITccall (True,  False))
 806        ,("casm_GC_",            ITccall (True,  True))
 807        ]
 808
 809 haskellKeywordsFM = listToUFM $
 810       map (\ (x,y) -> (_PK_ x,y))
 811       [ ("data",                ITdata)
 812        ,("type",                ITtype)
 813        ,("newtype",             ITnewtype)
 814        ,("class",               ITclass)
 815        ,("where",               ITwhere)
 816        ,("instance",            ITinstance)
 817        ,("infixl",              ITinfixl)
 818        ,("infixr",              ITinfixr)
 819        ,("infix",               ITinfix)
 820        ,("case",                ITcase)
 821        ,("case#",               ITprim_case)
 822        ,("of",                  ITof)
 823        ,("in",                  ITin)
 824        ,("let",                 ITlet)
 825        ,("deriving",            ITderiving)
 826
 827        ,("->",                  ITrarrow)
 828        ,("\\",                  ITlam)
 829        ,("|",                   ITvbar)
 830        ,("!",                   ITbang)
 831        ,("=>",                  ITdarrow)
 832        ,("=",                   ITequal)
 833        ]
 834
 835
 836 -- doDiscard rips along really fast, looking for a double semicolon,
 837 -- indicating the end of the pragma we're skipping
 838 doDiscard inStr buf =
 839 -- _trace (show (C# (currentChar# buf))) $
 840  case currentChar# buf of
 841    ';'# ->
 842      if not inStr then
 843        case lookAhead# buf 1# of
 844         ';'# -> incLexeme (incLexeme buf)
 845         _    -> doDiscard inStr (incLexeme buf)
 846      else
 847        doDiscard inStr (incLexeme buf)
 848    '"'# ->
 849        let
 850         odd_slashes buf flg i# =
 851           case lookAhead# buf i# of
 852            '\\'# -> odd_slashes buf (not flg) (i# -# 1#)
 853            _     -> flg
 854        in
 855        case lookAhead# buf (negateInt# 1#) of --backwards, actually
 856          '\\'# -> -- escaping something..
 857            if odd_slashes buf True (negateInt# 2#) then
 858                -- odd number of slashes, " is escaped.
 859               doDiscard inStr (incLexeme buf)
 860            else
 861                -- even number of slashes, \ is escaped.
 862               doDiscard (not inStr) (incLexeme buf)
 863          _ -> case inStr of -- forced to avoid build-up
 864                True  -> doDiscard False (incLexeme buf)
 865                False -> doDiscard True  (incLexeme buf)
 866    _ -> doDiscard inStr (incLexeme buf)
 867
 868 \end{code}
 869
 870 begin{code}
 871 my_span :: (a -> Bool) -> [a] -> ([a],[a])
 872 my_span p xs = go [] xs
 873   where
 874     go so_far (x:xs') | p x = go (x:so_far) xs'
 875     go so_far xs            = (reverse so_far, xs)
 876
 877 my_span' :: (a -> Bool) -> [a] -> ([a],Int,[a])
 878 my_span' p xs = go [] 0 xs
 879   where
 880     go so_far n (x:xs') | p x = go (x:so_far) (n+1) xs'
 881     go so_far n xs            = (reverse so_far,n, xs)
 882 end{code}
 883
 884
 885 %************************************************************************
 886 %*                                                                      *
 887 \subsection{Other utility functions
 888 %*                                                                      *
 889 %************************************************************************
 890
 891 \begin{code}
 892 type IfM a = StringBuffer -> Int -> MaybeErr a Error
 893
 894 returnIf   :: a -> IfM a
 895 returnIf a s l = Succeeded a
 896
 897 thenIf     :: IfM a -> (a -> IfM b) -> IfM b
 898 m `thenIf` k = \s l ->
 899         case m s l of
 900                 Succeeded a -> k a s l
 901                 Failed err  -> Failed err
 902
 903 happyError :: IfM a
 904 happyError s l = Failed (ifaceParseErr l ([]::[IfaceToken]){-Todo-})
 905
 906 -----------------------------------------------------------------
 907
 908 ifaceParseErr l toks sty
 909   = hsep [ptext SLIT("Interface-file parse error: line"), int l, ptext SLIT("toks="), text (show (take 10 toks))]
 910 \end{code}