ghc/compiler/reader/Lex.lhs

   1 %
   2 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1996
   3 %
   4 \section[Lexical analysis]{Lexical analysis}
   5
   6 \begin{code}
   7 module Lex (
   8
   9         isLexCon, isLexVar, isLexId, isLexSym,
  10         isLexConId, isLexConSym, isLexVarId, isLexVarSym,
  11         mkTupNameStr, ifaceParseErr,
  12
  13         -- Monad for parser
  14         IfaceToken(..), lexIface, IfM, thenIf, returnIf, getSrcLocIf,
  15         happyError,
  16         StringBuffer
  17
  18     ) where
  19
  20 #include "HsVersions.h"
  21
  22 import Char             (isDigit, isAlpha, isAlphanum, isUpper,isLower, isSpace, ord )
  23
  24 import {-# SOURCE #-} CostCentre
  25
  26 import CmdLineOpts      ( opt_IgnoreIfacePragmas )
  27 import Demand           ( Demand(..) {- instance Read -} )
  28 import UniqFM           ( UniqFM, listToUFM, lookupUFM)
  29 import BasicTypes       ( NewOrData(..), IfaceFlavour(..) )
  30 import SrcLoc           ( SrcLoc, incSrcLine )
  31
  32 import Maybes           ( MaybeErr(..) )
  33 import ErrUtils         ( ErrMsg(..) )
  34 import Outputable
  35 import Util             ( nOfThem, panic )
  36
  37 import FastString
  38 import StringBuffer
  39 import GlaExts
  40 import ST               ( runST )
  41 \end{code}
  42
  43 %************************************************************************
  44 %*                                                                      *
  45 \subsection{Lexical categories}
  46 %*                                                                      *
  47 %************************************************************************
  48
  49 These functions test strings to see if they fit the lexical categories
  50 defined in the Haskell report.  Normally applied as in e.g. @isCon
  51 (getLocalName foo)@.
  52
  53 \begin{code}
  54 isLexCon, isLexVar, isLexId, isLexSym, isLexConId, isLexConSym,
  55  isLexVarId, isLexVarSym  :: FAST_STRING -> Bool
  56
  57 isLexCon cs = isLexConId  cs || isLexConSym cs
  58 isLexVar cs = isLexVarId  cs || isLexVarSym cs
  59
  60 isLexId  cs = isLexConId  cs || isLexVarId  cs
  61 isLexSym cs = isLexConSym cs || isLexVarSym cs
  62
  63 -------------
  64
  65 isLexConId cs
  66   | _NULL_ cs        = False
  67   | cs == SLIT("[]") = True
  68   | c  == '('        = True     -- (), (,), (,,), ...
  69   | otherwise        = isUpper c || isUpperISO c
  70   where
  71     c = _HEAD_ cs
  72
  73 isLexVarId cs
  74   | _NULL_ cs    = False
  75   | otherwise    = isLower c || isLowerISO c
  76   where
  77     c = _HEAD_ cs
  78
  79 isLexConSym cs
  80   | _NULL_ cs   = False
  81   | otherwise   = c  == ':'
  82                || cs == SLIT("->")
  83   where
  84     c = _HEAD_ cs
  85
  86 isLexVarSym cs
  87   | _NULL_ cs = False
  88   | otherwise = isSymbolASCII c
  89              || isSymbolISO c
  90   where
  91     c = _HEAD_ cs
  92
  93 -------------
  94 isSymbolASCII c = c `elem` "!#$%&*+./<=>?@\\^|~-"
  95 isSymbolISO   c = ord c `elem` (0xd7 : 0xf7 : [0xa1 .. 0xbf])
  96 isUpperISO    (C# c#) = c# `geChar#` '\xc0'# && c# `leChar#` '\xde'# && c# `neChar#` '\xd7'#
  97 --0xc0 <= oc && oc <= 0xde && oc /= 0xd7 where oc = ord c
  98 isLowerISO    (C# c#) = c# `geChar#` '\xdf'# && c# `leChar#` '\xff'# && c# `neChar#` '\xf7'#
  99 --0xdf <= oc && oc <= 0xff && oc /= 0xf7 where oc = ord c
 100 \end{code}
 101
 102
 103 %************************************************************************
 104 %*                                                                      *
 105 \subsection{Tuple strings -- ugh!}
 106 %*                                                                      *
 107 %************************************************************************
 108
 109 \begin{code}
 110 mkTupNameStr 0 = SLIT("()")
 111 mkTupNameStr 1 = panic "Name.mkTupNameStr: 1 ???"
 112 mkTupNameStr 2 = _PK_ "(,)"   -- not strictly necessary
 113 mkTupNameStr 3 = _PK_ "(,,)"  -- ditto
 114 mkTupNameStr 4 = _PK_ "(,,,)" -- ditto
 115 mkTupNameStr n = _PK_ ("(" ++ nOfThem (n-1) ',' ++ ")")
 116 \end{code}
 117
 118
 119
 120 %************************************************************************
 121 %*                                                                      *
 122 \subsection{Data types}
 123 %*                                                                      *
 124 %************************************************************************
 125
 126 The token data type, fairly un-interesting except from two constructors,
 127 @ITidinfo@ and @ITtype@, which are used to lazily lex id info (arity,
 128 strictness, unfolding etc) and types for id decls.
 129
 130 The Idea/Observation here is that the renamer needs to scan through
 131 all of an interface file before it can continue. But only a fraction
 132 of the information contained in the file turns out to be useful, so
 133 delaying as much as possible of the scanning and parsing of an
 134 interface file Makes Sense (Heap profiles of the compiler
 135 show at a reduction in heap usage by at least a factor of two,
 136 post-renamer).
 137
 138 Hence, the interface file lexer spots when value declarations are
 139 being scanned and return the @ITidinfo@ and @ITtype@ constructors
 140 for the type and any other id info for that binding (unfolding, strictness
 141 etc). These constructors are applied to the result of lexing these sub-chunks.
 142
 143 The lexing of the type and id info is all done lazily, of course, so
 144 the scanning (and subsequent parsing) will be done *only* on the ids the
 145 renamer finds out that it is interested in. The rest will just be junked.
 146 Laziness, you know it makes sense :-)
 147
 148 \begin{code}
 149 data IfaceToken
 150   = ITinterface         -- keywords
 151   | ITusages
 152   | ITversions
 153   | ITexports
 154   | ITinstance_modules
 155   | ITinstances
 156   | ITfixities
 157   | ITdeclarations
 158   | ITpragmas
 159   | ITdata
 160   | ITtype
 161   | ITnewtype
 162   | ITderiving
 163   | ITclass
 164   | ITwhere
 165   | ITinstance
 166   | ITinfixl
 167   | ITinfixr
 168   | ITinfix
 169   | ITforall
 170   | ITbang              -- magic symbols
 171   | ITvbar
 172   | ITdcolon
 173   | ITcomma
 174   | ITdarrow
 175   | ITdotdot
 176   | ITequal
 177   | ITocurly
 178   | ITobrack
 179   | IToparen
 180   | ITrarrow
 181   | ITccurly
 182   | ITcbrack
 183   | ITcparen
 184   | ITsemi
 185   | ITvarid   FAST_STRING
 186   | ITconid   FAST_STRING
 187   | ITvarsym  FAST_STRING
 188   | ITconsym  FAST_STRING
 189   | ITqvarid  (FAST_STRING,FAST_STRING,IfaceFlavour)
 190   | ITqconid  (FAST_STRING,FAST_STRING,IfaceFlavour)
 191   | ITqvarsym (FAST_STRING,FAST_STRING,IfaceFlavour)
 192   | ITqconsym (FAST_STRING,FAST_STRING,IfaceFlavour)
 193
 194   | ITtysig StringBuffer (Maybe StringBuffer)
 195                            -- lazily return the stream of tokens for
 196                            -- the info attached to an id.
 197         -- Stuff for reading unfoldings
 198   | ITarity
 199   | ITunfold Bool               -- True <=> there's an INLINE pragma on this Id
 200   | ITstrict [Demand] | ITbottom
 201   | ITlam | ITbiglam | ITcase | ITprim_case | ITlet | ITletrec | ITin | ITof
 202   | ITcoerce_in | ITcoerce_out | ITatsign
 203   | ITccall (Bool,Bool)         -- (is_casm, may_gc)
 204   | ITscc CostCentre
 205   | ITchar Char | ITstring FAST_STRING
 206   | ITinteger Integer | ITdouble Double
 207   | ITinteger_lit | ITfloat_lit | ITrational_lit | ITaddr_lit | ITlit_lit | ITstring_lit
 208   | ITunknown String            -- Used when the lexer can't make sense of it
 209   | ITeof                               -- end of file token
 210   deriving Text -- debugging
 211
 212 instance Text CostCentre -- cheat!
 213
 214 \end{code}
 215
 216 %************************************************************************
 217 %*                                                                      *
 218 \subsection{The lexical analyser}
 219 %*                                                                      *
 220 %************************************************************************
 221
 222 \begin{code}
 223 lexIface :: (IfaceToken -> IfM a) -> IfM a
 224 lexIface cont buf =
 225  _scc_ "Lexer"
 226 -- if bufferExhausted buf then
 227 --  []
 228 -- else
 229 --  _trace ("Lexer: "++[C# (currentChar# buf)]) $
 230   case currentChar# buf of
 231       -- whitespace and comments, ignore.
 232     ' '#  -> lexIface cont (stepOn buf)
 233     '\t'# -> lexIface cont (stepOn buf)
 234     '\n'# -> \ loc -> lexIface cont (stepOn buf) (incSrcLine loc)
 235
 236 -- Numbers and comments
 237     '-'#  ->
 238       case lookAhead# buf 1# of
 239         '-'# -> lex_comment cont (stepOnBy# buf 2#)
 240         c    ->
 241           if isDigit (C# c)
 242           then lex_num cont (negate) (ord# c -# ord# '0'#) (incLexeme (incLexeme buf))
 243           else lex_id cont buf
 244
 245 -- Leave out nested comments for now; danger of finding { and - juxtaposed by mistake?
 246 --    '{' : '-' : cs -> lex_nested_comment 1{-one seen-} cs
 247
 248     '('# ->
 249          case prefixMatch (stepOn buf) "..)" of
 250            Just buf' ->  cont ITdotdot (stepOverLexeme buf')
 251            Nothing ->
 252             case lookAhead# buf 1# of
 253               ','# -> lex_tuple cont Nothing  (stepOnBy# buf 2#)
 254               ')'# -> cont (ITconid SLIT("()")) (stepOnBy# buf 2#)
 255               _    -> cont IToparen (stepOn buf)
 256
 257     '{'# -> cont ITocurly (stepOn buf)
 258     '}'# -> cont ITccurly (stepOn buf)
 259     ')'# -> cont ITcparen (stepOn buf)
 260     '['# ->
 261       case lookAhead# buf 1# of
 262         ']'# -> cont (ITconid SLIT("[]")) (stepOnBy# buf 2#)
 263         _    -> cont ITobrack (stepOn buf)
 264     ']'# -> cont ITcbrack (stepOn buf)
 265     ','# -> cont ITcomma  (stepOn buf)
 266     ';'#  -> cont ITsemi (stepOn buf)
 267     '\"'# -> case untilEndOfString# (stepOn buf) of
 268               buf' ->
 269                   -- the string literal does *not* include the dquotes
 270                 case lexemeToFastString buf' of
 271                  v -> cont (ITstring v) (stepOn (stepOverLexeme buf'))
 272
 273     '\''# -> --
 274              -- untilEndOfChar# extends the current lexeme until
 275              -- it hits a non-escaped single quote. The lexeme of the
 276              -- StringBuffer returned does *not* include the closing quote,
 277              -- hence we augment the lexeme and make sure to add the
 278              -- starting quote, before `read'ing the string.
 279              --
 280              case untilEndOfChar# (stepOn buf) of
 281                buf' -> case reads ('\'':lexemeToString (incLexeme buf')) of
 282                         [  (ch, rest)] -> cont (ITchar ch) (stepOverLexeme (incLexeme buf'))
 283
 284 -- ``thingy'' form for casm
 285     '`'# ->
 286             case lookAhead# buf 1# of
 287               '`'# -> lex_cstring cont (stepOnBy# buf 2#) -- remove the `` and go.
 288               _    -> lex_id cont (incLexeme buf)         -- add ` to lexeme and assume
 289                                                      -- scanning an id of some sort.
 290 -- Keywords
 291     '_'# ->
 292          case lookAhead# buf 1# of
 293            'S'# -> case lookAhead# buf 2# of
 294                     '_'# ->
 295                             lex_demand cont (stepOnUntil (not . isSpace)
 296                                             (stepOnBy# buf 3#)) -- past _S_
 297            's'# -> case prefixMatch (stepOnBy# buf 2#) "cc_" of
 298                      Just buf' -> lex_scc cont (stepOnUntil (not . isSpace) (stepOverLexeme buf'))
 299                      Nothing   -> lex_keyword cont (stepOnBy# buf 1#) -- drop the '_' and assume
 300                                                                  -- it is a keyword.
 301            _    -> lex_keyword cont (stepOn buf)
 302
 303     '\NUL'# ->
 304             if bufferExhausted (stepOn buf) then
 305                cont ITeof buf
 306             else
 307                lex_id cont buf
 308     c ->
 309         if isDigit (C# c) then
 310            lex_num cont (id) (ord# c -# ord# '0'#) (incLexeme buf)
 311         else
 312            lex_id cont buf
 313 --  where
 314 lex_comment cont buf =
 315 --   _trace ("comment: "++[C# (currentChar# buf)]) $
 316    case untilChar# buf '\n'# of {buf' -> lexIface cont (stepOverLexeme buf')}
 317
 318 ------------------
 319 lex_demand cont buf =
 320 -- _trace ("demand: "++[C# (currentChar# buf)]) $
 321  case read_em [] buf of { (ls,buf') -> cont (ITstrict ls) (stepOverLexeme buf')}
 322  where
 323    -- code snatched from Demand.lhs
 324   read_em acc buf =
 325 --   _trace ("read_em: "++[C# (currentChar# buf)]) $
 326    case currentChar# buf of
 327     'L'# -> read_em (WwLazy False : acc) (stepOn buf)
 328     'A'# -> read_em (WwLazy True  : acc) (stepOn buf)
 329     'S'# -> read_em (WwStrict     : acc) (stepOn buf)
 330     'P'# -> read_em (WwPrim       : acc) (stepOn buf)
 331     'E'# -> read_em (WwEnum       : acc) (stepOn buf)
 332     ')'# -> (reverse acc, stepOn buf)
 333     'U'# -> do_unpack DataType True  acc (stepOnBy# buf 2#)
 334     'u'# -> do_unpack DataType False acc (stepOnBy# buf 2#)
 335     'N'# -> do_unpack NewType True  acc (stepOnBy# buf 2#)
 336     'n'# -> do_unpack NewType False acc (stepOnBy# buf 2#)
 337     _    -> (reverse acc, buf)
 338
 339   do_unpack new_or_data wrapper_unpacks acc buf
 340    = case read_em [] buf of
 341       (stuff, rest) -> read_em (WwUnpack new_or_data wrapper_unpacks stuff : acc) rest
 342
 343 ------------------
 344 lex_scc cont buf =
 345 -- _trace ("scc: "++[C# (currentChar# buf)]) $
 346  case currentChar# buf of
 347   '"'# ->
 348       -- YUCK^2
 349      case prefixMatch (stepOn buf) "NO_CC\"" of
 350       Just buf' -> cont (ITscc noCostCentre) (stepOverLexeme buf')
 351       Nothing ->
 352        case prefixMatch (stepOn buf) "CURRENT_CC\"" of
 353         Just buf' -> cont (ITscc useCurrentCostCentre) (stepOverLexeme buf')
 354         Nothing   ->
 355          case prefixMatch (stepOn buf) "OVERHEAD\"" of
 356          Just buf' -> cont (ITscc overheadCostCentre) (stepOverLexeme buf')
 357          Nothing   ->
 358           case prefixMatch (stepOn buf) "DONT_CARE\"" of
 359            Just buf' -> cont (ITscc dontCareCostCentre) (stepOverLexeme buf')
 360            Nothing   ->
 361             case prefixMatch (stepOn buf) "SUBSUMED\"" of
 362              Just buf' -> cont (ITscc subsumedCosts) (stepOverLexeme buf')
 363              Nothing ->
 364               case prefixMatch (stepOn buf) "CAFs_in_...\"" of
 365                Just buf' -> cont (ITscc preludeCafsCostCentre) (stepOverLexeme buf')
 366                Nothing ->
 367                 case prefixMatch (stepOn buf) "CC_CAFs_in_..." of
 368                  Just buf' ->
 369                   case untilChar# (stepOverLexeme buf') '\"'# of
 370                    buf'' -> cont (ITscc (mkAllCafsCC ({-module-}lexemeToFastString buf'') _NIL_)) (stepOn (stepOverLexeme buf''))
 371                  Nothing ->
 372                   case prefixMatch (stepOn buf) "DICTs_in_...\"" of
 373                    Just buf' -> cont (ITscc (preludeDictsCostCentre True)) (stepOverLexeme buf')
 374                    Nothing ->
 375                     case prefixMatch (stepOn buf) "CC_DICTs_in_..." of
 376                      Just buf' ->
 377                       case untilChar# (stepOverLexeme buf') '\"'# of
 378                        buf'' -> cont (ITscc (mkAllDictsCC (lexemeToFastString buf'') _NIL_ True))
 379                                 (stepOn (stepOverLexeme buf''))
 380                      Nothing ->
 381                       let
 382                        match_user_cc buf =
 383                         case untilChar# buf '/'# of
 384                          buf' ->
 385                           let mod_name = lexemeToFastString buf' in
 386 --                        case untilChar# (stepOn (stepOverLexeme buf')) '/'# of
 387 --                         buf'' ->
 388 --                            let grp_name = lexemeToFastString buf'' in
 389                             case untilChar# (stepOn (stepOverLexeme buf')) '\"'# of
 390                              buf'' ->
 391                                -- The label may contain arbitrary characters, so it
 392                                -- may have been escaped etc., hence we `read' it in to get
 393                                -- rid of these meta-chars in the string and then pack it (again.)
 394                                -- ToDo: do the same for module name (single quotes allowed in m-names).
 395                                -- BTW, the code in this module is totally gruesome..
 396                                let upk_label = _UNPK_ (lexemeToFastString buf'') in
 397                                case reads ('"':upk_label++"\"") of
 398                                 ((cc_label,_):_) ->
 399                                     let cc_name = _PK_ cc_label in
 400                                     (mkUserCC cc_name mod_name _NIL_{-grp_name-},
 401                                      stepOn (stepOverLexeme buf''))
 402                                 _ ->
 403                                   trace ("trouble lexing scc label: " ++ upk_label ++ " , ignoring")
 404                                   (mkUserCC _NIL_ mod_name _NIL_{-grp_name-},
 405                                    stepOn (stepOverLexeme buf''))
 406                       in
 407                       case prefixMatch (stepOn buf) "CAF:" of
 408                        Just buf' ->
 409                          case match_user_cc (stepOverLexeme buf') of
 410                           (cc, buf'') -> cont (ITscc (cafifyCC cc)) buf''
 411                        Nothing ->
 412                          case match_user_cc (stepOn buf) of
 413                           (cc, buf'') -> cont (ITscc cc) buf''
 414   c -> cont (ITunknown [C# c]) (stepOn buf)
 415
 416
 417 -----------
 418 lex_num :: (IfaceToken -> IfM a) ->
 419         (Int -> Int) -> Int# -> IfM a
 420 lex_num cont minus acc# buf =
 421 -- _trace ("lex_num: "++[C# (currentChar# buf)]) $
 422  case scanNumLit (I# acc#) buf of
 423      (acc',buf') ->
 424        case currentChar# buf' of
 425          '.'# ->
 426              -- this case is not optimised at all, as the
 427              -- presence of floating point numbers in interface
 428              -- files is not that common. (ToDo)
 429             case expandWhile (isDigit) (incLexeme buf') of
 430               buf'' -> -- points to first non digit char
 431                 case reads (lexemeToString buf'') of
 432                   [(v,_)] -> cont (ITdouble v) (stepOverLexeme buf'')
 433          _ -> cont (ITinteger (fromInt (minus acc'))) (stepOverLexeme buf')
 434
 435 --         case reads (lexemeToString buf') of
 436 --           [(i,_)] -> cont (ITinteger i) (stepOverLexeme buf')
 437
 438 ------------
 439 lex_keyword cont buf =
 440 -- _trace ("lex_keyword: "++[C# (currentChar# buf)]) $
 441  case currentChar# buf of
 442   ':'# -> case lookAhead# buf 1# of
 443             '_'# -> -- a binding, type (and other id-info) follows,
 444                     -- to make the parser ever so slightly, we push
 445                     --
 446                 lex_decl cont (stepOnBy# buf 2#)
 447             v# -> cont (ITunknown (['_',':',C# v#])) (stepOnBy# buf 2#)
 448   _ ->
 449     case expandWhile (is_kwd_char) buf of
 450      buf' ->
 451       let kw = lexemeToFastString buf' in
 452 --    _trace ("kw: "++lexemeToString buf') $
 453       case lookupUFM ifaceKeywordsFM kw of
 454        Nothing -> cont (ITunknown (_UNPK_ kw)) -- (minor) sigh
 455                   (stepOverLexeme buf')
 456        Just xx -> cont xx (stepOverLexeme buf')
 457
 458 lex_decl cont buf =
 459  case doDiscard False buf of -- spin until ;; is found
 460    buf' ->
 461       {- _trace (show (lexemeToString buf')) $ -}
 462       case currentChar# buf' of
 463        '\n'# -> -- newline, no id info.
 464            cont (ITtysig (lexemeToBuffer (decLexeme buf')) Nothing)
 465                 (stepOverLexeme buf')
 466        '\r'# -> -- just to be sure for those Win* boxes..
 467            cont (ITtysig (lexemeToBuffer (decLexeme buf')) Nothing)
 468                 (stepOverLexeme buf')
 469        '\NUL'# ->
 470            cont (ITtysig (lexemeToBuffer (decLexeme buf')) Nothing)
 471                 (stepOverLexeme buf')
 472        c     -> -- run all over the id info
 473          case doDiscard False (stepOverLexeme buf') of -- spin until ;; is found (outside a string!)
 474            buf'' ->
 475                     --_trace ((C# c):show (lexemeToString (decLexeme buf')))  $
 476                     --_trace (show (lexemeToString (decLexeme buf''))) $
 477                     let idinfo =
 478                             if opt_IgnoreIfacePragmas then
 479                                 Nothing
 480                             else
 481                                 Just (lexemeToBuffer (decLexeme buf''))
 482                         --_trace (show is) $
 483                     in
 484                     cont (ITtysig (lexemeToBuffer (decLexeme buf')) idinfo)
 485                         (stepOverLexeme buf'')
 486
 487 -- ToDo: hammer!
 488 is_kwd_char c@(C# c#) =
 489  isAlphanum c || -- OLD: c `elem` "_@/\\"
 490  (case c# of
 491    '_'#  -> True
 492    '@'#  -> True
 493    '/'#  -> True
 494    '\\'# -> True
 495    _     -> False)
 496
 497
 498
 499 -----------
 500 lex_cstring cont buf =
 501 -- _trace ("lex_cstring: "++[C# (currentChar# buf)]) $
 502  case expandUntilMatch buf "\'\'" of
 503    buf' -> cont (ITstring (lexemeToFastString (setCurrentPos# buf' (negateInt# 2#))))
 504            (stepOverLexeme buf')
 505
 506 -----------
 507 lex_tuple cont module_dot buf =
 508 -- _trace ("lex_tuple: "++[C# (currentChar# buf)]) $
 509   go 2 buf
 510   where
 511    go n buf =
 512     case currentChar# buf of
 513       ','# -> go (n+1) (stepOn buf)
 514       ')'# -> end_lex_id cont module_dot (ITconid (mkTupNameStr n)) (stepOn buf)
 515       _    -> cont (ITunknown ("tuple " ++ show n)) buf
 516
 517 -- Similarly ' itself is ok inside an identifier, but not at the start
 518
 519 -- id_arr is an array of bytes, indexed by characters,
 520 -- containing 0 if the character isn't a valid character from an identifier
 521 -- and 1 if it is.  It's just a memo table for is_id_char.
 522 id_arr :: ByteArray Int
 523 id_arr =
 524  runST (
 525   newCharArray (0,255) >>= \ barr ->
 526   let
 527    loop 256# = return ()
 528    loop i# =
 529     if isAlphanum (C# (chr# i#)) || is_sym (chr# i#) then
 530        writeCharArray barr (I# i#) '\1'         >>
 531        loop (i# +# 1#)
 532     else
 533        writeCharArray barr (I# i#) '\0'         >>
 534        loop (i# +# 1#)
 535   in
 536   loop 0#                                       >>
 537   unsafeFreezeByteArray barr)
 538
 539 is_id_char (C# c#) =
 540  let
 541   ByteArray _ arr# = id_arr
 542  in
 543  case ord# (indexCharArray# arr# (ord# c#)) of
 544   0# -> False
 545   1# -> True
 546
 547 --OLD: is_id_char c@(C# c#)  = isAlphanum c || is_sym c#
 548
 549 is_sym c# =
 550  case c# of {
 551    ':'# -> True; '_'#  -> True; '\''# -> True; '!'# -> True;
 552    '#'# -> True; '$'#  -> True; '%'# -> True;
 553    '&'# -> True; '*'#  -> True; '+'#  -> True; '.'# -> True;
 554    '/'# -> True; '<'#  -> True; '='#  -> True; '>'# -> True;
 555    '?'# -> True; '\\'# -> True; '^'#  -> True; '|'# -> True;
 556    '-'# -> True; '~'#  -> True; '@'#  -> True; _    -> False }
 557
 558 --isAlphanum c || c `elem` ":_'!#$%&*+./<=>?@\\^|-~" -- ToDo: add ISOgraphic
 559
 560
 561 -- mod_arr is an array of bytes, indexed by characters,
 562 -- containing 0 if the character isn't a valid character from a module name,
 563 -- and 1 if it is.
 564 mod_arr :: ByteArray Int
 565 mod_arr =
 566  runST (
 567   newCharArray (0,255) >>= \ barr ->
 568   let
 569    loop 256# = return ()
 570    loop i# =
 571     if isAlphanum (C# (chr# i#)) || i# ==# (ord# '_'#) || i# ==# (ord# '\''#) then
 572        writeCharArray barr (I# i#) '\1'         >>
 573        loop (i# +# 1#)
 574     else
 575        writeCharArray barr (I# i#) '\0'         >>
 576        loop (i# +# 1#)
 577   in
 578   loop 0#                                       >>
 579   unsafeFreezeByteArray barr)
 580
 581
 582 is_mod_char (C# c#) =
 583  let
 584   ByteArray _ arr# = mod_arr
 585  in
 586  case ord# (indexCharArray# arr# (ord# c#)) of
 587   0# -> False
 588   1# -> True
 589
 590 --isAlphanum c || c == '_' || c== '\'' --`elem` "_'"
 591
 592 lex_id cont buf =
 593 -- _trace ("lex_id: "++[C# (currentChar# buf)]) $
 594  case expandWhile (is_mod_char) buf of
 595    buf' ->
 596     case currentChar# buf' of
 597      '.'# -> munch buf' HiFile
 598      '!'# -> munch buf' HiBootFile
 599      _    -> lex_id2 cont Nothing buf'
 600    where
 601     munch buf' hif =
 602         if not (emptyLexeme buf') then
 603 --         _trace ("lex_id: "++(C# (currentChar# (stepOverLexeme buf'))):show (lexemeToFastString buf')) $
 604            case lexemeToFastString buf' of
 605              l@(FastString u# l# ba#) -> lex_id2 cont (Just (FastString u# l# ba#, hif))
 606                                                  (stepOn (stepOverLexeme buf'))
 607         else
 608            lex_id2 cont Nothing buf'
 609
 610
 611 -- Dealt with the Module.part
 612 lex_id2 cont module_dot buf =
 613 -- _trace ("lex_id2: "++[C# (currentChar# buf)]) $
 614  case currentChar# buf of
 615
 616   '['# ->       -- Special case for []
 617     case lookAhead# buf 1# of
 618      ']'# -> end_lex_id cont module_dot (ITconid SLIT("[]")) (stepOnBy# buf 2#)
 619      _    -> lex_id3 cont module_dot buf
 620
 621   '('# ->       -- Special case for (,,,)
 622     case lookAhead# buf 1# of
 623      ')'# -> end_lex_id cont module_dot (ITconid SLIT("()")) (stepOnBy# buf 2#)
 624      ','# -> lex_tuple cont module_dot (stepOnBy# buf 2#)
 625      _    -> lex_id3 cont module_dot buf
 626   ':'# -> lex_id3 cont module_dot (incLexeme buf)
 627   '-'# ->
 628      case module_dot of
 629        Nothing  -> lex_id3 cont module_dot buf
 630        Just ghc -> -- this should be "GHC" (current home of (->))
 631          case lookAhead# buf 1# of
 632           '>'# -> end_lex_id cont module_dot (ITconid SLIT("->"))
 633                         (stepOnBy# buf 2#)
 634           _    -> lex_id3 cont module_dot buf
 635   _    -> lex_id3 cont module_dot buf
 636
 637
 638
 639 -- Dealt with [], (), : special cases
 640
 641 lex_id3 cont module_dot buf =
 642 -- _trace ("lex_id3: "++[C# (currentChar# buf)]) $
 643  case expandWhile (is_id_char) buf of
 644   buf' ->
 645     case module_dot of
 646      Just _ ->
 647        end_lex_id cont module_dot (mk_var_token lexeme) new_buf
 648      Nothing ->
 649        case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of
 650          Just kwd_token -> cont kwd_token new_buf
 651          Nothing        -> cont (mk_var_token lexeme) new_buf
 652     where
 653      lexeme  = lexemeToFastString buf'
 654      new_buf = stepOverLexeme buf'
 655
 656
 657 {- OLD:
 658 lex_id2 module_dot [] ('[': ']': cs) = end_lex_id module_dot (ITconid SLIT("[]")) cs
 659 lex_id2 module_dot [] ('(': ')': cs) = end_lex_id module_dot (ITconid SLIT("()")) cs
 660 lex_id2 module_dot [] ('(': ',': cs) = lex_tuple module_dot cs
 661 lex_id2 module_dot [] (':' : cs)     = lex_id3 module_dot [':'] cs
 662 lex_id2 module_dot xs cs             = lex_id3 module_dot xs cs
 663 -}
 664
 665 -- Dealt with [], (), : special cases
 666
 667 {-
 668 lex_id3 module_dot len_xs xs cs =
 669  case my_span' (is_id_char) cs of
 670    (xs1,len_xs1,rest) ->
 671     case module_dot of
 672      Just m  -> end_lex_id (Just m) (mk_var_token rxs) rest --OLD:_PK_ (reverse xs))) rest
 673      Nothing ->
 674       case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM rxs of
 675        Just kwd_token -> kwd_token          : lexIface rest
 676        other          -> token : lexIface cs end_lex_id Nothing (mk_var_token rxs) rest
 677     where
 678      rxs = packNChars (len_xs+len_xs1) (xs++xs1) -- OLD: _PK_ (reverse xs)
 679 -}
 680 mk_var_token pk_str =
 681      let
 682       f = _HEAD_ pk_str
 683      in
 684      --
 685      -- These tests assume a non- ISO-8859-1 impl of isUpper&isLower,
 686      -- remove the second half of disjunction when using a 1.3 prelude.
 687      --
 688      if      isUpper f    then ITconid pk_str
 689      else if isLower f    then ITvarid pk_str
 690      else if f == ':'     then ITconsym pk_str
 691      else if isLowerISO f then ITvarid pk_str
 692      else if isUpperISO f then ITconid pk_str
 693      else ITvarsym pk_str
 694
 695 {-
 696     mk_var_token xs@(f:_) | isUpper f || isUpperISO f = ITconid n
 697                           | f == ':'              = ITconsym n
 698                           | isAlpha f             = ITvarid n
 699                           | otherwise             = ITvarsym n
 700                 where
 701                       n = _PK_ xs
 702 -}
 703
 704 end_lex_id cont Nothing token buf  = cont token buf
 705 end_lex_id cont (Just (m,hif)) token buf =
 706  case token of
 707    ITconid n  -> cont (ITqconid  (m,n,hif))         buf
 708    ITvarid n  -> cont (ITqvarid  (m,n,hif))         buf
 709    ITconsym n -> cont (ITqconsym (m,n,hif))         buf
 710
 711         -- Special case for ->
 712         -- "->" by itself is a special token (ITrarrow),
 713         -- but M.-> is a ITqconid
 714    ITvarsym n |  n == SLIT("->")
 715               -> cont (ITqconsym (m,n,hif))         buf
 716
 717    ITvarsym n -> cont (ITqvarsym (m,n,hif))         buf
 718
 719 -- ITbang can't happen here I think
 720 --   ITbang     -> cont (ITqvarsym (m,SLIT("!"),hif)) buf
 721
 722    _          -> cont (ITunknown (show token))      buf
 723
 724 ------------
 725 ifaceKeywordsFM :: UniqFM IfaceToken
 726 ifaceKeywordsFM = listToUFM $
 727       map (\ (x,y) -> (_PK_ x,y))
 728        [("/\\_",                ITbiglam)
 729        ,("@_",                  ITatsign)
 730        ,("letrec_",             ITletrec)
 731        ,("interface_",          ITinterface)
 732        ,("usages_",             ITusages)
 733        ,("versions_",           ITversions)
 734        ,("exports_",            ITexports)
 735        ,("instance_modules_",   ITinstance_modules)
 736        ,("instances_",          ITinstances)
 737        ,("fixities_",           ITfixities)
 738        ,("declarations_",       ITdeclarations)
 739        ,("pragmas_",            ITpragmas)
 740        ,("forall_",             ITforall)
 741        ,("U_",                  ITunfold False)
 742        ,("U!_",                 ITunfold True)
 743        ,("A_",                  ITarity)
 744        ,("coerce_in_",          ITcoerce_in)
 745        ,("coerce_out_",         ITcoerce_out)
 746        ,("bot_",                ITbottom)
 747        ,("integer_",            ITinteger_lit)
 748        ,("rational_",           ITrational_lit)
 749        ,("addr_",               ITaddr_lit)
 750        ,("float_",              ITfloat_lit)
 751        ,("string_",             ITstring_lit)
 752        ,("litlit_",             ITlit_lit)
 753        ,("ccall_",              ITccall (False, False))
 754        ,("ccall_GC_",           ITccall (False, True))
 755        ,("casm_",               ITccall (True,  False))
 756        ,("casm_GC_",            ITccall (True,  True))
 757        ]
 758
 759 haskellKeywordsFM = listToUFM $
 760       map (\ (x,y) -> (_PK_ x,y))
 761       [ ("data",                ITdata)
 762        ,("type",                ITtype)
 763        ,("newtype",             ITnewtype)
 764        ,("class",               ITclass)
 765        ,("where",               ITwhere)
 766        ,("instance",            ITinstance)
 767        ,("infixl",              ITinfixl)
 768        ,("infixr",              ITinfixr)
 769        ,("infix",               ITinfix)
 770        ,("case",                ITcase)
 771        ,("case#",               ITprim_case)
 772        ,("of",                  ITof)
 773        ,("in",                  ITin)
 774        ,("let",                 ITlet)
 775        ,("deriving",            ITderiving)
 776
 777        ,("->",                  ITrarrow)
 778        ,("\\",                  ITlam)
 779        ,("|",                   ITvbar)
 780        ,("!",                   ITbang)
 781        ,("=>",                  ITdarrow)
 782        ,("=",                   ITequal)
 783        ,("::",                  ITdcolon)
 784        ]
 785
 786
 787 -- doDiscard rips along really fast, looking for a double semicolon,
 788 -- indicating the end of the pragma we're skipping
 789 doDiscard inStr buf =
 790 -- _trace (show (C# (currentChar# buf))) $
 791  case currentChar# buf of
 792    ';'# ->
 793      if not inStr then
 794        case lookAhead# buf 1# of
 795         ';'# -> incLexeme (incLexeme buf)
 796         _    -> doDiscard inStr (incLexeme buf)
 797      else
 798        doDiscard inStr (incLexeme buf)
 799    '"'# ->
 800        let
 801         odd_slashes buf flg i# =
 802           case lookAhead# buf i# of
 803            '\\'# -> odd_slashes buf (not flg) (i# -# 1#)
 804            _     -> flg
 805        in
 806        case lookAhead# buf (negateInt# 1#) of --backwards, actually
 807          '\\'# -> -- escaping something..
 808            if odd_slashes buf True (negateInt# 2#) then
 809                -- odd number of slashes, " is escaped.
 810               doDiscard inStr (incLexeme buf)
 811            else
 812                -- even number of slashes, \ is escaped.
 813               doDiscard (not inStr) (incLexeme buf)
 814          _ -> case inStr of -- forced to avoid build-up
 815                True  -> doDiscard False (incLexeme buf)
 816                False -> doDiscard True  (incLexeme buf)
 817    _ -> doDiscard inStr (incLexeme buf)
 818
 819 \end{code}
 820
 821 begin{code}
 822 my_span :: (a -> Bool) -> [a] -> ([a],[a])
 823 my_span p xs = go [] xs
 824   where
 825     go so_far (x:xs') | p x = go (x:so_far) xs'
 826     go so_far xs            = (reverse so_far, xs)
 827
 828 my_span' :: (a -> Bool) -> [a] -> ([a],Int,[a])
 829 my_span' p xs = go [] 0 xs
 830   where
 831     go so_far n (x:xs') | p x = go (x:so_far) (n+1) xs'
 832     go so_far n xs            = (reverse so_far,n, xs)
 833 end{code}
 834
 835
 836 %************************************************************************
 837 %*                                                                      *
 838 \subsection{Other utility functions
 839 %*                                                                      *
 840 %************************************************************************
 841
 842 \begin{code}
 843 type IfM a = StringBuffer       -- Input string
 844           -> SrcLoc
 845           -> MaybeErr a ErrMsg
 846
 847 returnIf   :: a -> IfM a
 848 returnIf a s l = Succeeded a
 849
 850 thenIf     :: IfM a -> (a -> IfM b) -> IfM b
 851 m `thenIf` k = \s l ->
 852         case m s l of
 853                 Succeeded a -> k a s l
 854                 Failed err  -> Failed err
 855
 856 getSrcLocIf :: IfM SrcLoc
 857 getSrcLocIf s l = Succeeded l
 858
 859 happyError :: IfM a
 860 happyError s l = Failed (ifaceParseErr l ([]::[IfaceToken]){-Todo-})
 861
 862 -----------------------------------------------------------------
 863
 864 ifaceParseErr l toks
 865   = hsep [ppr l, ptext SLIT("Interface-file parse error;"),
 866           ptext SLIT("toks="), text (show (take 10 toks))]
 867 \end{code}