ghc/compiler/reader/Lex.lhs

   1 %
   2 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1996
   3 %
   4 \section[Lexical analysis]{Lexical analysis}
   5
   6 \begin{code}
   7 #include "HsVersions.h"
   8
   9 module Lex (
  10
  11         isLexCon, isLexVar, isLexId, isLexSym,
  12         isLexConId, isLexConSym, isLexVarId, isLexVarSym,
  13         mkTupNameStr, ifaceParseErr,
  14
  15         -- Monad for parser
  16         IfaceToken(..), lexIface, SYN_IE(IfM), thenIf, returnIf, happyError,
  17         StringBuffer
  18
  19     ) where
  20
  21
  22 IMPORT_1_3(Char(isDigit, isAlpha, isAlphanum, isUpper,isLower, isSpace, ord))
  23 IMPORT_DELOOPER(Ubiq)
  24
  25 #if defined(__GLASGOW_HASKELL__) && __GLASGOW_HASKELL__ <= 201
  26 IMPORT_DELOOPER(IdLoop)    -- get the CostCentre type&constructors from here
  27 #else
  28 import {-# SOURCE #-} CostCentre
  29 #endif
  30
  31 import CmdLineOpts      ( opt_IgnoreIfacePragmas )
  32 import Demand           ( Demand(..) {- instance Read -} )
  33 import UniqFM           ( UniqFM, listToUFM, lookupUFM)
  34 import BasicTypes       ( NewOrData(..) )
  35
  36 #if __GLASGOW_HASKELL__ >= 202
  37 import Maybes           ( MaybeErr(..) )
  38 #else
  39 import Maybes           ( Maybe(..), MaybeErr(..) )
  40 #endif
  41 import Pretty
  42
  43
  44
  45 import ErrUtils         ( Error(..) )
  46 import Outputable       ( Outputable(..), PprStyle(..) )
  47 import Util             ( nOfThem, panic )
  48
  49 import FastString
  50 import StringBuffer
  51
  52 #if __GLASGOW_HASKELL__ <= 201
  53 import PreludeGlaST
  54 #else
  55 import GlaExts
  56 #endif
  57 \end{code}
  58
  59 %************************************************************************
  60 %*                                                                      *
  61 \subsection{Lexical categories}
  62 %*                                                                      *
  63 %************************************************************************
  64
  65 These functions test strings to see if they fit the lexical categories
  66 defined in the Haskell report.  Normally applied as in e.g. @isCon
  67 (getLocalName foo)@.
  68
  69 \begin{code}
  70 isLexCon, isLexVar, isLexId, isLexSym, isLexConId, isLexConSym,
  71  isLexVarId, isLexVarSym  :: FAST_STRING -> Bool
  72
  73 isLexCon cs = isLexConId  cs || isLexConSym cs
  74 isLexVar cs = isLexVarId  cs || isLexVarSym cs
  75
  76 isLexId  cs = isLexConId  cs || isLexVarId  cs
  77 isLexSym cs = isLexConSym cs || isLexVarSym cs
  78
  79 -------------
  80
  81 isLexConId cs
  82   | _NULL_ cs        = False
  83   | cs == SLIT("[]") = True
  84   | c  == '('        = True     -- (), (,), (,,), ...
  85   | otherwise        = isUpper c || isUpperISO c
  86   where
  87     c = _HEAD_ cs
  88
  89 isLexVarId cs
  90   | _NULL_ cs    = False
  91   | otherwise    = isLower c || isLowerISO c
  92   where
  93     c = _HEAD_ cs
  94
  95 isLexConSym cs
  96   | _NULL_ cs   = False
  97   | otherwise   = c  == ':'
  98                || cs == SLIT("->")
  99   where
 100     c = _HEAD_ cs
 101
 102 isLexVarSym cs
 103   | _NULL_ cs = False
 104   | otherwise = isSymbolASCII c
 105              || isSymbolISO c
 106   where
 107     c = _HEAD_ cs
 108
 109 -------------
 110 isSymbolASCII c = c `elem` "!#$%&*+./<=>?@\\^|~-"
 111 isSymbolISO   c = ord c `elem` (0xd7 : 0xf7 : [0xa1 .. 0xbf])
 112 isUpperISO    (C# c#) = c# `geChar#` '\xc0'# && c# `leChar#` '\xde'# && c# `neChar#` '\xd7'#
 113 --0xc0 <= oc && oc <= 0xde && oc /= 0xd7 where oc = ord c
 114 isLowerISO    (C# c#) = c# `geChar#` '\xdf'# && c# `leChar#` '\xff'# && c# `neChar#` '\xf7'#
 115 --0xdf <= oc && oc <= 0xff && oc /= 0xf7 where oc = ord c
 116 \end{code}
 117
 118
 119 %************************************************************************
 120 %*                                                                      *
 121 \subsection{Tuple strings -- ugh!}
 122 %*                                                                      *
 123 %************************************************************************
 124
 125 \begin{code}
 126 mkTupNameStr 0 = SLIT("()")
 127 mkTupNameStr 1 = panic "Name.mkTupNameStr: 1 ???"
 128 mkTupNameStr 2 = _PK_ "(,)"   -- not strictly necessary
 129 mkTupNameStr 3 = _PK_ "(,,)"  -- ditto
 130 mkTupNameStr 4 = _PK_ "(,,,)" -- ditto
 131 mkTupNameStr n = _PK_ ("(" ++ nOfThem (n-1) ',' ++ ")")
 132 \end{code}
 133
 134
 135
 136 %************************************************************************
 137 %*                                                                      *
 138 \subsection{Data types}
 139 %*                                                                      *
 140 %************************************************************************
 141
 142 The token data type, fairly un-interesting except from two constructors,
 143 @ITidinfo@ and @ITtype@, which are used to lazily lex id info (arity,
 144 strictness, unfolding etc) and types for id decls.
 145
 146 The Idea/Observation here is that the renamer needs to scan through
 147 all of an interface file before it can continue. But only a fraction
 148 of the information contained in the file turns out to be useful, so
 149 delaying as much as possible of the scanning and parsing of an
 150 interface file Makes Sense (Heap profiles of the compiler
 151 show at a reduction in heap usage by at least a factor of two,
 152 post-renamer).
 153
 154 Hence, the interface file lexer spots when value declarations are
 155 being scanned and return the @ITidinfo@ and @ITtype@ constructors
 156 for the type and any other id info for that binding (unfolding, strictness
 157 etc). These constructors are applied to the result of lexing these sub-chunks.
 158
 159 The lexing of the type and id info is all done lazily, of course, so
 160 the scanning (and subsequent parsing) will be done *only* on the ids the
 161 renamer finds out that it is interested in. The rest will just be junked.
 162 Laziness, you know it makes sense :-)
 163
 164 \begin{code}
 165 data IfaceToken
 166   = ITinterface         -- keywords
 167   | ITusages
 168   | ITversions
 169   | ITexports
 170   | ITinstance_modules
 171   | ITinstances
 172   | ITfixities
 173   | ITdeclarations
 174   | ITpragmas
 175   | ITdata
 176   | ITtype
 177   | ITnewtype
 178   | ITderiving
 179   | ITclass
 180   | ITwhere
 181   | ITinstance
 182   | ITinfixl
 183   | ITinfixr
 184   | ITinfix
 185   | ITforall
 186   | ITbang              -- magic symbols
 187   | ITvbar
 188   | ITdcolon
 189   | ITcomma
 190   | ITdarrow
 191   | ITdotdot
 192   | ITequal
 193   | ITocurly
 194   | ITobrack
 195   | IToparen
 196   | ITrarrow
 197   | ITccurly
 198   | ITcbrack
 199   | ITcparen
 200   | ITsemi
 201   | ITvarid   FAST_STRING
 202   | ITconid   FAST_STRING
 203   | ITvarsym  FAST_STRING
 204   | ITconsym  FAST_STRING
 205   | ITqvarid  (FAST_STRING,FAST_STRING)
 206   | ITqconid  (FAST_STRING,FAST_STRING)
 207   | ITqvarsym (FAST_STRING,FAST_STRING)
 208   | ITqconsym (FAST_STRING,FAST_STRING)
 209
 210   | ITidinfo [IfaceToken]  -- lazily return the stream of tokens for
 211                            -- the info attached to an id.
 212   | ITtysig [IfaceToken]   -- lazily return the stream of tokens for
 213                            -- the info attached to an id.
 214         -- Stuff for reading unfoldings
 215   | ITarity | ITstrict
 216   | ITunfold Bool               -- True <=> there's an INLINE pragma on this Id
 217   | ITdemand [Demand] | ITbottom
 218   | ITlam | ITbiglam | ITcase | ITprim_case | ITlet | ITletrec | ITin | ITof
 219   | ITcoerce_in | ITcoerce_out | ITatsign
 220   | ITccall (Bool,Bool)         -- (is_casm, may_gc)
 221   | ITscc CostCentre
 222   | ITchar Char | ITstring FAST_STRING
 223   | ITinteger Integer | ITdouble Double
 224   | ITinteger_lit | ITfloat_lit | ITrational_lit | ITaddr_lit | ITlit_lit | ITstring_lit
 225   | ITunknown String            -- Used when the lexer can't make sense of it
 226   deriving Text -- debugging
 227
 228 instance Text CostCentre -- cheat!
 229
 230 \end{code}
 231
 232 %************************************************************************
 233 %*                                                                      *
 234 \subsection{The lexical analyser}
 235 %*                                                                      *
 236 %************************************************************************
 237
 238 \begin{code}
 239 lexIface :: StringBuffer -> [IfaceToken]
 240 lexIface buf =
 241  _scc_ "Lexer"
 242 -- if bufferExhausted buf then
 243 --  []
 244 -- else
 245 --  _trace ("Lexer: "++[C# (currentChar# buf)]) $
 246   case currentChar# buf of
 247       -- whitespace and comments, ignore.
 248     ' '#  -> lexIface (stepOn buf)
 249     '\t'# -> lexIface (stepOn buf)
 250     '\n'# -> lexIface (stepOn buf)
 251
 252 -- Numbers and comments
 253     '-'#  ->
 254       case lookAhead# buf 1# of
 255         '-'# -> lex_comment (stepOnBy# buf 2#)
 256         c    ->
 257           if isDigit (C# c)
 258           then lex_num (negate) (ord# c -# ord# '0'#) (incLexeme (incLexeme buf))
 259           else lex_id buf
 260
 261 -- Leave out nested comments for now; danger of finding { and - juxtaposed by mistake?
 262 --    '{' : '-' : cs -> lex_nested_comment 1{-one seen-} cs
 263
 264     '('# ->
 265          case prefixMatch (stepOn buf) "..)" of
 266            Just buf' ->  ITdotdot : lexIface (stepOverLexeme buf')
 267            Nothing ->
 268             case lookAhead# buf 1# of
 269               ','# -> lex_tuple Nothing  (stepOnBy# buf 2#)
 270               ')'# -> ITconid SLIT("()") : lexIface (stepOnBy# buf 2#)
 271               _    -> IToparen : lexIface (stepOn buf)
 272
 273     '{'# -> ITocurly : lexIface (stepOn buf)
 274     '}'# -> ITccurly : lexIface (stepOn buf)
 275     ')'# -> ITcparen : lexIface (stepOn buf)
 276     '['# ->
 277       case lookAhead# buf 1# of
 278         ']'# -> ITconid SLIT("[]") : lexIface (stepOnBy# buf 2#)
 279         _    -> ITobrack : lexIface (stepOn buf)
 280     ']'# -> ITcbrack     : lexIface (stepOn buf)
 281     ','# -> ITcomma      : lexIface (stepOn buf)
 282     ':'# -> case lookAhead# buf 1# of
 283               ':'# -> ITdcolon  : lexIface (stepOnBy# buf 2#)
 284               _    -> lex_id (incLexeme buf)
 285     ';'#  -> ITsemi     : lexIface (stepOn buf)
 286     '\"'# -> case untilEndOfString# (stepOn buf) of
 287               buf' ->
 288                   -- the string literal does *not* include the dquotes
 289                 case lexemeToFastString buf' of
 290                  v -> ITstring v : lexIface (stepOn (stepOverLexeme buf'))
 291
 292     '\''# -> --
 293              -- untilEndOfChar# extends the current lexeme until
 294              -- it hits a non-escaped single quote. The lexeme of the
 295              -- StringBuffer returned does *not* include the closing quote,
 296              -- hence we augment the lexeme and make sure to add the
 297              -- starting quote, before `read'ing the string.
 298              --
 299              case untilEndOfChar# (stepOn buf) of
 300                buf' -> case reads ('\'':lexemeToString (incLexeme buf')) of
 301                         [  (ch, rest)] -> ITchar ch : lexIface (stepOverLexeme (incLexeme buf'))
 302
 303 -- ``thingy'' form for casm
 304     '`'# ->
 305             case lookAhead# buf 1# of
 306               '`'# -> lex_cstring (stepOnBy# buf 2#) -- remove the `` and go.
 307               _    -> lex_id (incLexeme buf)         -- add ` to lexeme and assume
 308                                                      -- scanning an id of some sort.
 309 -- Keywords
 310     '_'# ->
 311          case lookAhead# buf 1# of
 312            'S'# -> case lookAhead# buf 2# of
 313                     '_'# -> ITstrict :
 314                             lex_demand (stepOnUntil (not . isSpace)
 315                                                     (stepOnBy# buf 3#)) -- past _S_
 316            's'# -> case prefixMatch (stepOnBy# buf 2#) "cc_" of
 317                      Just buf' -> lex_scc (stepOnUntil (not . isSpace) (stepOverLexeme buf'))
 318                      Nothing   -> lex_keyword (stepOnBy# buf 1#) -- drop the '_' and assume
 319                                                                  -- it is a keyword.
 320            _    -> lex_keyword (stepOn buf)
 321
 322     '\NUL'# ->
 323             if bufferExhausted (stepOn buf) then
 324                []
 325             else
 326                lex_id buf
 327     c ->
 328         if isDigit (C# c) then
 329            lex_num (id) (ord# c -# ord# '0'#) (incLexeme buf)
 330         else
 331            lex_id buf
 332 --  where
 333 lex_comment buf =
 334 --   _trace ("comment: "++[C# (currentChar# buf)]) $
 335    case untilChar# buf '\n'# of {buf' -> lexIface (stepOverLexeme buf')}
 336
 337 ------------------
 338 lex_demand buf =
 339 -- _trace ("demand: "++[C# (currentChar# buf)]) $
 340  case read_em [] buf of { (ls,buf') -> ITdemand ls : lexIface (stepOverLexeme buf')}
 341  where
 342    -- code snatched from Demand.lhs
 343   read_em acc buf =
 344 --   _trace ("read_em: "++[C# (currentChar# buf)]) $
 345    case currentChar# buf of
 346     'L'# -> read_em (WwLazy False : acc) (stepOn buf)
 347     'A'# -> read_em (WwLazy True  : acc) (stepOn buf)
 348     'S'# -> read_em (WwStrict     : acc) (stepOn buf)
 349     'P'# -> read_em (WwPrim       : acc) (stepOn buf)
 350     'E'# -> read_em (WwEnum       : acc) (stepOn buf)
 351     ')'# -> (reverse acc, stepOn buf)
 352     'U'# -> do_unpack DataType True  acc (stepOnBy# buf 2#)
 353     'u'# -> do_unpack DataType False acc (stepOnBy# buf 2#)
 354     'N'# -> do_unpack NewType True  acc (stepOnBy# buf 2#)
 355     'n'# -> do_unpack NewType False acc (stepOnBy# buf 2#)
 356     _    -> (reverse acc, buf)
 357
 358   do_unpack new_or_data wrapper_unpacks acc buf
 359    = case read_em [] buf of
 360       (stuff, rest) -> read_em (WwUnpack new_or_data wrapper_unpacks stuff : acc) rest
 361
 362 ------------------
 363 lex_scc buf =
 364 -- _trace ("scc: "++[C# (currentChar# buf)]) $
 365  case currentChar# buf of
 366   '"'# ->
 367       -- YUCK^2
 368      case prefixMatch (stepOn buf) "NO_CC\"" of
 369       Just buf' -> ITscc noCostCentre : lexIface (stepOverLexeme buf')
 370       Nothing ->
 371        case prefixMatch (stepOn buf) "CURRENT_CC\"" of
 372         Just buf' -> ITscc useCurrentCostCentre : lexIface (stepOverLexeme buf')
 373         Nothing   ->
 374          case prefixMatch (stepOn buf) "OVERHEAD\"" of
 375          Just buf' -> ITscc overheadCostCentre : lexIface (stepOverLexeme buf')
 376          Nothing   ->
 377           case prefixMatch (stepOn buf) "DONT_CARE\"" of
 378            Just buf' -> ITscc dontCareCostCentre : lexIface (stepOverLexeme buf')
 379            Nothing   ->
 380             case prefixMatch (stepOn buf) "SUBSUMED\"" of
 381              Just buf' -> ITscc subsumedCosts : lexIface (stepOverLexeme buf')
 382              Nothing ->
 383               case prefixMatch (stepOn buf) "CAFs_in_...\"" of
 384                Just buf' -> ITscc preludeCafsCostCentre : lexIface (stepOverLexeme buf')
 385                Nothing ->
 386                 case prefixMatch (stepOn buf) "CC_CAFs_in_..." of
 387                  Just buf' ->
 388                   case untilChar# (stepOverLexeme buf') '\"'# of
 389                    buf'' -> ITscc (mkAllCafsCC ({-module-}lexemeToFastString buf'') _NIL_):
 390                             lexIface (stepOn (stepOverLexeme buf''))
 391                  Nothing ->
 392                   case prefixMatch (stepOn buf) "DICTs_in_...\"" of
 393                    Just buf' -> ITscc (preludeDictsCostCentre True) : lexIface (stepOverLexeme buf')
 394                    Nothing ->
 395                     case prefixMatch (stepOn buf) "CC_DICTs_in_..." of
 396                      Just buf' ->
 397                       case untilChar# (stepOverLexeme buf') '\"'# of
 398                        buf'' -> ITscc (mkAllDictsCC (lexemeToFastString buf'') _NIL_ True):
 399                                 lexIface (stepOn (stepOverLexeme buf''))
 400                      Nothing ->
 401                       let
 402                        match_user_cc buf =
 403                         case untilChar# buf '/'# of
 404                          buf' ->
 405                           let mod_name = lexemeToFastString buf' in
 406 --                        case untilChar# (stepOn (stepOverLexeme buf')) '/'# of
 407 --                         buf'' ->
 408 --                            let grp_name = lexemeToFastString buf'' in
 409                             case untilChar# (stepOn (stepOverLexeme buf')) '\"'# of
 410                              buf'' ->
 411                                let cc_name = lexemeToFastString buf'' in
 412                                (mkUserCC cc_name mod_name _NIL_{-grp_name-},
 413                                 stepOn (stepOverLexeme buf''))
 414                       in
 415                       case prefixMatch (stepOn buf) "CAF:" of
 416                        Just buf' ->
 417                          case match_user_cc (stepOverLexeme buf') of
 418                           (cc, buf'') -> ITscc (cafifyCC cc) : lexIface buf''
 419                        Nothing ->
 420                          case match_user_cc (stepOn buf) of
 421                           (cc, buf'') -> ITscc cc : lexIface buf''
 422   c -> ITunknown [C# c] : lexIface (stepOn buf)
 423
 424
 425 -----------
 426 lex_num :: (Int -> Int) -> Int# -> StringBuffer -> [IfaceToken]
 427 lex_num minus acc# buf =
 428 -- _trace ("lex_num: "++[C# (currentChar# buf)]) $
 429  case scanNumLit (I# acc#) buf of
 430      (acc',buf') ->
 431        case currentChar# buf' of
 432          '.'# ->
 433              -- this case is not optimised at all, as the
 434              -- presence of floating point numbers in interface
 435              -- files is not that common. (ToDo)
 436             case expandWhile (isDigit) (incLexeme buf') of
 437               buf'' -> -- points to first non digit char
 438                 case reads (lexemeToString buf'') of
 439                   [(v,_)] -> ITdouble v : lexIface (stepOverLexeme buf'')
 440          _ -> ITinteger (fromInt (minus acc')) : lexIface (stepOverLexeme buf')
 441
 442 --         case reads (lexemeToString buf') of
 443 --           [(i,_)] -> ITinteger i : lexIface (stepOverLexeme buf')
 444
 445 ------------
 446 lex_keyword buf =
 447 -- _trace ("lex_keyword: "++[C# (currentChar# buf)]) $
 448  case currentChar# buf of
 449   ':'# -> case lookAhead# buf 1# of
 450             '_'# -> -- a binding, type (and other id-info) follows,
 451                     -- to make the parser ever so slightly, we push
 452                     --
 453                 lex_decl (stepOnBy# buf 2#)
 454             v# -> ITunknown (['_',':',C# v#]) : lexIface (stepOnBy# buf 2#)
 455   _ ->
 456     case expandWhile (is_kwd_char) buf of
 457      buf' ->
 458       let kw = lexemeToFastString buf' in
 459 --    _trace ("kw: "++lexemeToString buf') $
 460       case lookupUFM ifaceKeywordsFM kw of
 461        Nothing -> ITunknown (_UNPK_ kw) : -- (minor) sigh
 462                   lexIface (stepOverLexeme buf')
 463        Just xx -> xx : lexIface (stepOverLexeme buf')
 464
 465 lex_decl buf =
 466  case doDiscard False buf of -- spin until ;; is found
 467    buf' ->
 468       {- _trace (show (lexemeToString buf')) $ -}
 469       case currentChar# buf' of
 470        '\n'# -> -- newline, no id info.
 471            ITtysig (lexIface (lexemeToBuffer (decLexeme buf'))) :
 472            lexIface (stepOverLexeme buf')
 473        '\r'# -> -- just to be sure for those Win* boxes..
 474            ITtysig (lexIface (lexemeToBuffer (decLexeme buf'))) :
 475            lexIface (stepOverLexeme buf')
 476        '\NUL'# ->
 477            ITtysig (lexIface (lexemeToBuffer (decLexeme buf'))) :
 478            lexIface (stepOverLexeme buf')
 479        c     -> -- run all over the id info
 480          case doDiscard False (stepOverLexeme buf') of -- spin until ;; is found (outside a string!)
 481            buf'' ->
 482                     --_trace ((C# c):show (lexemeToString (decLexeme buf')))  $
 483                     --_trace (show (lexemeToString (decLexeme buf''))) $
 484                     ITtysig (lexIface (lexemeToBuffer (decLexeme buf'))):
 485                     let ls = lexIface (stepOverLexeme buf'') in
 486                     if opt_IgnoreIfacePragmas then
 487                         ls
 488                     else
 489                         let is = lexIface (lexemeToBuffer (decLexeme buf'')) in
 490                         --_trace (show is) $
 491                         ITidinfo is : ls
 492
 493 -- ToDo: hammer!
 494 is_kwd_char c@(C# c#) =
 495  isAlphanum c || -- OLD: c `elem` "_@/\\"
 496  (case c# of
 497    '_'#  -> True
 498    '@'#  -> True
 499    '/'#  -> True
 500    '\\'# -> True
 501    _     -> False)
 502
 503
 504
 505 -----------
 506 lex_cstring buf =
 507 -- _trace ("lex_cstring: "++[C# (currentChar# buf)]) $
 508  case expandUntilMatch buf "\'\'" of
 509    buf' -> ITstring (lexemeToFastString (setCurrentPos# buf' (negateInt# 2#))) :
 510            lexIface (stepOverLexeme buf')
 511
 512 -----------
 513 lex_tuple module_dot buf =
 514 -- _trace ("lex_tuple: "++[C# (currentChar# buf)]) $
 515   go 2 buf
 516   where
 517    go n buf =
 518     case currentChar# buf of
 519       ','# -> go (n+1) (stepOn buf)
 520       ')'# -> end_lex_id module_dot (ITconid (mkTupNameStr n)) (stepOn buf)
 521       _    -> ITunknown ("tuple " ++ show n) : lexIface buf
 522
 523 -- Similarly ' itself is ok inside an identifier, but not at the start
 524
 525 id_arr :: _ByteArray Int
 526 id_arr =
 527  unsafePerformPrimIO (
 528   newCharArray (0,255) `thenPrimIO` \ barr ->
 529   let
 530    loop 256# = returnPrimIO ()
 531    loop i# =
 532     if isAlphanum (C# (chr# i#)) || is_sym (chr# i#) then
 533        writeCharArray barr (I# i#) '\1' `seqPrimIO`
 534        loop (i# +# 1#)
 535     else
 536        writeCharArray barr (I# i#) '\0' `seqPrimIO`
 537        loop (i# +# 1#)
 538   in
 539   loop 0#                    `seqPrimIO`
 540   unsafeFreezeByteArray barr)
 541
 542 is_id_char (C# c#) =
 543  let
 544   _ByteArray _ arr# = id_arr
 545  in
 546  case ord# (indexCharArray# arr# (ord# c#)) of
 547   0# -> False
 548   1# -> True
 549
 550 --is_id_char c@(C# c#)  = isAlphanum c || is_sym c#
 551
 552 is_sym c#=
 553  case c# of {
 554    ':'# -> True; '_'#  -> True; '\''# -> True; '!'# -> True;
 555    '#'# -> True; '$'#  -> True; ':'#  -> True; '%'# -> True;
 556    '&'# -> True; '*'#  -> True; '+'#  -> True; '.'# -> True;
 557    '/'# -> True; '<'#  -> True; '='#  -> True; '>'# -> True;
 558    '?'# -> True; '\\'# -> True; '^'#  -> True; '|'# -> True;
 559    '-'# -> True; '~'#  -> True; '@'#  -> True; _    -> False }
 560
 561 --isAlphanum c || c `elem` ":_'!#$%&*+./<=>?@\\^|-~" -- ToDo: add ISOgraphic
 562
 563
 564 mod_arr :: _ByteArray Int
 565 mod_arr =
 566  unsafePerformPrimIO (
 567   newCharArray (0,255) `thenPrimIO` \ barr ->
 568   let
 569    loop 256# = returnPrimIO ()
 570    loop i# =
 571     if isAlphanum (C# (chr# i#)) || i# ==# (ord# '_'#) || i# ==# (ord# '\''#) then
 572        writeCharArray barr (I# i#) '\1' `seqPrimIO`
 573        loop (i# +# 1#)
 574     else
 575        writeCharArray barr (I# i#) '\0' `seqPrimIO`
 576        loop (i# +# 1#)
 577   in
 578   loop 0#                    `seqPrimIO`
 579   unsafeFreezeByteArray barr)
 580
 581
 582 is_mod_char (C# c#) =
 583  let
 584   _ByteArray _ arr# = mod_arr
 585  in
 586  case ord# (indexCharArray# arr# (ord# c#)) of
 587   0# -> False
 588   1# -> True
 589
 590 --isAlphanum c || c == '_' || c== '\'' --`elem` "_'"
 591
 592 {-
 593 lex_id cs =
 594  case _scc_ "lex_id.span" my_span' (is_mod_char) cs of
 595    (xs, len, cs') ->
 596     case cs' of
 597      [] -> case xs of
 598             [] -> lex_id2 Nothing cs
 599             _  -> lex_id3 Nothing len xs cs
 600
 601      '.':cs'' ->
 602         case xs of
 603           [] -> lex_id2 Nothing cs
 604           _  ->
 605            let
 606             pk_str = _PK_ (xs::String)
 607             len = lengthPS pk_str
 608            in
 609            if len==len+1 then
 610               error "Well, I never!"
 611            else
 612               lex_id2 (Just pk_str) cs''
 613      _ -> case xs of
 614             [] -> lex_id2 Nothing cs
 615             _  -> lex_id3 Nothing len xs cs'
 616
 617 -}
 618
 619 lex_id buf =
 620 -- _trace ("lex_id: "++[C# (currentChar# buf)]) $
 621  case expandWhile (is_mod_char) buf of
 622    buf' ->
 623     case currentChar# buf' of
 624      '.'# ->
 625         if not (emptyLexeme buf') then
 626 --         _trace ("lex_id: "++(C# (currentChar# (stepOverLexeme buf'))):show (lexemeToFastString buf')) $
 627            case lexemeToFastString buf' of
 628              l@(FastString u# l# ba#) -> lex_id2 (Just (FastString u# l# ba#))
 629                                                  (stepOn (stepOverLexeme buf'))
 630         else
 631            lex_id2 Nothing buf'
 632      _  -> lex_id2 Nothing buf'
 633
 634 -- Dealt with the Module.part
 635 lex_id2 module_dot buf =
 636 -- _trace ("lex_id2: "++[C# (currentChar# buf)]) $
 637  case currentChar# buf of
 638   '['# ->
 639     case lookAhead# buf 1# of
 640      ']'# -> end_lex_id module_dot (ITconid SLIT("[]")) (stepOnBy# buf 2#)
 641      _    -> lex_id3 module_dot buf
 642   '('# ->
 643     case lookAhead# buf 1# of
 644      ')'# -> end_lex_id module_dot (ITconid SLIT("()")) (stepOnBy# buf 2#)
 645      ','# -> lex_tuple module_dot (stepOnBy# buf 2#)
 646      _    -> lex_id3 module_dot buf
 647   ':'# -> lex_id3 module_dot (incLexeme buf)
 648   _    -> lex_id3 module_dot buf
 649
 650
 651
 652 -- Dealt with [], (), : special cases
 653
 654 lex_id3 module_dot buf =
 655 -- _trace ("lex_id3: "++[C# (currentChar# buf)]) $
 656  case expandWhile (is_id_char) buf of
 657   buf' ->
 658     case module_dot of
 659      Just _ ->
 660        end_lex_id module_dot (mk_var_token lexeme) (stepOverLexeme buf')
 661      Nothing ->
 662        case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of
 663          Just kwd_token -> kwd_token           : lexIface new_buf
 664          Nothing        -> mk_var_token lexeme : lexIface new_buf
 665     where
 666      lexeme  = lexemeToFastString buf'
 667      new_buf = stepOverLexeme buf'
 668
 669
 670 {- OLD:
 671 lex_id2 module_dot [] ('[': ']': cs) = end_lex_id module_dot (ITconid SLIT("[]")) cs
 672 lex_id2 module_dot [] ('(': ')': cs) = end_lex_id module_dot (ITconid SLIT("()")) cs
 673 lex_id2 module_dot [] ('(': ',': cs) = lex_tuple module_dot cs
 674 lex_id2 module_dot [] (':' : cs)     = lex_id3 module_dot [':'] cs
 675 lex_id2 module_dot xs cs             = lex_id3 module_dot xs cs
 676 -}
 677
 678
 679 -- Dealt with [], (), : special cases
 680
 681 {-
 682 lex_id3 module_dot len_xs xs cs =
 683  case my_span' (is_id_char) cs of
 684    (xs1,len_xs1,rest) ->
 685     case module_dot of
 686      Just m  -> end_lex_id (Just m) (mk_var_token rxs) rest --OLD:_PK_ (reverse xs))) rest
 687      Nothing ->
 688       case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM rxs of
 689        Just kwd_token -> kwd_token          : lexIface rest
 690        other          -> token : lexIface cs end_lex_id Nothing (mk_var_token rxs) rest
 691     where
 692      rxs = packNChars (len_xs+len_xs1) (xs++xs1) -- OLD: _PK_ (reverse xs)
 693 -}
 694 mk_var_token pk_str =
 695      let
 696       f = _HEAD_ pk_str
 697      in
 698      --
 699      -- These tests assume a non- ISO-8859-1 impl of isUpper&isLower,
 700      -- remove the second half of disjunction when using a 1.3 prelude.
 701      --
 702      if      isUpper f    then ITconid pk_str
 703      else if isLower f    then ITvarid pk_str
 704      else if f == ':'     then ITconsym pk_str
 705      else if isLowerISO f then ITvarid pk_str
 706      else if isUpperISO f then ITconid pk_str
 707      else ITvarsym pk_str
 708
 709 {-
 710     mk_var_token xs@(f:_) | isUpper f || isUpperISO f = ITconid n
 711                           | f == ':'              = ITconsym n
 712                           | isAlpha f             = ITvarid n
 713                           | otherwise             = ITvarsym n
 714                 where
 715                       n = _PK_ xs
 716 -}
 717
 718 end_lex_id Nothing token buf  = token : lexIface buf
 719 end_lex_id (Just m) token buf =
 720  case token of
 721    ITconid n  -> ITqconid  (m,n)         : lexIface buf
 722    ITvarid n  -> ITqvarid  (m,n)         : lexIface buf
 723    ITconsym n -> ITqconsym (m,n)         : lexIface buf
 724    ITvarsym n -> ITqvarsym (m,n)         : lexIface buf
 725    ITbang     -> ITqvarsym (m,SLIT("!")) : lexIface buf
 726    _          -> ITunknown (show token)  : lexIface buf
 727
 728 ------------
 729 ifaceKeywordsFM :: UniqFM IfaceToken
 730 ifaceKeywordsFM = listToUFM $
 731       map (\ (x,y) -> (_PK_ x,y))
 732        [("/\\_",                ITbiglam)
 733        ,("@_",                  ITatsign)
 734        ,("letrec_",             ITletrec)
 735        ,("interface_",          ITinterface)
 736        ,("usages_",             ITusages)
 737        ,("versions_",           ITversions)
 738        ,("exports_",            ITexports)
 739        ,("instance_modules_",   ITinstance_modules)
 740        ,("instances_",          ITinstances)
 741        ,("fixities_",           ITfixities)
 742        ,("declarations_",       ITdeclarations)
 743        ,("pragmas_",            ITpragmas)
 744        ,("forall_",             ITforall)
 745        ,("U_",                  ITunfold False)
 746        ,("U!_",                 ITunfold True)
 747        ,("A_",                  ITarity)
 748        ,("coerce_in_",          ITcoerce_in)
 749        ,("coerce_out_",         ITcoerce_out)
 750        ,("bot_",                ITbottom)
 751        ,("integer_",            ITinteger_lit)
 752        ,("rational_",           ITrational_lit)
 753        ,("addr_",               ITaddr_lit)
 754        ,("float_",              ITfloat_lit)
 755        ,("string_",             ITstring_lit)
 756        ,("litlit_",             ITlit_lit)
 757        ,("ccall_",              ITccall (False, False))
 758        ,("ccall_GC_",           ITccall (False, True))
 759        ,("casm_",               ITccall (True,  False))
 760        ,("casm_GC_",            ITccall (True,  True))
 761        ]
 762
 763 haskellKeywordsFM = listToUFM $
 764       map (\ (x,y) -> (_PK_ x,y))
 765       [ ("data",                ITdata)
 766        ,("type",                ITtype)
 767        ,("newtype",             ITnewtype)
 768        ,("class",               ITclass)
 769        ,("where",               ITwhere)
 770        ,("instance",            ITinstance)
 771        ,("infixl",              ITinfixl)
 772        ,("infixr",              ITinfixr)
 773        ,("infix",               ITinfix)
 774        ,("case",                ITcase)
 775        ,("case#",               ITprim_case)
 776        ,("of",                  ITof)
 777        ,("in",                  ITin)
 778        ,("let",                 ITlet)
 779        ,("deriving",            ITderiving)
 780
 781        ,("->",                  ITrarrow)
 782        ,("\\",                  ITlam)
 783        ,("|",                   ITvbar)
 784        ,("!",                   ITbang)
 785        ,("=>",                  ITdarrow)
 786        ,("=",                   ITequal)
 787        ]
 788
 789
 790 -- doDiscard rips along really fast looking for a double semicolon,
 791 -- indicating the end of the pragma we're skipping
 792 doDiscard inStr buf =
 793 -- _trace (show (C# (currentChar# buf))) $
 794  case currentChar# buf of
 795    ';'# ->
 796      if not inStr then
 797        case lookAhead# buf 1# of
 798         ';'# -> incLexeme (incLexeme buf)
 799         _    -> doDiscard inStr (incLexeme buf)
 800      else
 801        doDiscard inStr (incLexeme buf)
 802    '"'# ->
 803        let
 804         odd_slashes buf flg i# =
 805           case lookAhead# buf i# of
 806            '\\'# -> odd_slashes buf (not flg) (i# -# 1#)
 807            _     -> flg
 808        in
 809        case lookAhead# buf (negateInt# 1#) of --backwards, actually
 810          '\\'# -> -- escaping something..
 811            if odd_slashes buf True (negateInt# 2#) then
 812                -- odd number of slashes, " is escaped.
 813               doDiscard inStr (incLexeme buf)
 814            else
 815                -- even number of slashes, \ is escaped.
 816               doDiscard (not inStr) (incLexeme buf)
 817          _ -> case inStr of -- forced to avoid build-up
 818                True  -> doDiscard False (incLexeme buf)
 819                False -> doDiscard True  (incLexeme buf)
 820    _ -> doDiscard inStr (incLexeme buf)
 821
 822 \end{code}
 823
 824 begin{code}
 825 my_span :: (a -> Bool) -> [a] -> ([a],[a])
 826 my_span p xs = go [] xs
 827   where
 828     go so_far (x:xs') | p x = go (x:so_far) xs'
 829     go so_far xs            = (reverse so_far, xs)
 830
 831 my_span' :: (a -> Bool) -> [a] -> ([a],Int,[a])
 832 my_span' p xs = go [] 0 xs
 833   where
 834     go so_far n (x:xs') | p x = go (x:so_far) (n+1) xs'
 835     go so_far n xs            = (reverse so_far,n, xs)
 836 end{code}
 837
 838
 839 %************************************************************************
 840 %*                                                                      *
 841 \subsection{Other utility functions
 842 %*                                                                      *
 843 %************************************************************************
 844
 845 \begin{code}
 846 type IfM a = MaybeErr a Error
 847
 848 returnIf   :: a -> IfM a
 849 thenIf     :: IfM a -> (a -> IfM b) -> IfM b
 850 happyError :: Int -> [IfaceToken] -> IfM a
 851
 852 returnIf a = Succeeded a
 853
 854 thenIf (Succeeded a) k = k a
 855 thenIf (Failed  err) _ = Failed err
 856
 857 happyError ln toks = Failed (ifaceParseErr ln toks)
 858
 859 -----------------------------------------------------------------
 860
 861 ifaceParseErr ln toks sty
 862   = hsep [ptext SLIT("Interface-file parse error: line"), int ln, ptext SLIT("toks="), text (show (take 10 toks))]
 863 \end{code}