ghc/compiler/reader/Lex.lhs

   1 %
   2 % (c) The GRASP/AQUA Project, Glasgow University, 1992-1996
   3 %
   4 \section[Lexical analysis]{Lexical analysis}
   5
   6 \begin{code}
   7 #include "HsVersions.h"
   8
   9 module Lex (
  10
  11         isLexCon, isLexVar, isLexId, isLexSym,
  12         isLexConId, isLexConSym, isLexVarId, isLexVarSym,
  13         mkTupNameStr, ifaceParseErr,
  14
  15         -- Monad for parser
  16         IfaceToken(..), lexIface, SYN_IE(IfM), thenIf, returnIf, happyError,
  17         StringBuffer
  18
  19     ) where
  20
  21
  22 IMPORT_1_3(Char(isDigit, isAlpha, isAlphanum, isUpper,isLower, isSpace, ord))
  23 IMPORT_DELOOPER(Ubiq)
  24 IMPORT_DELOOPER(IdLoop)    -- get the CostCentre type&constructors from here
  25
  26 import CmdLineOpts      ( opt_IgnoreIfacePragmas )
  27 import Demand           ( Demand(..) {- instance Read -} )
  28 import UniqFM           ( UniqFM, listToUFM, lookupUFM)
  29 --import FiniteMap      ( FiniteMap, listToFM, lookupFM )
  30 #if __GLASGOW_HASKELL__ >= 202
  31 import Maybes           ( MaybeErr(..) )
  32 #else
  33 import Maybes           ( Maybe(..), MaybeErr(..) )
  34 #endif
  35 import Pretty
  36
  37
  38
  39 import ErrUtils         ( Error(..) )
  40 import Outputable       ( Outputable(..) )
  41 import PprStyle         ( PprStyle(..) )
  42 import Util             ( nOfThem, panic )
  43
  44 import FastString
  45 import StringBuffer
  46
  47 #if __GLASGOW_HASKELL__ <= 201
  48 import PreludeGlaST
  49 #else
  50 import GlaExts
  51 #endif
  52 \end{code}
  53
  54 %************************************************************************
  55 %*                                                                      *
  56 \subsection{Lexical categories}
  57 %*                                                                      *
  58 %************************************************************************
  59
  60 These functions test strings to see if they fit the lexical categories
  61 defined in the Haskell report.  Normally applied as in e.g. @isCon
  62 (getLocalName foo)@.
  63
  64 \begin{code}
  65 isLexCon, isLexVar, isLexId, isLexSym, isLexConId, isLexConSym,
  66  isLexVarId, isLexVarSym  :: FAST_STRING -> Bool
  67
  68 isLexCon cs = isLexConId  cs || isLexConSym cs
  69 isLexVar cs = isLexVarId  cs || isLexVarSym cs
  70
  71 isLexId  cs = isLexConId  cs || isLexVarId  cs
  72 isLexSym cs = isLexConSym cs || isLexVarSym cs
  73
  74 -------------
  75
  76 isLexConId cs
  77   | _NULL_ cs        = False
  78   | cs == SLIT("[]") = True
  79   | c  == '('        = True     -- (), (,), (,,), ...
  80   | otherwise        = isUpper c || isUpperISO c
  81   where
  82     c = _HEAD_ cs
  83
  84 isLexVarId cs
  85   | _NULL_ cs    = False
  86   | otherwise    = isLower c || isLowerISO c
  87   where
  88     c = _HEAD_ cs
  89
  90 isLexConSym cs
  91   | _NULL_ cs   = False
  92   | otherwise   = c  == ':'
  93                || cs == SLIT("->")
  94   where
  95     c = _HEAD_ cs
  96
  97 isLexVarSym cs
  98   | _NULL_ cs = False
  99   | otherwise = isSymbolASCII c
 100              || isSymbolISO c
 101   where
 102     c = _HEAD_ cs
 103
 104 -------------
 105 isSymbolASCII c = c `elem` "!#$%&*+./<=>?@\\^|~-"
 106 isSymbolISO   c = ord c `elem` (0xd7 : 0xf7 : [0xa1 .. 0xbf])
 107 isUpperISO    (C# c#) = c# `geChar#` '\xc0'# && c# `leChar#` '\xde'# && c# `neChar#` '\xd7'#
 108 --0xc0 <= oc && oc <= 0xde && oc /= 0xd7 where oc = ord c
 109 isLowerISO    (C# c#) = c# `geChar#` '\xdf'# && c# `leChar#` '\xff'# && c# `neChar#` '\xf7'#
 110 --0xdf <= oc && oc <= 0xff && oc /= 0xf7 where oc = ord c
 111 \end{code}
 112
 113
 114 %************************************************************************
 115 %*                                                                      *
 116 \subsection{Tuple strings -- ugh!}
 117 %*                                                                      *
 118 %************************************************************************
 119
 120 \begin{code}
 121 mkTupNameStr 0 = SLIT("()")
 122 mkTupNameStr 1 = panic "Name.mkTupNameStr: 1 ???"
 123 mkTupNameStr 2 = _PK_ "(,)"   -- not strictly necessary
 124 mkTupNameStr 3 = _PK_ "(,,)"  -- ditto
 125 mkTupNameStr 4 = _PK_ "(,,,)" -- ditto
 126 mkTupNameStr n = _PK_ ("(" ++ nOfThem (n-1) ',' ++ ")")
 127 \end{code}
 128
 129
 130
 131 %************************************************************************
 132 %*                                                                      *
 133 \subsection{Data types}
 134 %*                                                                      *
 135 %************************************************************************
 136
 137 The token data type, fairly un-interesting except from two constructors,
 138 @ITidinfo@ and @ITtype@, which are used to lazily lex id info (arity,
 139 strictness, unfolding etc) and types for id decls.
 140
 141 The Idea/Observation here is that the renamer needs to scan through
 142 all of an interface file before it can continue. But only a fraction
 143 of the information contained in the file turns out to be useful, so
 144 delaying as much as possible of the scanning and parsing of an
 145 interface file Makes Sense (Heap profiles of the compiler
 146 show at a reduction in heap usage by at least a factor of two,
 147 post-renamer).
 148
 149 Hence, the interface file lexer spots when value declarations are
 150 being scanned and return the @ITidinfo@ and @ITtype@ constructors
 151 for the type and any other id info for that binding (unfolding, strictness
 152 etc). These constructors are applied to the result of lexing these sub-chunks.
 153
 154 The lexing of the type and id info is all done lazily, of course, so
 155 the scanning (and subsequent parsing) will be done *only* on the ids the
 156 renamer finds out that it is interested in. The rest will just be junked.
 157 Laziness, you know it makes sense :-)
 158
 159 \begin{code}
 160 data IfaceToken
 161   = ITinterface         -- keywords
 162   | ITusages
 163   | ITversions
 164   | ITexports
 165   | ITinstance_modules
 166   | ITinstances
 167   | ITfixities
 168   | ITdeclarations
 169   | ITpragmas
 170   | ITdata
 171   | ITtype
 172   | ITnewtype
 173   | ITderiving
 174   | ITclass
 175   | ITwhere
 176   | ITinstance
 177   | ITinfixl
 178   | ITinfixr
 179   | ITinfix
 180   | ITforall
 181   | ITbang              -- magic symbols
 182   | ITvbar
 183   | ITdcolon
 184   | ITcomma
 185   | ITdarrow
 186   | ITdotdot
 187   | ITequal
 188   | ITocurly
 189   | ITobrack
 190   | IToparen
 191   | ITrarrow
 192   | ITccurly
 193   | ITcbrack
 194   | ITcparen
 195   | ITsemi
 196   | ITvarid   FAST_STRING
 197   | ITconid   FAST_STRING
 198   | ITvarsym  FAST_STRING
 199   | ITconsym  FAST_STRING
 200   | ITqvarid  (FAST_STRING,FAST_STRING)
 201   | ITqconid  (FAST_STRING,FAST_STRING)
 202   | ITqvarsym (FAST_STRING,FAST_STRING)
 203   | ITqconsym (FAST_STRING,FAST_STRING)
 204
 205   | ITidinfo [IfaceToken]  -- lazily return the stream of tokens for
 206                            -- the info attached to an id.
 207   | ITtysig [IfaceToken]   -- lazily return the stream of tokens for
 208                            -- the info attached to an id.
 209         -- Stuff for reading unfoldings
 210   | ITarity | ITstrict | ITunfold
 211   | ITdemand [Demand] | ITbottom
 212   | ITlam | ITbiglam | ITcase | ITprim_case | ITlet | ITletrec | ITin | ITof
 213   | ITcoerce_in | ITcoerce_out | ITatsign
 214   | ITccall (Bool,Bool)         -- (is_casm, may_gc)
 215   | ITscc CostCentre
 216   | ITchar Char | ITstring FAST_STRING
 217   | ITinteger Integer | ITdouble Double
 218   | ITinteger_lit | ITfloat_lit | ITrational_lit | ITaddr_lit | ITlit_lit | ITstring_lit
 219   | ITunknown String            -- Used when the lexer can't make sense of it
 220   deriving Text -- debugging
 221
 222 instance Text CostCentre -- cheat!
 223
 224 \end{code}
 225
 226 %************************************************************************
 227 %*                                                                      *
 228 \subsection{The lexical analyser}
 229 %*                                                                      *
 230 %************************************************************************
 231
 232 \begin{code}
 233 lexIface :: StringBuffer -> [IfaceToken]
 234 lexIface buf =
 235  _scc_ "Lexer"
 236 -- if bufferExhausted buf then
 237 --  []
 238 -- else
 239 --  _trace ("Lexer: "++[C# (currentChar# buf)]) $
 240   case currentChar# buf of
 241       -- whitespace and comments, ignore.
 242     ' '#  -> lexIface (stepOn buf)
 243     '\t'# -> lexIface (stepOn buf)
 244     '\n'# -> lexIface (stepOn buf)
 245
 246 -- Numbers and comments
 247     '-'#  ->
 248       case lookAhead# buf 1# of
 249         '-'# -> lex_comment (stepOnBy# buf 2#)
 250         c    ->
 251           if isDigit (C# c)
 252           then lex_num (negate) (ord# c -# ord# '0'#) (incLexeme (incLexeme buf))
 253           else lex_id buf
 254
 255 -- Leave out nested comments for now; danger of finding { and - juxtaposed by mistake?
 256 --    '{' : '-' : cs -> lex_nested_comment 1{-one seen-} cs
 257
 258     '('# ->
 259          case prefixMatch (stepOn buf) "..)" of
 260            Just buf' ->  ITdotdot : lexIface (stepOverLexeme buf')
 261            Nothing ->
 262             case lookAhead# buf 1# of
 263               ','# -> lex_tuple Nothing  (stepOnBy# buf 2#)
 264               ')'# -> ITconid SLIT("()") : lexIface (stepOnBy# buf 2#)
 265               _    -> IToparen : lexIface (stepOn buf)
 266
 267     '{'# -> ITocurly : lexIface (stepOn buf)
 268     '}'# -> ITccurly : lexIface (stepOn buf)
 269     ')'# -> ITcparen : lexIface (stepOn buf)
 270     '['# ->
 271       case lookAhead# buf 1# of
 272         ']'# -> ITconid SLIT("[]") : lexIface (stepOnBy# buf 2#)
 273         _    -> ITobrack : lexIface (stepOn buf)
 274     ']'# -> ITcbrack     : lexIface (stepOn buf)
 275     ','# -> ITcomma      : lexIface (stepOn buf)
 276     ':'# -> case lookAhead# buf 1# of
 277               ':'# -> ITdcolon  : lexIface (stepOnBy# buf 2#)
 278               _    -> lex_id (incLexeme buf)
 279     ';'#  -> ITsemi     : lexIface (stepOn buf)
 280     '\"'# -> case untilEndOfString# (stepOn buf) of
 281               buf' ->
 282                   -- the string literal does *not* include the dquotes
 283                 case lexemeToFastString buf' of
 284                  v -> ITstring v : lexIface (stepOn (stepOverLexeme buf'))
 285
 286     '\''# -> --
 287              -- untilEndOfChar# extends the current lexeme until
 288              -- it hits a non-escaped single quote. The lexeme of the
 289              -- StringBuffer returned does *not* include the closing quote,
 290              -- hence we augment the lexeme and make sure to add the
 291              -- starting quote, before `read'ing the string.
 292              --
 293              case untilEndOfChar# (stepOn buf) of
 294                buf' -> case reads ('\'':lexemeToString (incLexeme buf')) of
 295                         [  (ch, rest)] -> ITchar ch : lexIface (stepOverLexeme (incLexeme buf'))
 296
 297 -- ``thingy'' form for casm
 298     '`'# ->
 299             case lookAhead# buf 1# of
 300               '`'# -> lex_cstring (stepOnBy# buf 2#) -- remove the `` and go.
 301               _    -> lex_id (incLexeme buf)         -- add ` to lexeme and assume
 302                                                      -- scanning an id of some sort.
 303 -- Keywords
 304     '_'# ->
 305          case lookAhead# buf 1# of
 306            'S'# -> case lookAhead# buf 2# of
 307                     '_'# -> ITstrict :
 308                             lex_demand (stepOnUntil (not . isSpace)
 309                                                     (stepOnBy# buf 3#)) -- past _S_
 310            's'# -> case prefixMatch (stepOnBy# buf 2#) "cc_" of
 311                      Just buf' -> lex_scc (stepOnUntil (not . isSpace) (stepOverLexeme buf'))
 312                      Nothing   -> lex_keyword (stepOnBy# buf 1#) -- drop the '_' and assume
 313                                                                  -- it is a keyword.
 314            _    -> lex_keyword (stepOn buf)
 315
 316     '\NUL'# ->
 317             if bufferExhausted (stepOn buf) then
 318                []
 319             else
 320                lex_id buf
 321     c ->
 322         if isDigit (C# c) then
 323            lex_num (id) (ord# c -# ord# '0'#) (incLexeme buf)
 324         else
 325            lex_id buf
 326 --  where
 327 lex_comment buf =
 328 --   _trace ("comment: "++[C# (currentChar# buf)]) $
 329    case untilChar# buf '\n'# of {buf' -> lexIface (stepOverLexeme buf')}
 330
 331 ------------------
 332 lex_demand buf =
 333 -- _trace ("demand: "++[C# (currentChar# buf)]) $
 334  case read_em [] buf of { (ls,buf') -> ITdemand ls : lexIface (stepOverLexeme buf')}
 335  where
 336    -- code snatched from Demand.lhs
 337   read_em acc buf =
 338 --   _trace ("read_em: "++[C# (currentChar# buf)]) $
 339    case currentChar# buf of
 340     'L'# -> read_em (WwLazy False : acc) (stepOn buf)
 341     'A'# -> read_em (WwLazy True  : acc) (stepOn buf)
 342     'S'# -> read_em (WwStrict     : acc) (stepOn buf)
 343     'P'# -> read_em (WwPrim       : acc) (stepOn buf)
 344     'E'# -> read_em (WwEnum       : acc) (stepOn buf)
 345     ')'# -> (reverse acc, stepOn buf)
 346     'U'# -> do_unpack True  acc (stepOnBy# buf 2#)
 347     'u'# -> do_unpack False acc (stepOnBy# buf 2#)
 348     _    -> (reverse acc, buf)
 349
 350   do_unpack wrapper_unpacks acc buf
 351    = case read_em [] buf of
 352       (stuff, rest) -> read_em (WwUnpack wrapper_unpacks stuff : acc) rest
 353
 354 ------------------
 355 lex_scc buf =
 356 -- _trace ("scc: "++[C# (currentChar# buf)]) $
 357  case currentChar# buf of
 358   '"'# ->
 359       -- YUCK^2
 360      case prefixMatch (stepOn buf) "NO_CC\"" of
 361       Just buf' -> ITscc noCostCentre : lexIface (stepOverLexeme buf')
 362       Nothing ->
 363        case prefixMatch (stepOn buf) "CURRENT_CC\"" of
 364         Just buf' -> ITscc useCurrentCostCentre : lexIface (stepOverLexeme buf')
 365         Nothing   ->
 366          case prefixMatch (stepOn buf) "OVERHEAD\"" of
 367          Just buf' -> ITscc overheadCostCentre : lexIface (stepOverLexeme buf')
 368          Nothing   ->
 369           case prefixMatch (stepOn buf) "DONT_CARE\"" of
 370            Just buf' -> ITscc dontCareCostCentre : lexIface (stepOverLexeme buf')
 371            Nothing   ->
 372             case prefixMatch (stepOn buf) "SUBSUMED\"" of
 373              Just buf' -> ITscc subsumedCosts : lexIface (stepOverLexeme buf')
 374              Nothing ->
 375               case prefixMatch (stepOn buf) "CAFs_in_...\"" of
 376                Just buf' -> ITscc preludeCafsCostCentre : lexIface (stepOverLexeme buf')
 377                Nothing ->
 378                 case prefixMatch (stepOn buf) "CC_CAFs_in_..." of
 379                  Just buf' ->
 380                   case untilChar# (stepOverLexeme buf') '\"'# of
 381                    buf'' -> ITscc (mkAllCafsCC ({-module-}lexemeToFastString buf'') _NIL_):
 382                             lexIface (stepOn (stepOverLexeme buf''))
 383                  Nothing ->
 384                   case prefixMatch (stepOn buf) "DICTs_in_...\"" of
 385                    Just buf' -> ITscc (preludeDictsCostCentre True) : lexIface (stepOverLexeme buf')
 386                    Nothing ->
 387                     case prefixMatch (stepOn buf) "CC_DICTs_in_..." of
 388                      Just buf' ->
 389                       case untilChar# (stepOverLexeme buf') '\"'# of
 390                        buf'' -> ITscc (mkAllDictsCC (lexemeToFastString buf'') _NIL_ True):
 391                                 lexIface (stepOn (stepOverLexeme buf''))
 392                      Nothing ->
 393                       case prefixMatch (stepOn buf) "CAF:" of
 394                        Just buf' ->
 395                         case untilChar# (stepOverLexeme buf') '\"'# of
 396                          buf'' -> ITscc (cafifyCC (mkUserCC (lexemeToFastString buf'') _NIL_ _NIL_)):
 397                                   lexIface (stepOn (stepOverLexeme buf''))
 398                        Nothing ->
 399                         case untilChar# (stepOn buf) '\"'# of
 400                            buf' -> ITscc (mkUserCC (lexemeToFastString buf') _NIL_ _NIL_):
 401                                    lexIface (stepOn (stepOverLexeme buf'))
 402   c -> ITunknown [C# c] : lexIface (stepOn buf)
 403
 404
 405 -----------
 406 lex_num :: (Int -> Int) -> Int# -> StringBuffer -> [IfaceToken]
 407 lex_num minus acc# buf =
 408 -- _trace ("lex_num: "++[C# (currentChar# buf)]) $
 409  case scanNumLit (I# acc#) buf of
 410      (acc',buf') ->
 411        case currentChar# buf' of
 412          '.'# ->
 413              -- this case is not optimised at all, as the
 414              -- presence of floating point numbers in interface
 415              -- files is not that common. (ToDo)
 416             case expandWhile (isDigit) (incLexeme buf') of
 417               buf'' -> -- points to first non digit char
 418                 case reads (lexemeToString buf'') of
 419                   [(v,_)] -> ITdouble v : lexIface (stepOverLexeme buf'')
 420          _ -> ITinteger (fromInt (minus acc')) : lexIface (stepOverLexeme buf')
 421
 422 --         case reads (lexemeToString buf') of
 423 --           [(i,_)] -> ITinteger i : lexIface (stepOverLexeme buf')
 424
 425 ------------
 426 lex_keyword buf =
 427 -- _trace ("lex_keyword: "++[C# (currentChar# buf)]) $
 428  case currentChar# buf of
 429   ':'# -> case lookAhead# buf 1# of
 430             '_'# -> -- a binding, type (and other id-info) follows,
 431                     -- to make the parser ever so slightly, we push
 432                     --
 433                 lex_decl (stepOnBy# buf 2#)
 434             v# -> ITunknown (['_',':',C# v#]) : lexIface (stepOnBy# buf 2#)
 435   _ ->
 436     case expandWhile (is_kwd_char) buf of
 437      buf' ->
 438       let kw = lexemeToFastString buf' in
 439 --    _trace ("kw: "++lexemeToString buf') $
 440       case lookupUFM ifaceKeywordsFM kw of
 441        Nothing -> ITunknown (_UNPK_ kw) : -- (minor) sigh
 442                   lexIface (stepOverLexeme buf')
 443        Just xx -> xx : lexIface (stepOverLexeme buf')
 444
 445 lex_decl buf =
 446  case doDiscard False buf of -- spin until ;; is found
 447    buf' ->
 448       {- _trace (show (lexemeToString buf')) $ -}
 449       case currentChar# buf' of
 450        '\n'# -> -- newline, no id info.
 451            ITtysig (lexIface (lexemeToBuffer (decLexeme buf'))) :
 452            lexIface (stepOverLexeme buf')
 453        '\r'# -> -- just to be sure for those Win* boxes..
 454            ITtysig (lexIface (lexemeToBuffer (decLexeme buf'))) :
 455            lexIface (stepOverLexeme buf')
 456        '\NUL'# ->
 457            ITtysig (lexIface (lexemeToBuffer (decLexeme buf'))) :
 458            lexIface (stepOverLexeme buf')
 459        c     -> -- run all over the id info
 460          case doDiscard False (stepOverLexeme buf') of -- spin until ;; is found (outside a string!)
 461            buf'' ->
 462                     --_trace ((C# c):show (lexemeToString (decLexeme buf')))  $
 463                     --_trace (show (lexemeToString (decLexeme buf''))) $
 464                     ITtysig (lexIface (lexemeToBuffer (decLexeme buf'))):
 465                     let ls = lexIface (stepOverLexeme buf'') in
 466                     if opt_IgnoreIfacePragmas then
 467                         ls
 468                     else
 469                         let is = lexIface (lexemeToBuffer (decLexeme buf'')) in
 470                         --_trace (show is) $
 471                         ITidinfo is : ls
 472
 473 -- ToDo: hammer!
 474 is_kwd_char c@(C# c#) =
 475  isAlphanum c || -- OLD: c `elem` "_@/\\"
 476  (case c# of
 477    '_'#  -> True
 478    '@'#  -> True
 479    '/'#  -> True
 480    '\\'# -> True
 481    _     -> False)
 482
 483
 484
 485 -----------
 486 lex_cstring buf =
 487 -- _trace ("lex_cstring: "++[C# (currentChar# buf)]) $
 488  case expandUntilMatch buf "\'\'" of
 489    buf' -> ITstring (lexemeToFastString (setCurrentPos# buf' (negateInt# 2#))) :
 490            lexIface (stepOverLexeme buf')
 491
 492 -----------
 493 lex_tuple module_dot buf =
 494 -- _trace ("lex_tuple: "++[C# (currentChar# buf)]) $
 495   go 2 buf
 496   where
 497    go n buf =
 498     case currentChar# buf of
 499       ','# -> go (n+1) (stepOn buf)
 500       ')'# -> end_lex_id module_dot (ITconid (mkTupNameStr n)) (stepOn buf)
 501       _    -> ITunknown ("tuple " ++ show n) : lexIface buf
 502
 503 -- Similarly ' itself is ok inside an identifier, but not at the start
 504
 505 id_arr :: _ByteArray Int
 506 id_arr =
 507  unsafePerformPrimIO (
 508   newCharArray (0,255) `thenPrimIO` \ barr ->
 509   let
 510    loop 256# = returnPrimIO ()
 511    loop i# =
 512     if isAlphanum (C# (chr# i#)) || is_sym (chr# i#) then
 513        writeCharArray barr (I# i#) '\1' `seqPrimIO`
 514        loop (i# +# 1#)
 515     else
 516        writeCharArray barr (I# i#) '\0' `seqPrimIO`
 517        loop (i# +# 1#)
 518   in
 519   loop 0#                    `seqPrimIO`
 520   unsafeFreezeByteArray barr)
 521
 522 is_id_char (C# c#) =
 523  let
 524   _ByteArray _ arr# = id_arr
 525  in
 526  case ord# (indexCharArray# arr# (ord# c#)) of
 527   0# -> False
 528   1# -> True
 529
 530 --is_id_char c@(C# c#)  = isAlphanum c || is_sym c#
 531
 532 is_sym c#=
 533  case c# of {
 534    ':'# -> True; '_'#  -> True; '\''# -> True; '!'# -> True;
 535    '#'# -> True; '$'#  -> True; ':'#  -> True; '%'# -> True;
 536    '&'# -> True; '*'#  -> True; '+'#  -> True; '.'# -> True;
 537    '/'# -> True; '<'#  -> True; '='#  -> True; '>'# -> True;
 538    '?'# -> True; '\\'# -> True; '^'#  -> True; '|'# -> True;
 539    '-'# -> True; '~'#  -> True; '@'#  -> True; _    -> False }
 540
 541 --isAlphanum c || c `elem` ":_'!#$%&*+./<=>?@\\^|-~" -- ToDo: add ISOgraphic
 542
 543
 544 mod_arr :: _ByteArray Int
 545 mod_arr =
 546  unsafePerformPrimIO (
 547   newCharArray (0,255) `thenPrimIO` \ barr ->
 548   let
 549    loop 256# = returnPrimIO ()
 550    loop i# =
 551     if isAlphanum (C# (chr# i#)) || i# ==# (ord# '_'#) || i# ==# (ord# '\''#) then
 552        writeCharArray barr (I# i#) '\1' `seqPrimIO`
 553        loop (i# +# 1#)
 554     else
 555        writeCharArray barr (I# i#) '\0' `seqPrimIO`
 556        loop (i# +# 1#)
 557   in
 558   loop 0#                    `seqPrimIO`
 559   unsafeFreezeByteArray barr)
 560
 561
 562 is_mod_char (C# c#) =
 563  let
 564   _ByteArray _ arr# = mod_arr
 565  in
 566  case ord# (indexCharArray# arr# (ord# c#)) of
 567   0# -> False
 568   1# -> True
 569
 570 --isAlphanum c || c == '_' || c== '\'' --`elem` "_'"
 571
 572 {-
 573 lex_id cs =
 574  case _scc_ "lex_id.span" my_span' (is_mod_char) cs of
 575    (xs, len, cs') ->
 576     case cs' of
 577      [] -> case xs of
 578             [] -> lex_id2 Nothing cs
 579             _  -> lex_id3 Nothing len xs cs
 580
 581      '.':cs'' ->
 582         case xs of
 583           [] -> lex_id2 Nothing cs
 584           _  ->
 585            let
 586             pk_str = _PK_ (xs::String)
 587             len = lengthPS pk_str
 588            in
 589            if len==len+1 then
 590               error "Well, I never!"
 591            else
 592               lex_id2 (Just pk_str) cs''
 593      _ -> case xs of
 594             [] -> lex_id2 Nothing cs
 595             _  -> lex_id3 Nothing len xs cs'
 596
 597 -}
 598
 599 lex_id buf =
 600 -- _trace ("lex_id: "++[C# (currentChar# buf)]) $
 601  case expandWhile (is_mod_char) buf of
 602    buf' ->
 603     case currentChar# buf' of
 604      '.'# ->
 605         if not (emptyLexeme buf') then
 606 --         _trace ("lex_id: "++(C# (currentChar# (stepOverLexeme buf'))):show (lexemeToFastString buf')) $
 607            case lexemeToFastString buf' of
 608              l@(FastString u# l# ba#) -> lex_id2 (Just (FastString u# l# ba#))
 609                                                  (stepOn (stepOverLexeme buf'))
 610         else
 611            lex_id2 Nothing buf'
 612      _  -> lex_id2 Nothing buf'
 613
 614 -- Dealt with the Module.part
 615 lex_id2 module_dot buf =
 616 -- _trace ("lex_id2: "++[C# (currentChar# buf)]) $
 617  case currentChar# buf of
 618   '['# ->
 619     case lookAhead# buf 1# of
 620      ']'# -> end_lex_id module_dot (ITconid SLIT("[]")) (stepOnBy# buf 2#)
 621      _    -> lex_id3 module_dot buf
 622   '('# ->
 623     case lookAhead# buf 1# of
 624      ')'# -> end_lex_id module_dot (ITconid SLIT("()")) (stepOnBy# buf 2#)
 625      ','# -> lex_tuple module_dot (stepOnBy# buf 2#)
 626      _    -> lex_id3 module_dot buf
 627   ':'# -> lex_id3 module_dot (incLexeme buf)
 628   _    -> lex_id3 module_dot buf
 629
 630
 631
 632 -- Dealt with [], (), : special cases
 633
 634 lex_id3 module_dot buf =
 635 -- _trace ("lex_id3: "++[C# (currentChar# buf)]) $
 636  case expandWhile (is_id_char) buf of
 637   buf' ->
 638     case module_dot of
 639      Just _ ->
 640        end_lex_id module_dot (mk_var_token lexeme) (stepOverLexeme buf')
 641      Nothing ->
 642        case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of
 643          Just kwd_token -> kwd_token           : lexIface new_buf
 644          Nothing        -> mk_var_token lexeme : lexIface new_buf
 645     where
 646      lexeme  = lexemeToFastString buf'
 647      new_buf = stepOverLexeme buf'
 648
 649
 650 {- OLD:
 651 lex_id2 module_dot [] ('[': ']': cs) = end_lex_id module_dot (ITconid SLIT("[]")) cs
 652 lex_id2 module_dot [] ('(': ')': cs) = end_lex_id module_dot (ITconid SLIT("()")) cs
 653 lex_id2 module_dot [] ('(': ',': cs) = lex_tuple module_dot cs
 654 lex_id2 module_dot [] (':' : cs)     = lex_id3 module_dot [':'] cs
 655 lex_id2 module_dot xs cs             = lex_id3 module_dot xs cs
 656 -}
 657
 658
 659 -- Dealt with [], (), : special cases
 660
 661 {-
 662 lex_id3 module_dot len_xs xs cs =
 663  case my_span' (is_id_char) cs of
 664    (xs1,len_xs1,rest) ->
 665     case module_dot of
 666      Just m  -> end_lex_id (Just m) (mk_var_token rxs) rest --OLD:_PK_ (reverse xs))) rest
 667      Nothing ->
 668       case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM rxs of
 669        Just kwd_token -> kwd_token          : lexIface rest
 670        other          -> token : lexIface cs end_lex_id Nothing (mk_var_token rxs) rest
 671     where
 672      rxs = packNChars (len_xs+len_xs1) (xs++xs1) -- OLD: _PK_ (reverse xs)
 673 -}
 674 mk_var_token pk_str =
 675      let
 676       f = _HEAD_ pk_str
 677      in
 678      --
 679      -- These tests assume a non- ISO-8859-1 impl of isUpper&isLower,
 680      -- remove the second half of disjunction when using a 1.3 prelude.
 681      --
 682      if      isUpper f    then ITconid pk_str
 683      else if isLower f    then ITvarid pk_str
 684      else if f == ':'     then ITconsym pk_str
 685      else if isLowerISO f then ITvarid pk_str
 686      else if isUpperISO f then ITconid pk_str
 687      else ITvarsym pk_str
 688
 689 {-
 690     mk_var_token xs@(f:_) | isUpper f || isUpperISO f = ITconid n
 691                           | f == ':'              = ITconsym n
 692                           | isAlpha f             = ITvarid n
 693                           | otherwise             = ITvarsym n
 694                 where
 695                       n = _PK_ xs
 696 -}
 697
 698 end_lex_id Nothing token buf  = token : lexIface buf
 699 end_lex_id (Just m) token buf =
 700  case token of
 701    ITconid n  -> ITqconid  (m,n)         : lexIface buf
 702    ITvarid n  -> ITqvarid  (m,n)         : lexIface buf
 703    ITconsym n -> ITqconsym (m,n)         : lexIface buf
 704    ITvarsym n -> ITqvarsym (m,n)         : lexIface buf
 705    ITbang     -> ITqvarsym (m,SLIT("!")) : lexIface buf
 706    _          -> ITunknown (show token)  : lexIface buf
 707
 708 ------------
 709 ifaceKeywordsFM :: UniqFM IfaceToken
 710 ifaceKeywordsFM = listToUFM $
 711       map (\ (x,y) -> (_PK_ x,y))
 712        [("/\\_",                ITbiglam)
 713        ,("@_",                  ITatsign)
 714        ,("letrec_",             ITletrec)
 715        ,("interface_",          ITinterface)
 716        ,("usages_",             ITusages)
 717        ,("versions_",           ITversions)
 718        ,("exports_",            ITexports)
 719        ,("instance_modules_",   ITinstance_modules)
 720        ,("instances_",          ITinstances)
 721        ,("fixities_",           ITfixities)
 722        ,("declarations_",       ITdeclarations)
 723        ,("pragmas_",            ITpragmas)
 724        ,("forall_",             ITforall)
 725        ,("U_",                  ITunfold)
 726        ,("A_",                  ITarity)
 727        ,("coerce_in_",          ITcoerce_in)
 728        ,("coerce_out_",         ITcoerce_out)
 729        ,("bot_",                ITbottom)
 730        ,("integer_",            ITinteger_lit)
 731        ,("rational_",           ITrational_lit)
 732        ,("addr_",               ITaddr_lit)
 733        ,("float_",              ITfloat_lit)
 734        ,("string_",             ITstring_lit)
 735        ,("litlit_",             ITlit_lit)
 736        ,("ccall_",              ITccall (False, False))
 737        ,("ccall_GC_",           ITccall (False, True))
 738        ,("casm_",               ITccall (True,  False))
 739        ,("casm_GC_",            ITccall (True,  True))
 740        ]
 741
 742 haskellKeywordsFM = listToUFM $
 743       map (\ (x,y) -> (_PK_ x,y))
 744       [ ("data",                ITdata)
 745        ,("type",                ITtype)
 746        ,("newtype",             ITnewtype)
 747        ,("class",               ITclass)
 748        ,("where",               ITwhere)
 749        ,("instance",            ITinstance)
 750        ,("infixl",              ITinfixl)
 751        ,("infixr",              ITinfixr)
 752        ,("infix",               ITinfix)
 753        ,("case",                ITcase)
 754        ,("case#",               ITprim_case)
 755        ,("of",                  ITof)
 756        ,("in",                  ITin)
 757        ,("let",                 ITlet)
 758        ,("deriving",            ITderiving)
 759
 760        ,("->",                  ITrarrow)
 761        ,("\\",                  ITlam)
 762        ,("|",                   ITvbar)
 763        ,("!",                   ITbang)
 764        ,("=>",                  ITdarrow)
 765        ,("=",                   ITequal)
 766        ]
 767
 768
 769 -- doDiscard rips along really fast looking for a double semicolon,
 770 -- indicating the end of the pragma we're skipping
 771 doDiscard inStr buf =
 772 -- _trace (show (C# (currentChar# buf))) $
 773  case currentChar# buf of
 774    ';'# ->
 775      if not inStr then
 776        case lookAhead# buf 1# of
 777         ';'# -> incLexeme (incLexeme buf)
 778         _    -> doDiscard inStr (incLexeme buf)
 779      else
 780        doDiscard inStr (incLexeme buf)
 781    '"'# ->
 782        let
 783         odd_slashes buf flg i# =
 784           case lookAhead# buf i# of
 785            '\\'# -> odd_slashes buf (not flg) (i# -# 1#)
 786            _     -> flg
 787        in
 788        case lookAhead# buf (negateInt# 1#) of --backwards, actually
 789          '\\'# -> -- escaping something..
 790            if odd_slashes buf True (negateInt# 2#) then
 791                -- odd number of slashes, " is escaped.
 792               doDiscard inStr (incLexeme buf)
 793            else
 794                -- even number of slashes, \ is escaped.
 795               doDiscard (not inStr) (incLexeme buf)
 796          _ -> case inStr of -- forced to avoid build-up
 797                True  -> doDiscard False (incLexeme buf)
 798                False -> doDiscard True  (incLexeme buf)
 799    _ -> doDiscard inStr (incLexeme buf)
 800
 801 \end{code}
 802
 803 begin{code}
 804 my_span :: (a -> Bool) -> [a] -> ([a],[a])
 805 my_span p xs = go [] xs
 806   where
 807     go so_far (x:xs') | p x = go (x:so_far) xs'
 808     go so_far xs            = (reverse so_far, xs)
 809
 810 my_span' :: (a -> Bool) -> [a] -> ([a],Int,[a])
 811 my_span' p xs = go [] 0 xs
 812   where
 813     go so_far n (x:xs') | p x = go (x:so_far) (n+1) xs'
 814     go so_far n xs            = (reverse so_far,n, xs)
 815 end{code}
 816
 817
 818 %************************************************************************
 819 %*                                                                      *
 820 \subsection{Other utility functions
 821 %*                                                                      *
 822 %************************************************************************
 823
 824 \begin{code}
 825 type IfM a = MaybeErr a Error
 826
 827 returnIf   :: a -> IfM a
 828 thenIf     :: IfM a -> (a -> IfM b) -> IfM b
 829 happyError :: Int -> [IfaceToken] -> IfM a
 830
 831 returnIf a = Succeeded a
 832
 833 thenIf (Succeeded a) k = k a
 834 thenIf (Failed  err) _ = Failed err
 835
 836 happyError ln toks = Failed (ifaceParseErr ln toks)
 837
 838 -----------------------------------------------------------------
 839
 840 ifaceParseErr ln toks sty
 841   = hsep [ptext SLIT("Interface-file parse error: line"), int ln, ptext SLIT("toks="), text (show (take 10 toks))]
 842 \end{code}