[project @ 2000-07-06 14:08:31 by simonmar]

[ghc-hetmet.git] / ghc / compiler / parser / Lex.lhs
diff --git a/ghc/compiler/parser/Lex.lhs b/ghc/compiler/parser/Lex.lhs

index 18a2549..9a2ab9c 100644 (file)
--- a/ghc/compiler/parser/Lex.lhs
+++ b/ghc/compiler/parser/Lex.lhs
@@ -16,11 +16,10 @@ An example that provokes the error is
  --------------------------------------------------------
  
  \begin{code}
-{-# OPTIONS -#include "ctypes.h" #-}
  
  module Lex (
  
-       ifaceParseErr,
+       ifaceParseErr, srcParseErr,
  
         -- Monad for parser
         Token(..), lexer, ParseResult(..), PState(..),
@@ -34,35 +33,26 @@ module Lex (
  
  #include "HsVersions.h"
  
-import Char            ( ord, isSpace, toUpper )
+import Char            ( isSpace, toUpper )
  import List             ( isSuffixOf )
  
-import IdInfo          ( InlinePragInfo(..), CprInfo(..) )
-import Name            ( isLowerISO, isUpperISO )
-import PrelMods                ( mkTupNameStr, mkUbxTupNameStr )
-import CmdLineOpts     ( opt_IgnoreIfacePragmas, opt_HiVersion, opt_NoHiCheck )
+import IdInfo          ( InlinePragInfo(..) )
+import PrelNames       ( mkTupNameStr )
+import CmdLineOpts     ( opt_HiVersion, opt_NoHiCheck )
  import Demand          ( Demand(..) {- instance Read -} )
-import UniqFM           ( UniqFM, listToUFM, lookupUFM)
-import BasicTypes      ( NewOrData(..) )
+import UniqFM           ( listToUFM, lookupUFM )
+import BasicTypes      ( NewOrData(..), Boxity(..) )
  import SrcLoc          ( SrcLoc, incSrcLine, srcLocFile, srcLocLine,
                           replaceSrcLine, mkSrcLoc )
  
-import Maybes          ( MaybeErr(..) )
  import ErrUtils                ( Message )
  import Outputable
  
  import FastString
  import StringBuffer
  import GlaExts
-import ST              ( runST )
-
-#if __GLASGOW_HASKELL__ >= 303
-import Bits
-import Word
-#endif
-
+import Ctype
  import Char            ( chr )
-import Addr
  import PrelRead        ( readRational__ ) -- Glasgow non-std
  \end{code}
  
@@ -128,6 +118,9 @@ data Token
    | ITlabel
    | ITdynamic
    | ITunsafe
+  | ITwith
+  | ITstdcallconv
+  | ITccallconv
  
    | ITinterface                        -- interface keywords
    | IT__export
@@ -142,20 +135,25 @@ data Token
    | ITbottom
    | ITinteger_lit 
    | ITfloat_lit
+  | ITword_lit
+  | ITword64_lit
+  | ITint64_lit
    | ITrational_lit
    | ITaddr_lit
+  | ITlabel_lit
    | ITlit_lit
    | ITstring_lit
    | ITtypeapp
-  | ITonce
-  | ITmany
+  | ITusage
+  | ITfuall
    | ITarity 
    | ITspecialise
    | ITnocaf
    | ITunfold InlinePragInfo
    | ITstrict ([Demand], Bool)
    | ITrules
-  | ITcprinfo (CprInfo)
+  | ITcprinfo
+  | ITdeprecated
    | IT__scc
    | ITsccAllCafs
  
@@ -164,6 +162,7 @@ data Token
    | ITinline_prag
    | ITnoinline_prag
    | ITrules_prag
+  | ITdeprecated_prag
    | ITline_prag
    | ITclose_prag
  
@@ -206,6 +205,8 @@ data Token
    | ITqvarsym (FAST_STRING,FAST_STRING)
    | ITqconsym (FAST_STRING,FAST_STRING)
  
+  | ITipvarid FAST_STRING      -- GHC extension: implicit param: ?x
+
    | ITpragma StringBuffer
  
    | ITchar       Char 
@@ -222,7 +223,7 @@ data Token
  
    | ITunknown String           -- Used when the lexer can't make sense of it
    | ITeof                      -- end of file token
-  deriving Text -- debugging
+  deriving Show -- debugging
  \end{code}
  
  -----------------------------------------------------------------------------
@@ -239,7 +240,8 @@ pragmaKeywordsFM = listToUFM $
         ( "NOTINLINE",  ITnoinline_prag ),
         ( "LINE",       ITline_prag ),
         ( "RULES",      ITrules_prag ),
-       ( "RULEZ",      ITrules_prag )  -- american spelling :-)
+       ( "RULEZ",      ITrules_prag ), -- american spelling :-)
+       ( "DEPRECATED", ITdeprecated_prag )
         ]
  
  haskellKeywordsFM = listToUFM $
@@ -272,7 +274,7 @@ haskellKeywordsFM = listToUFM $
         ( "_scc_",      ITscc )
       ]
  
-
+-- IMPORTANT: Keep this in synch with ParseIface.y's var_fs production! (SUP)
  ghcExtensionKeywordsFM = listToUFM $
         map (\ (x,y) -> (_PK_ x,y))
       [ ( "forall",     ITforall ),
@@ -281,6 +283,9 @@ ghcExtensionKeywordsFM = listToUFM $
         ( "label",      ITlabel ),
         ( "dynamic",    ITdynamic ),
         ( "unsafe",     ITunsafe ),
+       ( "with",       ITwith ),
+       ( "stdcall",    ITstdcallconv),
+       ( "ccall",      ITccallconv),
          ("_ccall_",    ITccall (False, False, False)),
          ("_ccall_GC_", ITccall (False, False, True)),
          ("_casm_",     ITccall (False, True,  False)),
@@ -300,18 +305,23 @@ ghcExtensionKeywordsFM = listToUFM $
         ("__bot",               ITbottom),
         ("__integer",           ITinteger_lit),
         ("__float",             ITfloat_lit),
+       ("__int64",             ITint64_lit),
+       ("__word",              ITword_lit),
+       ("__word64",            ITword64_lit),
         ("__rational",          ITrational_lit),
         ("__addr",              ITaddr_lit),
+       ("__label",             ITlabel_lit),
         ("__litlit",            ITlit_lit),
         ("__string",            ITstring_lit),
         ("__a",                 ITtypeapp),
-       ("__o",                 ITonce),
-       ("__m",                 ITmany),
+       ("__u",                 ITusage),
+       ("__fuall",             ITfuall),
         ("__A",                 ITarity),
         ("__P",                 ITspecialise),
         ("__C",                 ITnocaf),
         ("__R",                 ITrules),
-        ("__u",                        ITunfold NoInlinePragInfo),
+        ("__D",                        ITdeprecated),
+        ("__U",                        ITunfold NoInlinePragInfo),
         
          ("__ccall",            ITccall (False, False, False)),
          ("__ccall_GC",         ITccall (False, False, True)),
@@ -366,7 +376,8 @@ lexer cont buf s@(PState{
                 })
  
         -- first, start a new lexeme and lose all the whitespace
-  = tab line bol atbol (stepOverLexeme buf)
+  =  _scc_ "Lexer" 
+  tab line bol atbol (stepOverLexeme buf)
    where
         line = srcLocLine loc
  
@@ -388,9 +399,8 @@ lexer cont buf s@(PState{
                           if next `eqChar#` '-'# then trundle (n +# 1#)
                           else if is_symbol next || n <# 2#
                                 then is_a_token
-                               else case untilChar# (stepOnBy# buf n) '\n'# of 
-                                   { buf' -> tab y bol atbol (stepOverLexeme buf')
-                                   }
+                               else tab y bol atbol 
+                                        (stepOnUntilChar# (stepOnBy# buf n) '\n'#)
                     in trundle 1#
  
                 -- comments and pragmas.  We deal with LINE pragmas here,
@@ -406,14 +416,21 @@ lexer cont buf s@(PState{
                   let lexeme = mkFastString -- ToDo: too slow
                                   (map toUpper (lexemeToString buf2)) in
                   case lookupUFM pragmaKeywordsFM lexeme of
-                       Just ITline_prag -> line_prag (lexer cont) buf2 s'
+                       Just ITline_prag -> 
+                          line_prag skip_to_end buf2 s'
                         Just other -> is_a_token
-                       Nothing -> skip_to_end (stepOnBy# buf 2#)
+                       Nothing -> skip_to_end (stepOnBy# buf 2#) s'
                   }}
-               
-               else skip_to_end (stepOnBy# buf 2#)
+
+               else skip_to_end (stepOnBy# buf 2#) s'
                 where
-                   skip_to_end buf = nested_comment (lexer cont) buf s'
+                   skip_to_end = nested_comment (lexer cont)
+
+               -- special GHC extension: we grok cpp-style #line pragmas
+           '#'# | lexemeIndex buf ==# bol ->   -- the '#' must be in column 0
+               line_prag next_line (stepOn buf) s'
+               where
+               next_line buf = lexer cont (stepOnUntilChar# buf '\n'#)
  
                 -- tabs have been expanded beforehand
             c | is_space c -> tab y bol atbol (stepOn buf)
@@ -427,23 +444,27 @@ lexer cont buf s@(PState{
                             | otherwise    = lexToken cont glaexts buf s'
  
  -- {-# LINE .. #-} pragmas.  yeuch.
-line_prag cont buf =
+line_prag cont buf s@PState{loc=loc} =
    case expandWhile# is_space buf               of { buf1 ->
    case scanNumLit 0 (stepOverLexeme buf1)      of { (line,buf2) ->
    -- subtract one: the line number refers to the *following* line.
    let real_line = line - 1 in
    case fromInteger real_line                   of { i@(I# l) -> 
+       -- ToDo, if no filename then we skip the newline.... d'oh
    case expandWhile# is_space buf2              of { buf3 ->
    case currentChar# buf3                       of
       '\"'#{-"-} -> 
         case untilEndOfString# (stepOn (stepOverLexeme buf3)) of { buf4 ->
-       let file = lexemeToFastString buf4 in
-       \s@PState{loc=loc} -> skipToEnd buf4 s{loc = mkSrcLoc file i}
+       let 
+           file = lexemeToFastString buf4 
+           new_buf = stepOn (stepOverLexeme buf4)
+       in
+       if nullFastString file
+               then cont new_buf s{loc = replaceSrcLine loc l}
+               else cont new_buf s{loc = mkSrcLoc file i}
         }
-     other -> \s@PState{loc=loc} -> skipToEnd buf3 s{loc = replaceSrcLine loc l}
+     _other -> cont (stepOverLexeme buf3) s{loc = replaceSrcLine loc l}
    }}}}
-  where
-       skipToEnd buf = nested_comment cont buf
  
  nested_comment :: P a -> P a
  nested_comment cont buf = loop buf
@@ -506,7 +527,6 @@ lexBOL cont buf s@(PState{
  lexToken :: (Token -> P a) -> Int# -> P a
  lexToken cont glaexts buf =
   --trace "lexToken" $
- _scc_ "Lexer" 
    case currentChar# buf of
  
      -- special symbols ----------------------------------------------------
@@ -524,11 +544,10 @@ lexToken cont glaexts buf =
      '}'# -> \ s@PState{context = ctx} ->
             case ctx of 
                 (_:ctx') -> cont ITccurly (incLexeme buf) s{context=ctx'}
-               _ -> lexError "too many '}'s" buf s
+               _        -> lexError "too many '}'s" buf s
  
-    '#'# | flag glaexts 
-        -> case lookAhead# buf 1# of
-               ')'# -> cont ITcubxparen (setCurrentPos# buf 2#)
+    '#'# -> case lookAhead# buf 1# of
+               ')'#  | flag glaexts -> cont ITcubxparen (setCurrentPos# buf 2#)
                 '-'# -> case lookAhead# buf 2# of
                            '}'# -> cont ITclose_prag (setCurrentPos# buf 3#)
                            _    -> lex_sym cont (incLexeme buf)
@@ -563,8 +582,8 @@ lexToken cont glaexts buf =
                         lex_demand cont (stepOnUntil (not . isSpace) 
                                         (stepOnBy# buf 3#)) -- past __S
                     'M'# -> 
-                       lex_cpr cont (stepOnUntil (not . isSpace) 
-                                    (stepOnBy# buf 3#)) -- past __M
+                       cont ITcprinfo (stepOnBy# buf 3#)       -- past __M
+
                     's'# -> 
                         case prefixMatch (stepOnBy# buf 3#) "cc" of
                                Just buf' -> lex_scc cont (stepOverLexeme buf')
@@ -588,6 +607,8 @@ lexToken cont glaexts buf =
                trace "lexIface: misplaced NUL?" $ 
                cont (ITunknown "\NUL") (stepOn buf)
  
+    '?'# | flag glaexts && is_lower (lookAhead# buf 1#) ->
+           lex_ip cont (incLexeme buf)
      c | is_digit  c -> lex_num cont glaexts 0 buf
        | is_symbol c -> lex_sym cont buf
        | is_upper  c -> lex_con cont glaexts buf
@@ -606,7 +627,7 @@ flag _  = True
  lex_prag cont buf
    = case expandWhile# is_space buf of { buf1 ->
      case expandWhile# is_ident (stepOverLexeme buf1) of { buf2 -> 
-    let lexeme = lexemeToFastString buf2 in
+    let lexeme = mkFastString (map toUpper (lexemeToString buf2)) in
      case lookupUFM pragmaKeywordsFM lexeme of
         Just kw -> cont kw (mergeLexemes buf buf2)
         Nothing -> panic "lex_prag"
@@ -625,9 +646,9 @@ lex_string cont glaexts s buf
  
         -- ignore \& in a string, deal with string gaps
         '\\'# | next_ch `eqChar#` '&'# 
-               -> lex_string cont glaexts s (setCurrentPos# buf 2#)
+               -> lex_string cont glaexts s buf'
               | is_space next_ch
-               -> lex_stringgap cont glaexts s buf'
+               -> lex_stringgap cont glaexts s (incLexeme buf)
  
             where next_ch = lookAhead# buf 1#
                   buf' = setCurrentPos# buf 2#
@@ -705,8 +726,8 @@ readNum cont buf is_digit base conv = read buf 0
  
  is_hexdigit c 
         =  is_digit c 
-       || (c `geChar#` 'a'# && c `leChar#` 'h'#)
-       || (c `geChar#` 'A'# && c `leChar#` 'H'#)
+       || (c `geChar#` 'a'# && c `leChar#` 'f'#)
+       || (c `geChar#` 'A'# && c `leChar#` 'F'#)
  
  hex c | is_digit c = ord# c -# ord# '0'#
        | otherwise  = ord# (to_lower c) -# ord# 'a'# +# 10#
@@ -786,23 +807,6 @@ lex_demand cont buf =
     = case read_em [] buf of
        (stuff, rest) -> read_em (WwUnpack new_or_data wrapper_unpacks stuff : acc) rest
  
-lex_cpr cont buf = 
- case read_em [] buf of { (cpr_inf,buf') -> 
-   ASSERT ( null (tail cpr_inf) )
-   cont (ITcprinfo $ head cpr_inf) buf'
- }
- where
-   -- code snatched from lex_demand above
-  read_em acc buf = 
-   case currentChar# buf of
-    '-'# -> read_em (NoCPRInfo : acc) (stepOn buf)
-    '('# -> do_unpack acc (stepOn buf)
-    ')'# -> (reverse acc, stepOn buf)
-    _    -> (reverse acc, buf)
-
-  do_unpack acc buf
-   = case read_em [] buf of
-      (stuff, rest) -> read_em ((CPRInfo stuff)  : acc) rest
  
  ------------------
  lex_scc cont buf =
@@ -862,41 +866,22 @@ after_lexnum cont glaexts i buf
  
  lex_cstring cont buf =
   case expandUntilMatch (stepOverLexeme buf) "\'\'" of
-   buf' -> cont (ITlitlit (lexemeToFastString 
+   Just buf' -> cont (ITlitlit (lexemeToFastString 
                                 (setCurrentPos# buf' (negateInt# 2#))))
-               (mergeLexemes buf buf')
-
-------------------------------------------------------------------------------
--- Character Classes
-
-is_ident, is_symbol, is_any, is_upper, is_digit :: Char# -> Bool
-
-{-# INLINE is_ctype #-}
-#if __GLASGOW_HASKELL__ >= 303
-is_ctype :: Word8 -> Char# -> Bool
-is_ctype mask = \c ->
-   (indexWord8OffAddr (``char_types'' :: Addr) (ord (C# c)) .&. mask) /= 0
-#else
-is_ctype :: Int -> Char# -> Bool
-is_ctype (I# mask) = \c ->
-    let (A# ctype) = ``char_types'' :: Addr
-       flag_word  = int2Word# (ord# (indexCharOffAddr# ctype (ord# c)))
-    in
-       (flag_word `and#` (int2Word# mask)) `neWord#` (int2Word# 0#)
-#endif
-
-is_ident  = is_ctype 1
-is_symbol = is_ctype 2
-is_any    = is_ctype 4
-is_space  = is_ctype 8
-is_upper  = is_ctype 16
-is_digit  = is_ctype 32
+                  (mergeLexemes buf buf')
+   Nothing   -> lexError "unterminated ``" buf
  
  -----------------------------------------------------------------------------
  -- identifiers, symbols etc.
  
+lex_ip cont buf =
+ case expandWhile# is_ident buf of
+   buf' -> cont (ITipvarid lexeme) buf'
+          where lexeme = lexemeToFastString buf'
+
  lex_id cont glaexts buf =
- case expandWhile# is_ident buf of { buf1 -> 
+ let buf1 = expandWhile# is_ident buf in
+ seq buf1 $
  
   case (if flag glaexts 
         then expandWhile# (eqChar# '#'#) buf1 -- slurp trailing hashes
@@ -909,7 +894,7 @@ lex_id cont glaexts buf =
                           cont kwd_token buf';
         Nothing        -> 
  
- let var_token = cont (mk_var_token lexeme) buf' in
+ let var_token = cont (ITvarid lexeme) buf' in
  
   if not (flag glaexts)
     then var_token
@@ -919,7 +904,7 @@ lex_id cont glaexts buf =
         Just kwd_token -> cont kwd_token buf';
         Nothing        -> var_token
  
- }}}}
+ }}}
  
  lex_sym cont buf =
   case expandWhile# is_symbol buf of
@@ -981,10 +966,9 @@ lex_id3 cont glaexts mod buf just_a_conid
         -- real lexeme is M.<sym>
         new_buf = mergeLexemes buf buf'
       in
-     case lookupUFM haskellKeySymsFM lexeme of {
-       Just kwd_token -> just_a_conid; -- avoid M.:: etc.
-       Nothing        -> cont (mk_qvar_token mod lexeme) new_buf
-     }}
+     cont (mk_qvar_token mod lexeme) new_buf
+       -- wrong, but arguably morally right: M... is now a qvarsym
+     }
  
    | otherwise   =
       let 
@@ -1000,15 +984,13 @@ lex_id3 cont glaexts mod buf just_a_conid
       let
        lexeme  = lexemeToFastString buf'
        new_buf = mergeLexemes buf buf'
+      is_a_qvarid = cont (mk_qvar_token mod lexeme) new_buf
       in
       case _scc_ "Lex.haskellKeyword" lookupUFM haskellKeywordsFM lexeme of {
             Just kwd_token -> just_a_conid; -- avoid M.where etc.
-           Nothing        -> 
-     if flag glaexts
-       then case lookupUFM ghcExtensionKeywordsFM lexeme of {
-           Just kwd_token -> just_a_conid;
-           Nothing        -> cont (mk_qvar_token mod lexeme) new_buf }
-       else just_a_conid
+           Nothing        -> is_a_qvarid
+       -- TODO: special ids (as, qualified, hiding) shouldn't be
+       -- recognised as keywords here.  ie.  M.as is a qualified varid.
       }}}
  
  
@@ -1019,9 +1001,6 @@ slurp_trailing_hashes buf glaexts
  
  mk_var_token pk_str
    | is_upper f         = ITconid pk_str
-       -- _[A-Z] is treated as a constructor in interface files.
-  | f `eqChar#` '_'# && not (_NULL_ tl) 
-       && (case _HEAD_ tl of { C# g -> is_upper g }) = ITconid pk_str
    | is_ident f         = ITvarid pk_str
    | f `eqChar#` ':'#   = ITconsym pk_str
    | otherwise          = ITvarsym pk_str
@@ -1048,7 +1027,7 @@ lex_tuple cont mod buf back_off =
     go n buf =
      case currentChar# buf of
        ','# -> go (n+1) (stepOn buf)
-      ')'# -> cont (ITqconid (mod, snd (mkTupNameStr n))) (stepOn buf)
+      ')'# -> cont (ITqconid (mod, snd (mkTupNameStr Boxed n))) (stepOn buf)
        _    -> back_off
  
  lex_ubx_tuple cont mod buf back_off =
@@ -1058,7 +1037,7 @@ lex_ubx_tuple cont mod buf back_off =
      case currentChar# buf of
        ','# -> go (n+1) (stepOn buf)
        '#'# -> case lookAhead# buf 1# of
-               ')'# -> cont (ITqconid (mod, snd (mkUbxTupNameStr n)))
+               ')'# -> cont (ITqconid (mod, snd (mkTupNameStr Unboxed n)))
                                  (stepOnBy# buf 2#)
                 _    -> back_off
        _    -> back_off
@@ -1203,13 +1182,25 @@ h = h
         - we still need to insert another '}' followed by a ';',
           hence the atbol trick.
  
+There's also a special hack in here to deal with
+
+       do
+          ....
+          e $ do
+          blah
+
+i.e. the inner context is at the same indentation level as the outer
+context.  This is strictly illegal according to Haskell 98, but
+there's a lot of existing code using this style and it doesn't make
+any sense to disallow it, since empty 'do' lists don't make sense.
  -}
  
-layoutOn :: P ()
-layoutOn buf s@(PState{ bol = bol, context = ctx }) =
+layoutOn :: Bool -> P ()
+layoutOn strict buf s@(PState{ bol = bol, context = ctx }) =
      let offset = lexemeIndex buf -# bol in
      case ctx of
-       Layout prev_off : _ | prev_off >=# offset ->
+       Layout prev_off : _ 
+          | if strict then prev_off >=# offset else prev_off ># offset ->
                 --trace ("layout on, column: " ++  show (I# offset)) $
                 POk s{ context = Layout (offset +# 1#) : ctx, atbol = 1# } ()
         other -> 
@@ -1221,10 +1212,10 @@ layoutOff buf s@(PState{ context = ctx }) =
      POk s{ context = NoLayout:ctx } ()
  
  popContext :: P ()
-popContext = \ buf s@(PState{ context = ctx }) ->
+popContext = \ buf s@(PState{ context = ctx, loc = loc }) ->
    case ctx of
         (_:tl) -> POk s{ context = tl } ()
-       []    -> panic "Lex.popContext: empty context"
+       []     -> PFailed (srcParseErr buf loc)
  
  {- 
   Note that if the name of the file we're processing ends
@@ -1266,4 +1257,17 @@ ifaceVersionErr hi_vers l toks
          Nothing -> ptext SLIT("pre ghc-3.02 version")
         Just v  -> ptext SLIT("version") <+> integer v
  
+-----------------------------------------------------------------------------
+
+srcParseErr :: StringBuffer -> SrcLoc -> Message
+srcParseErr s l
+  = hcat [ppr l, 
+         if null token 
+            then ptext SLIT(": parse error (possibly incorrect indentation)")
+            else hcat [ptext SLIT(": parse error on input "),
+                       char '`', text token, char '\'']
+    ]
+  where 
+       token = lexemeToString s
+
  \end{code}