White space only

[ghc-hetmet.git] / compiler / parser / Lexer.x
diff --git a/compiler/parser/Lexer.x b/compiler/parser/Lexer.x

index b9abf7a..b3b4804 100644 (file)
--- a/compiler/parser/Lexer.x
+++ b/compiler/parser/Lexer.x
@@ -12,7 +12,6 @@
  -----------------------------------------------------------------------------
  
  --   ToDo / known bugs:
---    - Unicode
  --    - parsing integers is a bit slow
  --    - readRational is a bit slow
  --
@@ -20,8 +19,19 @@
  --    - M... should be 3 tokens, not 1.
  --    - pragma-end should be only valid in a pragma
  
+--   qualified operator NOTES.
+--   
+--   - If M.(+) is a single lexeme, then:
+--     - Probably (+) should be a single lexeme too, for consistency.
+--       Otherwise ( + ) would be a prefix operator, but M.( + ) would not be.
+--     - But we have to rule out reserved operators, otherwise (..) becomes
+--       a different lexeme.
+--     - Should we therefore also rule out reserved operators in the qualified
+--       form?  This is quite difficult to achieve.  We don't do it for
+--       qualified varids.
+
  {
-{-# OPTIONS -w #-}
+{-# OPTIONS -Wwarn -w #-}
  -- The above warning supression flag is a temporary kludge.
  -- While working on this module you are encouraged to remove it and fix
  -- any warnings in the module. See
@@ -31,6 +41,8 @@
  -- Note that Alex itself generates code with with some unused bindings and
  -- without type signatures, so removing the flag might not be possible.
  
+{-# OPTIONS_GHC -funbox-strict-fields #-}
+
  module Lexer (
     Token(..), lexer, pragState, mkPState, PState(..),
     P(..), ParseResult(..), getSrcLoc, 
@@ -39,7 +51,8 @@ module Lexer (
     popContext, pushCurrentContext, setLastToken, setSrcLoc,
     getLexState, popLexState, pushLexState,
     extension, standaloneDerivingEnabled, bangPatEnabled,
-   addWarning
+   addWarning,
+   lexTokenStream
    ) where
  
  import Bag
@@ -47,7 +60,6 @@ import ErrUtils
  import Outputable
  import StringBuffer
  import FastString
-import FastTypes
  import SrcLoc
  import UniqFM
  import DynFlags
@@ -56,11 +68,8 @@ import Util          ( maybePrefixMatch, readRational )
  
  import Control.Monad
  import Data.Bits
-import Data.Char       ( chr, ord, isSpace )
+import Data.Char
  import Data.Ratio
-import Debug.Trace
-
-import Unicode ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  }
  
  $unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
@@ -146,12 +155,12 @@ $tab+         { warn Opt_WarnTabs (text "Tab character") }
  -- space followed by a Haddock comment symbol (docsym) (in which case we'd
  -- have a Haddock comment). The rules then munch the rest of the line.
  
-"-- " ~[$docsym \#] .* ;
-"--" [^$symbol : \ ] .* ;
+"-- " ~[$docsym \#] .* { lineCommentToken }
+"--" [^$symbol : \ ] .* { lineCommentToken }
  
  -- Next, match Haddock comments if no -haddock flag
  
-"-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } ;
+"-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } { lineCommentToken }
  
  -- Now, when we've matched comments that begin with 2 dashes and continue
  -- with a different character, we need to match comments that begin with three
@@ -159,17 +168,17 @@ $tab+         { warn Opt_WarnTabs (text "Tab character") }
  -- make sure that the first non-dash character isn't a symbol, and munch the
  -- rest of the line.
  
-"---"\-* [^$symbol :] .* ;
+"---"\-* [^$symbol :] .* { lineCommentToken }
  
  -- Since the previous rules all match dashes followed by at least one
  -- character, we also need to match a whole line filled with just dashes.
  
-"--"\-* / { atEOL } ;
+"--"\-* / { atEOL } { lineCommentToken }
  
  -- We need this rule since none of the other single line comment rules
  -- actually match this case.
  
-"-- " / { atEOL } ;
+"-- " / { atEOL } { lineCommentToken }
  
  -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
  -- blank lines) until we find a non-whitespace character, then do layout
@@ -209,7 +218,8 @@ $tab+         { warn Opt_WarnTabs (text "Tab character") }
  
  <0,option_prags> \n                            { begin bol }
  
-"{-#" $whitechar* (line|LINE)          { begin line_prag2 }
+"{-#" $whitechar* (line|LINE) / { notFollowedByPragmaChar }
+                            { begin line_prag2 }
  
  -- single-line line pragmas, of the form
  --    # <line> "<file>" <extra-stuff> \n
@@ -225,59 +235,80 @@ $tab+         { warn Opt_WarnTabs (text "Tab character") }
     -- NOTE: accept -} at the end of a LINE pragma, for compatibility
     -- with older versions of GHC which generated these.
  
--- We only want RULES pragmas to be picked up when explicit forall
--- syntax is enabled is on, because the contents of the pragma always
--- uses it. If it's not on then we're sure to get a parse error.
--- (ToDo: we should really emit a warning when ignoring pragmas)
--- XXX Now that we can enable this without the -fglasgow-exts hammer,
--- is it better just to let the parse error happen?
-<0>
-  "{-#" $whitechar* (RULES|rules) / { ifExtension explicitForallEnabled } { token ITrules_prag }
-
  <0,option_prags> {
-  "{-#" $whitechar* (INLINE|inline)    { token (ITinline_prag True) }
-  "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
+  "{-#" $whitechar* (RULES|rules)  / { notFollowedByPragmaChar } { rulePrag }
+  "{-#" $whitechar* (INLINE|inline)     / { notFollowedByPragmaChar }
+                    { token (ITinline_prag True) }
+  "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline) / { notFollowedByPragmaChar }
                                         { token (ITinline_prag False) }
-  "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
+  "{-#" $whitechar* (INLINE|inline)
+        $whitechar+ (CONLIKE|conlike) / { notFollowedByPragmaChar }
+                                        { token (ITinline_conlike_prag True) }
+  "{-#" $whitechar* (NO(T)?INLINE|no(t?)inline)
+        $whitechar+ (CONLIKE|constructorlike) / { notFollowedByPragmaChar }
+                                        { token (ITinline_conlike_prag False) }
+  "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e) / { notFollowedByPragmaChar }
                                         { token ITspec_prag }
    "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
-       $whitechar* (INLINE|inline)     { token (ITspec_inline_prag True) }
+       $whitechar+ (INLINE|inline) / { notFollowedByPragmaChar }
+                    { token (ITspec_inline_prag True) }
    "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
-       $whitechar* (NO(T?)INLINE|no(t?)inline)
+       $whitechar+ (NO(T?)INLINE|no(t?)inline) / { notFollowedByPragmaChar }
                                         { token (ITspec_inline_prag False) }
-  "{-#" $whitechar* (SOURCE|source)    { token ITsource_prag }
-  "{-#" $whitechar* (DEPRECATED|deprecated)
+  "{-#" $whitechar* (SOURCE|source) / { notFollowedByPragmaChar }
+                    { token ITsource_prag }
+  "{-#" $whitechar* (WARNING|warning) / { notFollowedByPragmaChar }
+                                       { token ITwarning_prag }
+  "{-#" $whitechar* (DEPRECATED|deprecated) / { notFollowedByPragmaChar }
                                         { token ITdeprecated_prag }
-  "{-#" $whitechar* (SCC|scc)          { token ITscc_prag }
-  "{-#" $whitechar* (GENERATED|generated)
+  "{-#" $whitechar* (SCC|scc)  / { notFollowedByPragmaChar }
+                    { token ITscc_prag }
+  "{-#" $whitechar* (GENERATED|generated) / { notFollowedByPragmaChar }
                                         { token ITgenerated_prag }
-  "{-#" $whitechar* (CORE|core)                { token ITcore_prag }
-  "{-#" $whitechar* (UNPACK|unpack)    { token ITunpack_prag }
-
- "{-#"                                 { nested_comment lexToken }
+  "{-#" $whitechar* (CORE|core) / { notFollowedByPragmaChar }
+                    { token ITcore_prag }
+  "{-#" $whitechar* (UNPACK|unpack) / { notFollowedByPragmaChar }
+                    { token ITunpack_prag }
+  "{-#" $whitechar* (ANN|ann) / { notFollowedByPragmaChar }
+                    { token ITann_prag }
+
+  -- We ignore all these pragmas, but don't generate a warning for them.
+  -- CFILES is a hugs-only thing.
+  "{-#" $whitechar* (OPTIONS_(HUGS|hugs|NHC98|nhc98|JHC|jhc|YHC|yhc|CATCH|catch|DERIVE|derive)|CFILES|cfiles|CONTRACT|contract) / { notFollowedByPragmaChar }
+                    { nested_comment lexToken }
  
    -- ToDo: should only be valid inside a pragma:
-  "#-}"                                { token ITclose_prag}
+  "#-}"                                { endPrag }
  }
  
  <option_prags> {
-  "{-#"  $whitechar* (OPTIONS|options)   { lex_string_prag IToptions_prag }
-  "{-#"  $whitechar* (OPTIONS_GHC|options_ghc)
+  "{-#"  $whitechar* (OPTIONS|options) / { notFollowedByPragmaChar }
+                                        { lex_string_prag IToptions_prag }
+  "{-#"  $whitechar* (OPTIONS_GHC|options_ghc) / { notFollowedByPragmaChar }
                                          { lex_string_prag IToptions_prag }
    "{-#"  $whitechar* (OPTIONS_HADDOCK|options_haddock)
+                   / { notFollowedByPragmaChar }
                                           { lex_string_prag ITdocOptions }
    "-- #"                                 { multiline_doc_comment }
-  "{-#"  $whitechar* (LANGUAGE|language) { token ITlanguage_prag }
-  "{-#"  $whitechar* (INCLUDE|include)   { lex_string_prag ITinclude_prag }
+  "{-#"  $whitechar* (LANGUAGE|language) / { notFollowedByPragmaChar }
+                                         { token ITlanguage_prag }
+  "{-#"  $whitechar* (INCLUDE|include) / { notFollowedByPragmaChar }
+                                         { lex_string_prag ITinclude_prag }
+}
+
+<0> {
+  -- In the "0" mode we ignore these pragmas
+  "{-#"  $whitechar* (OPTIONS|options|OPTIONS_GHC|options_ghc|OPTIONS_HADDOCK|options_haddock|LANGUAGE|language|INCLUDE|include) / { notFollowedByPragmaChar }
+                     { nested_comment lexToken }
  }
  
  <0> {
-  "-- #" .* ;
+  "-- #" .* { lineCommentToken }
  }
  
  <0,option_prags> {
-       -- This is to catch things like {-# OPTIONS OPTIONS_HUGS ... 
-  "{-#" $whitechar* $idchar+           { nested_comment lexToken }
+  "{-#"  { warnThen Opt_WarnUnrecognisedPragmas (text "Unrecognised pragma")
+                    (nested_comment lexToken) }
  }
  
  -- '0' state: ordinary lexemes
@@ -359,13 +390,15 @@ $tab+         { warn Opt_WarnTabs (text "Tab character") }
    @conid "#"+       / { ifExtension magicHashEnabled } { idtoken conid }
  }
  
--- ToDo: M.(,,,)
-
+-- ToDo: - move `var` and (sym) into lexical syntax?
+--       - remove backquote from $special?
  <0> {
-  @qual @varsym                        { idtoken qvarsym }
-  @qual @consym                        { idtoken qconsym }
-  @varsym                      { varsym }
-  @consym                      { consym }
+  @qual @varsym       / { ifExtension oldQualOps } { idtoken qvarsym }
+  @qual @consym       / { ifExtension oldQualOps } { idtoken qconsym }
+  @qual \( @varsym \) / { ifExtension newQualOps } { idtoken prefixqvarsym }
+  @qual \( @consym \) / { ifExtension newQualOps } { idtoken prefixqconsym }
+  @varsym                                          { varsym }
+  @consym                                          { consym }
  }
  
  -- For the normal boxed literals we need to be careful
@@ -451,6 +484,7 @@ data Token
    | ITunsafe
    | ITstdcallconv
    | ITccallconv
+  | ITprimcallconv
    | ITdotnet
    | ITmdo
    | ITfamily
@@ -460,16 +494,19 @@ data Token
  
         -- Pragmas
    | ITinline_prag Bool         -- True <=> INLINE, False <=> NOINLINE
+  | ITinline_conlike_prag Bool  -- same
    | ITspec_prag                        -- SPECIALISE   
    | ITspec_inline_prag Bool    -- SPECIALISE INLINE (or NOINLINE)
    | ITsource_prag
    | ITrules_prag
+  | ITwarning_prag
    | ITdeprecated_prag
    | ITline_prag
    | ITscc_prag
    | ITgenerated_prag
    | ITcore_prag                 -- hdaume: core annotations
    | ITunpack_prag
+  | ITann_prag
    | ITclose_prag
    | IToptions_prag String
    | ITinclude_prag String
@@ -520,6 +557,8 @@ data Token
    | ITqconid  (FastString,FastString)
    | ITqvarsym (FastString,FastString)
    | ITqconsym (FastString,FastString)
+  | ITprefixqvarsym (FastString,FastString)
+  | ITprefixqconsym (FastString,FastString)
  
    | ITdupipvarid   FastString  -- GHC extension: implicit param: ?x
  
@@ -569,6 +608,8 @@ data Token
    | ITdocSection      Int String -- a section heading
    | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
    | ITdocOptionsOld   String     -- doc options declared "-- # ..."-style
+  | ITlineComment     String     -- comment starting with "--"
+  | ITblockComment    String     -- comment in {- -}
  
  #ifdef DEBUG
    deriving Show -- debugging
@@ -591,6 +632,7 @@ isSpecial ITthreadsafe      = True
  isSpecial ITunsafe     = True
  isSpecial ITccallconv   = True
  isSpecial ITstdcallconv = True
+isSpecial ITprimcallconv = True
  isSpecial ITmdo                = True
  isSpecial ITfamily     = True
  isSpecial ITgroup   = True
@@ -606,6 +648,7 @@ isSpecial _             = False
  -- facilitates using a keyword in two different extensions that can be
  -- activated independently)
  --
+reservedWordsFM :: UniqFM (Token, Int)
  reservedWordsFM = listToUFM $
         map (\(x, y, z) -> (mkFastString x, (y, z)))
         [( "_",         ITunderscore,   0 ),
@@ -635,7 +678,7 @@ reservedWordsFM = listToUFM $
         ( "where",      ITwhere,        0 ),
         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
  
-    ( "forall",        ITforall,        bit explicitForallBit),
+    ( "forall",        ITforall,        bit explicitForallBit .|. bit inRulePragBit),
         ( "mdo",        ITmdo,           bit recursiveDoBit),
         ( "family",     ITfamily,        bit tyFamBit),
      ( "group",  ITgroup,     bit transformComprehensionsBit),
@@ -647,10 +690,11 @@ reservedWordsFM = listToUFM $
         ( "label",      ITlabel,         bit ffiBit),
         ( "dynamic",    ITdynamic,       bit ffiBit),
         ( "safe",       ITsafe,          bit ffiBit),
-       ( "threadsafe", ITthreadsafe,    bit ffiBit),
+       ( "threadsafe", ITthreadsafe,    bit ffiBit),  -- ToDo: remove
         ( "unsafe",     ITunsafe,        bit ffiBit),
         ( "stdcall",    ITstdcallconv,   bit ffiBit),
         ( "ccall",      ITccallconv,     bit ffiBit),
+       ( "prim",       ITprimcallconv,  bit ffiBit),
         ( "dotnet",     ITdotnet,        bit ffiBit),
  
         ( "rec",        ITrec,           bit arrowsBit),
@@ -676,16 +720,15 @@ reservedSymsFM = listToUFM $
         ,("!",   ITbang,     always)
  
          -- For data T (a::*) = MkT
-       ,("*", ITstar, \i -> kindSigsEnabled i || tyFamEnabled i)
+       ,("*", ITstar, always) -- \i -> kindSigsEnabled i || tyFamEnabled i)
          -- For 'forall a . t'
-       ,(".", ITdot, explicitForallEnabled)
+       ,(".", ITdot,  always) -- \i -> explicitForallEnabled i || inRulePrag i)
  
         ,("-<",  ITlarrowtail, arrowsEnabled)
         ,(">-",  ITrarrowtail, arrowsEnabled)
         ,("-<<", ITLarrowtail, arrowsEnabled)
         ,(">>-", ITRarrowtail, arrowsEnabled)
  
-#if __GLASGOW_HASKELL__ >= 605
         ,("∷",   ITdcolon, unicodeSyntaxEnabled)
         ,("⇒",   ITdarrow, unicodeSyntaxEnabled)
         ,("∀",   ITforall, \i -> unicodeSyntaxEnabled i &&
@@ -696,7 +739,6 @@ reservedSymsFM = listToUFM $
          -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
          -- form part of a large operator.  This would let us have a better
          -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
-#endif
         ]
  
  -- -----------------------------------------------------------------------------
@@ -737,19 +779,27 @@ pop_and :: Action -> Action
  pop_and act span buf len = do popLexState; act span buf len
  
  {-# INLINE nextCharIs #-}
+nextCharIs :: StringBuffer -> (Char -> Bool) -> Bool
  nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
  
+notFollowedBy :: Char -> AlexAccPred Int
  notFollowedBy char _ _ _ (AI _ _ buf) 
    = nextCharIs buf (/=char)
  
+notFollowedBySymbol :: AlexAccPred Int
  notFollowedBySymbol _ _ _ (AI _ _ buf)
    = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
  
+notFollowedByPragmaChar :: AlexAccPred Int
+notFollowedByPragmaChar _ _ _ (AI _ _ buf)
+  = nextCharIs buf (\c -> not (isAlphaNum c || c == '_'))
+
  -- We must reject doc comments as being ordinary comments everywhere.
  -- In some cases the doc comment will be selected as the lexeme due to
  -- maximal munch, but not always, because the nested comment rule is
  -- valid in all states, but the doc-comment rules are only valid in
  -- the non-layout states.
+isNormalComment :: AlexAccPred Int
  isNormalComment bits _ _ (AI _ _ buf)
    | haddockEnabled bits = notFollowedByDocOrPragma
    | otherwise           = nextCharIs buf (/='#')
@@ -757,6 +807,7 @@ isNormalComment bits _ _ (AI _ _ buf)
      notFollowedByDocOrPragma
         = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
  
+spaceAndP :: StringBuffer -> (StringBuffer -> Bool) -> Bool
  spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
  
  {-
@@ -764,8 +815,10 @@ haddockDisabledAnd p bits _ _ (AI _ _ buf)
    = if haddockEnabled bits then False else (p buf)
  -}
  
+atEOL :: AlexAccPred Int
  atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
  
+ifExtension :: (Int -> Bool) -> AlexAccPred Int
  ifExtension pred bits _ _ _ = pred bits
  
  multiline_doc_comment :: Action
@@ -796,6 +849,11 @@ multiline_doc_comment span buf _len = withLexedDocType (worker "")
              | otherwise -> input
            Nothing -> input
  
+lineCommentToken :: Action
+lineCommentToken span buf len = do
+  b <- extension rawTokenStreamEnabled
+  if b then strtoken ITlineComment span buf len else lexToken
+
  {-
    nested comments require traversing by hand, they can't be parsed
    using regular expressions.
@@ -803,20 +861,24 @@ multiline_doc_comment span buf _len = withLexedDocType (worker "")
  nested_comment :: P (Located Token) -> Action
  nested_comment cont span _str _len = do
    input <- getInput
-  go (1::Int) input
+  go "" (1::Int) input
    where
-    go 0 input = do setInput input; cont
-    go n input = case alexGetChar input of
+    go commentAcc 0 input = do setInput input
+                               b <- extension rawTokenStreamEnabled
+                               if b
+                                 then docCommentEnd input commentAcc ITblockComment _str span
+                                 else cont
+    go commentAcc n input = case alexGetChar input of
        Nothing -> errBrace input span
        Just ('-',input) -> case alexGetChar input of
          Nothing  -> errBrace input span
-        Just ('\125',input) -> go (n-1) input
-        Just (_,_)          -> go n input
+        Just ('\125',input) -> go commentAcc (n-1) input
+        Just (_,_)          -> go ('-':commentAcc) n input
        Just ('\123',input) -> case alexGetChar input of
          Nothing  -> errBrace input span
-        Just ('-',input) -> go (n+1) input
-        Just (_,_)       -> go n input
-      Just (_,input) -> go n input
+        Just ('-',input) -> go ('-':'\123':commentAcc) (n+1) input
+        Just (_,_)       -> go ('\123':commentAcc) n input
+      Just (c,input) -> go (c:commentAcc) n input
  
  nested_doc_comment :: Action
  nested_doc_comment span buf _len = withLexedDocType (go "")
@@ -837,6 +899,8 @@ nested_doc_comment span buf _len = withLexedDocType (go "")
          Just (_,_) -> go ('\123':commentAcc) input docType False
        Just (c,input) -> go (c:commentAcc) input docType False
  
+withLexedDocType :: (AlexInput -> (String -> Token) -> Bool -> P (Located Token))
+                 -> P (Located Token)
  withLexedDocType lexDocComment = do
    input@(AI _ _ buf) <- getInput
    case prevChar buf ' ' of
@@ -845,12 +909,25 @@ withLexedDocType lexDocComment = do
      '$' -> lexDocComment input ITdocCommentNamed False
      '*' -> lexDocSection 1 input
      '#' -> lexDocComment input ITdocOptionsOld False
+    _ -> panic "withLexedDocType: Bad doc type"
   where 
      lexDocSection n input = case alexGetChar input of 
        Just ('*', input) -> lexDocSection (n+1) input
        Just (_,   _)     -> lexDocComment input (ITdocSection n) True
        Nothing -> do setInput input; lexToken -- eof reached, lex it normally
  
+-- RULES pragmas turn on the forall and '.' keywords, and we turn them
+-- off again at the end of the pragma.
+rulePrag :: Action
+rulePrag span _ _ = do
+  setExts (.|. bit inRulePragBit)
+  return (L span ITrules_prag)
+
+endPrag :: Action
+endPrag span _ _ = do
+  setExts (.&. complement (bit inRulePragBit))
+  return (L span ITclose_prag)
+
  -- docCommentEnd
  -------------------------------------------------------------------------------
  -- This function is quite tricky. We can't just return a new token, we also
@@ -883,8 +960,9 @@ docCommentEnd input commentAcc docType buf span = do
    span `seq` setLastToken span' last_len last_line_len
    return (L span' (docType comment))
   
+errBrace :: AlexInput -> SrcSpan -> P a
  errBrace (AI end _ _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
- 
+
  open_brace, close_brace :: Action
  open_brace span _str _len = do 
    ctx <- getContext
@@ -894,14 +972,15 @@ close_brace span _str _len = do
    popContext
    return (L span ITccurly)
  
-qvarid buf len = ITqvarid $! splitQualName buf len
-qconid buf len = ITqconid $! splitQualName buf len
+qvarid, qconid :: StringBuffer -> Int -> Token
+qvarid buf len = ITqvarid $! splitQualName buf len False
+qconid buf len = ITqconid $! splitQualName buf len False
  
-splitQualName :: StringBuffer -> Int -> (FastString,FastString)
+splitQualName :: StringBuffer -> Int -> Bool -> (FastString,FastString)
  -- takes a StringBuffer and a length, and returns the module name
  -- and identifier parts of a qualified name.  Splits at the *last* dot,
  -- because of hierarchical module names.
-splitQualName orig_buf len = split orig_buf orig_buf
+splitQualName orig_buf len parens = split orig_buf orig_buf
    where
      split buf dot_buf
         | orig_buf `byteDiff` buf >= len  = done dot_buf
@@ -921,11 +1000,14 @@ splitQualName orig_buf len = split orig_buf orig_buf
  
      done dot_buf =
         (lexemeToFastString orig_buf (qual_size - 1),
-        lexemeToFastString dot_buf (len - qual_size))
+        if parens -- Prelude.(+)
+            then lexemeToFastString (stepOn dot_buf) (len - qual_size - 2)
+            else lexemeToFastString dot_buf (len - qual_size))
        where
         qual_size = orig_buf `byteDiff` dot_buf
  
-varid span buf len = 
+varid :: Action
+varid span buf len =
    fs `seq`
    case lookupUFM reservedWordsFM fs of
         Just (keyword,0)    -> do
@@ -940,15 +1022,22 @@ varid span buf len =
    where
         fs = lexemeToFastString buf len
  
+conid :: StringBuffer -> Int -> Token
  conid buf len = ITconid fs
    where fs = lexemeToFastString buf len
  
-qvarsym buf len = ITqvarsym $! splitQualName buf len
-qconsym buf len = ITqconsym $! splitQualName buf len
+qvarsym, qconsym, prefixqvarsym, prefixqconsym :: StringBuffer -> Int -> Token
+qvarsym buf len = ITqvarsym $! splitQualName buf len False
+qconsym buf len = ITqconsym $! splitQualName buf len False
+prefixqvarsym buf len = ITprefixqvarsym $! splitQualName buf len True
+prefixqconsym buf len = ITprefixqconsym $! splitQualName buf len True
  
+varsym, consym :: Action
  varsym = sym ITvarsym
  consym = sym ITconsym
  
+sym :: (FastString -> Token) -> SrcSpan -> StringBuffer -> Int
+    -> P (Located Token)
  sym con span buf len = 
    case lookupUFM reservedSymsFM fs of
         Just (keyword,exts) -> do
@@ -970,16 +1059,27 @@ tok_integral itint transint transbuf translen (radix,char_to_int) span buf len =
       (offsetBytes transbuf buf) (subtract translen len) radix char_to_int
  
  -- some conveniences for use with tok_integral
+tok_num :: (Integer -> Integer)
+        -> Int -> Int
+        -> (Integer, (Char->Int)) -> Action
  tok_num = tok_integral ITinteger
+tok_primint :: (Integer -> Integer)
+            -> Int -> Int
+            -> (Integer, (Char->Int)) -> Action
  tok_primint = tok_integral ITprimint
+tok_primword :: Int -> Int
+             -> (Integer, (Char->Int)) -> Action
  tok_primword = tok_integral ITprimword positive
+positive, negative :: (Integer -> Integer)
  positive = id
  negative = negate
+decimal, octal, hexadecimal :: (Integer, Char -> Int)
  decimal = (10,octDecDigit)
  octal = (8,octDecDigit)
  hexadecimal = (16,hexDigit)
  
  -- readRational can understand negative rationals, exponents, everything.
+tok_float, tok_primfloat, tok_primdouble :: String -> Token
  tok_float        str = ITrational   $! readRational str
  tok_primfloat    str = ITprimfloat  $! readRational str
  tok_primdouble   str = ITprimdouble $! readRational str
@@ -1007,6 +1107,7 @@ do_bol span _str _len = do
  
  -- certain keywords put us in the "layout" state, where we might
  -- add an opening curly brace.
+maybe_layout :: Token -> P ()
  maybe_layout ITdo      = pushLexState layout_do
  maybe_layout ITmdo     = pushLexState layout_do
  maybe_layout ITof      = pushLexState layout
@@ -1024,6 +1125,7 @@ maybe_layout _            = return ()
  -- by a 'do', then we allow the new context to be at the same indentation as
  -- the previous context.  This is what the 'strict' argument is for.
  --
+new_layout_context :: Bool -> Action
  new_layout_context strict span _buf _len = do
      popLexState
      (AI _ offset _) <- getInput
@@ -1040,6 +1142,7 @@ new_layout_context strict span _buf _len = do
                 setContext (Layout offset : ctx)
                 return (L span ITvocurly)
  
+do_layout_left :: Action
  do_layout_left span _buf _len = do
      popLexState
      pushLexState bol  -- we must be at the start of a line
@@ -1139,6 +1242,7 @@ lex_string s = do
         c' <- lex_char c i
         lex_string (c':s)
  
+lex_stringgap :: String -> P Token
  lex_stringgap s = do
    c <- getCharOrFail
    case c of
@@ -1181,11 +1285,11 @@ lex_char_tok span _buf _len = do        -- We've seen '
                 -- We've seen 'x, where x is a valid character
                 --  (i.e. not newline etc) but not a quote or backslash
            case alexGetChar' i2 of      -- Look ahead one more character
-               Nothing -> lit_error
                 Just ('\'', i3) -> do   -- We've seen 'x'
                         setInput i3 
                         finish_char_tok loc c
                 _other -> do            -- We've seen 'x not followed by quote
+                                       -- (including the possibility of EOF)
                                         -- If TH is on, just parse the quote only
                         th_exts <- extension thEnabled  
                         let (AI end _ _) = i1
@@ -1214,6 +1318,7 @@ lex_char c inp = do
        c | isAny c -> do setInput inp; return c
        _other -> lit_error
  
+isAny :: Char -> Bool
  isAny c | c > '\x7f' = isPrint c
         | otherwise  = is_any c
  
@@ -1267,6 +1372,7 @@ readNum is_digit base conv = do
         then readNum2 is_digit base conv (conv c)
         else do setInput i; lit_error
  
+readNum2 :: (Char -> Bool) -> Int -> (Char -> Int) -> Int -> P Char
  readNum2 is_digit base conv i = do
    input <- getInput
    read i input
@@ -1279,6 +1385,7 @@ readNum2 is_digit base conv i = do
                    then do setInput input; return (chr i)
                    else lit_error
  
+silly_escape_chars :: [(String, Char)]
  silly_escape_chars = [
         ("NUL", '\NUL'),
         ("SOH", '\SOH'),
@@ -1320,6 +1427,7 @@ silly_escape_chars = [
  -- the position of the error in the buffer.  This is so that we can report
  -- a correct location to the user, but also so we can detect UTF-8 decoding
  -- errors if they occur.
+lit_error :: P a
  lit_error = lexError "lexical error in string/character literal"
  
  getCharOrFail :: P Char
@@ -1373,6 +1481,11 @@ warn option warning srcspan _buf _len = do
      addWarning option srcspan warning
      lexToken
  
+warnThen :: DynFlag -> SDoc -> Action -> Action
+warnThen option warning action srcspan buf len = do
+    addWarning option srcspan warning
+    action srcspan buf len
+
  -- -----------------------------------------------------------------------------
  -- The Parse Monad
  
@@ -1391,14 +1504,14 @@ data ParseResult a
  
  data PState = PState { 
         buffer     :: StringBuffer,
-    dflags     :: DynFlags,
-    messages   :: Messages,
+        dflags     :: DynFlags,
+        messages   :: Messages,
          last_loc   :: SrcSpan, -- pos of previous token
          last_offs  :: !Int,    -- offset of the previous token from the
                                 -- beginning of  the current line.
                                 -- \t is equal to 8 spaces.
         last_len   :: !Int,     -- len of previous token
-  last_line_len :: !Int,
+        last_line_len :: !Int,
          loc        :: SrcLoc,   -- current loc (end of prev token + 1)
         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
         context    :: [LayoutContext],
@@ -1444,6 +1557,9 @@ extension p = P $ \s -> POk s (p $! extsBitmap s)
  getExts :: P Int
  getExts = P $ \s -> POk s (extsBitmap s)
  
+setExts :: (Int -> Int) -> P ()
+setExts f = P $ \s -> POk s{ extsBitmap = f (extsBitmap s) } ()
+
  setSrcLoc :: SrcLoc -> P ()
  setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
  
@@ -1499,13 +1615,13 @@ alexGetChar (AI loc ofs s)
                   DecimalNumber         -> digit
                   LetterNumber          -> other_graphic
                   OtherNumber           -> other_graphic
-                 ConnectorPunctuation  -> other_graphic
-                 DashPunctuation       -> other_graphic
+                 ConnectorPunctuation  -> symbol
+                 DashPunctuation       -> symbol
                   OpenPunctuation       -> other_graphic
                   ClosePunctuation      -> other_graphic
                   InitialQuote          -> other_graphic
                   FinalQuote            -> other_graphic
-                 OtherPunctuation      -> other_graphic
+                 OtherPunctuation      -> symbol
                   MathSymbol            -> symbol
                   CurrencySymbol        -> symbol
                   ModifierSymbol        -> symbol
@@ -1549,47 +1665,90 @@ getLexState = P $ \s@PState{ lex_state=ls:_ } -> POk s ls
  -- -fglasgow-exts or -XParr) are represented by a bitmap stored in an unboxed
  -- integer
  
-genericsBit, ffiBit, parrBit :: Int
+genericsBit :: Int
  genericsBit = 0 -- {| and |}
+ffiBit :: Int
  ffiBit    = 1
+parrBit :: Int
  parrBit           = 2
+arrowsBit :: Int
  arrowsBit  = 4
+thBit :: Int
  thBit     = 5
+ipBit :: Int
  ipBit      = 6
+explicitForallBit :: Int
  explicitForallBit = 7 -- the 'forall' keyword and '.' symbol
+bangPatBit :: Int
  bangPatBit = 8 -- Tells the parser to understand bang-patterns
                 -- (doesn't affect the lexer)
+tyFamBit :: Int
  tyFamBit   = 9 -- indexed type families: 'family' keyword and kind sigs
+haddockBit :: Int
  haddockBit = 10 -- Lex and parse Haddock comments
-magicHashBit = 11 -- # in both functions and operators
+magicHashBit :: Int
+magicHashBit = 11 -- "#" in both functions and operators
+kindSigsBit :: Int
  kindSigsBit = 12 -- Kind signatures on type variables
+recursiveDoBit :: Int
  recursiveDoBit = 13 -- mdo
+unicodeSyntaxBit :: Int
  unicodeSyntaxBit = 14 -- the forall symbol, arrow symbols, etc
+unboxedTuplesBit :: Int
  unboxedTuplesBit = 15 -- (# and #)
+standaloneDerivingBit :: Int
  standaloneDerivingBit = 16 -- standalone instance deriving declarations
+transformComprehensionsBit :: Int
  transformComprehensionsBit = 17
+qqBit :: Int
  qqBit     = 18 -- enable quasiquoting
-
-genericsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
+inRulePragBit :: Int
+inRulePragBit = 19
+rawTokenStreamBit :: Int
+rawTokenStreamBit = 20 -- producing a token stream with all comments included; set from Opt_KeepRawTokenStream in mkPState
+newQualOpsBit :: Int
+newQualOpsBit = 21 -- Haskell' qualified operator syntax, e.g. Prelude.(+); set from Opt_NewQualifiedOperators in mkPState
+
+always :: Int -> Bool
  always           _     = True
+genericsEnabled :: Int -> Bool
  genericsEnabled  flags = testBit flags genericsBit
-ffiEnabled       flags = testBit flags ffiBit
+parrEnabled :: Int -> Bool
  parrEnabled      flags = testBit flags parrBit
+arrowsEnabled :: Int -> Bool
  arrowsEnabled    flags = testBit flags arrowsBit
+thEnabled :: Int -> Bool
  thEnabled        flags = testBit flags thBit
+ipEnabled :: Int -> Bool
  ipEnabled        flags = testBit flags ipBit
+explicitForallEnabled :: Int -> Bool
  explicitForallEnabled flags = testBit flags explicitForallBit
+bangPatEnabled :: Int -> Bool
  bangPatEnabled   flags = testBit flags bangPatBit
-tyFamEnabled     flags = testBit flags tyFamBit
+-- tyFamEnabled :: Int -> Bool
+-- tyFamEnabled     flags = testBit flags tyFamBit
+haddockEnabled :: Int -> Bool
  haddockEnabled   flags = testBit flags haddockBit
+magicHashEnabled :: Int -> Bool
  magicHashEnabled flags = testBit flags magicHashBit
-kindSigsEnabled  flags = testBit flags kindSigsBit
-recursiveDoEnabled flags = testBit flags recursiveDoBit
+-- kindSigsEnabled :: Int -> Bool
+-- kindSigsEnabled  flags = testBit flags kindSigsBit
+unicodeSyntaxEnabled :: Int -> Bool
  unicodeSyntaxEnabled flags = testBit flags unicodeSyntaxBit
+unboxedTuplesEnabled :: Int -> Bool
  unboxedTuplesEnabled flags = testBit flags unboxedTuplesBit
+standaloneDerivingEnabled :: Int -> Bool
  standaloneDerivingEnabled flags = testBit flags standaloneDerivingBit
-transformComprehensionsEnabled flags = testBit flags transformComprehensionsBit
+qqEnabled :: Int -> Bool
  qqEnabled        flags = testBit flags qqBit
+-- inRulePrag :: Int -> Bool
+-- inRulePrag       flags = testBit flags inRulePragBit
+rawTokenStreamEnabled :: Int -> Bool  -- is rawTokenStreamBit set in the given extension bitmap?
+rawTokenStreamEnabled flags = testBit flags rawTokenStreamBit
+newQualOps :: Int -> Bool  -- is newQualOpsBit set in the given extension bitmap?
+newQualOps       flags = testBit flags newQualOpsBit
+oldQualOps :: Int -> Bool  -- exact complement of newQualOps: True when newQualOpsBit is clear
+oldQualOps flags = not (newQualOps flags)
  
  -- PState for parsing options pragmas
  --
@@ -1637,6 +1796,7 @@ mkPState buf loc flags  =
                .|. qqBit        `setBitIf` dopt Opt_QuasiQuotes flags
                .|. ipBit        `setBitIf` dopt Opt_ImplicitParams flags
                .|. explicitForallBit `setBitIf` dopt Opt_ScopedTypeVariables flags
+              .|. explicitForallBit `setBitIf` dopt Opt_LiberalTypeSynonyms flags
                .|. explicitForallBit `setBitIf` dopt Opt_PolymorphicComponents flags
                .|. explicitForallBit `setBitIf` dopt Opt_ExistentialQuantification flags
                .|. explicitForallBit `setBitIf` dopt Opt_Rank2Types flags
@@ -1650,7 +1810,9 @@ mkPState buf loc flags  =
                .|. unicodeSyntaxBit `setBitIf` dopt Opt_UnicodeSyntax flags
                .|. unboxedTuplesBit `setBitIf` dopt Opt_UnboxedTuples flags
                .|. standaloneDerivingBit `setBitIf` dopt Opt_StandaloneDeriving flags
-           .|. transformComprehensionsBit `setBitIf` dopt Opt_TransformListComp flags
+               .|. transformComprehensionsBit `setBitIf` dopt Opt_TransformListComp flags
+               .|. rawTokenStreamBit `setBitIf` dopt Opt_KeepRawTokenStream flags
+               .|. newQualOpsBit `setBitIf` dopt Opt_NewQualifiedOperators flags
        --
        setBitIf :: Int -> Bool -> Int
        b `setBitIf` cond | cond      = bit b
@@ -1757,6 +1919,7 @@ lexToken = do
          span `seq` setLastToken span bytes bytes
          t span buf bytes
  
+reportLexError :: SrcLoc -> SrcLoc -> StringBuffer -> [Char] -> P a
  reportLexError loc1 loc2 buf str
    | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
    | otherwise =
@@ -1766,4 +1929,13 @@ reportLexError loc1 loc2 buf str
    if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
      then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
      else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
+
+lexTokenStream :: StringBuffer -> SrcLoc -> DynFlags -> ParseResult [Located Token]  -- lex a buffer from the given location into a list of located tokens
+lexTokenStream buf loc dflags = unP go initState  -- run the collecting loop from a freshly built parser state
+    where initState = mkPState buf loc (dopt_set (dopt_unset dflags Opt_Haddock) Opt_KeepRawTokenStream)  -- Opt_Haddock cleared, Opt_KeepRawTokenStream set
+          go = do
+            ltok <- lexer return  -- obtain the next located token
+            case ltok of
+              L _ ITeof -> return []  -- stop at end of input; the ITeof token itself is not part of the result
+              _ -> liftM (ltok:) go  -- keep this token and prepend it to the remaining ones
  }