parseInteger->parseUnsignedInteger to clarify meaning

[ghc-hetmet.git] / compiler / parser / Lexer.x
diff --git a/compiler/parser/Lexer.x b/compiler/parser/Lexer.x

index aed9cfb..de025de 100644 (file)
--- a/compiler/parser/Lexer.x
+++ b/compiler/parser/Lexer.x
@@ -25,14 +25,16 @@ module Lexer (
     Token(..), lexer, pragState, mkPState, PState(..),
     P(..), ParseResult(..), getSrcLoc, 
     failLocMsgP, failSpanMsgP, srcParseFail,
+   getMessages,
     popContext, pushCurrentContext, setLastToken, setSrcLoc,
     getLexState, popLexState, pushLexState,
-   extension, bangPatEnabled
+   extension, glaExtsEnabled, bangPatEnabled
    ) where
  
  #include "HsVersions.h"
  
-import ErrUtils                ( Message )
+import Bag
+import ErrUtils
  import Outputable
  import StringBuffer
  import FastString
@@ -43,10 +45,11 @@ import DynFlags
  import Ctype
  import Util            ( maybePrefixMatch, readRational )
  
-import DATA_BITS
-import Data.Char       ( chr )
-import Ratio
---import TRACE
+import Control.Monad
+import Data.Bits
+import Data.Char       ( chr, isSpace )
+import Data.Ratio
+import Debug.Trace
  
  #if __GLASGOW_HASKELL__ >= 605
  import Data.Char       ( GeneralCategory(..), generalCategory, isPrint, isUpper )
@@ -55,29 +58,30 @@ import Compat.Unicode       ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  #endif
  }
  
-$unispace    = \x05
-$whitechar   = [\ \t\n\r\f\v\xa0 $unispace]
+$unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
+$whitechar   = [\ \n\r\f\v\xa0 $unispace]
  $white_no_nl = $whitechar # \n
+$tab         = \t
  
  $ascdigit  = 0-9
-$unidigit  = \x03
+$unidigit  = \x03 -- Trick Alex into handling Unicode. See alexGetChar.
  $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  $digit     = [$ascdigit $unidigit]
  
  $special   = [\(\)\,\;\[\]\`\{\}]
  $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~ \xa1-\xbf \xd7 \xf7]
-$unisymbol = \x04
+$unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
  $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  
-$unilarge  = \x01
+$unilarge  = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
  $asclarge  = [A-Z \xc0-\xd6 \xd8-\xde]
  $large     = [$asclarge $unilarge]
  
-$unismall  = \x02
+$unismall  = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
  $ascsmall  = [a-z \xdf-\xf6 \xf8-\xff]
  $small     = [$ascsmall $unismall \_]
  
-$unigraphic = \x06
+$unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
  $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
  
  $octit    = 0-7
@@ -86,6 +90,8 @@ $symchar   = [$symbol \:]
  $nl        = [\n\r]
  $idchar    = [$small $large $digit \']
  
+$docsym    = [\| \^ \* \$]
+
  @varid     = $small $idchar*
  @conid     = $large $idchar*
  
@@ -106,21 +112,54 @@ haskell :-
  
  -- everywhere: skip whitespace and comments
  $white_no_nl+                          ;
+$tab+         { warn Opt_WarnTabs (text "Tab character") }
  
  -- Everywhere: deal with nested comments.  We explicitly rule out
  -- pragmas, "{-#", so that we don't accidentally treat them as comments.
  -- (this can happen even though pragmas will normally take precedence due to
  -- longest-match, because pragmas aren't valid in every state, but comments
--- are).
-"{-" / { notFollowedBy '#' }           { nested_comment }
+-- are). We also rule out nested Haddock comments, if the -haddock flag is
+-- set.
+
+"{-" / { isNormalComment } { nested_comment lexToken }
  
  -- Single-line comments are a bit tricky.  Haskell 98 says that two or
  -- more dashes followed by a symbol should be parsed as a varsym, so we
  -- have to exclude those.
--- The regex says: "munch all the characters after the dashes, as long as
--- the first one is not a symbol".
-"--"\-* [^$symbol :] .*                        ;
-"--"\-* / { atEOL }                    ;
+
+-- Since Haddock comments aren't valid in every state, we need to rule them
+-- out here.  
+
+-- The following two rules match comments that begin with two dashes, but
+-- continue with a different character. The rules test that this character
+-- is not a symbol (in which case we'd have a varsym), and that it's not a
+-- space followed by a Haddock comment symbol (docsym) (in which case we'd
+-- have a Haddock comment). The rules then munch the rest of the line.
+
+"-- " ~$docsym .* ;
+"--" [^$symbol : \ ] .* ;
+
+-- Next, when the -haddock flag is off, skip Haddock comments like ordinary ones
+
+"-- " $docsym .* / { ifExtension (not . haddockEnabled) } ;
+
+-- Now, when we've matched comments that begin with 2 dashes and continue
+-- with a different character, we need to match comments that begin with three
+-- or more dashes (which clearly can't be Haddock comments). We only need to
+-- make sure that the first non-dash character isn't a symbol, and munch the
+-- rest of the line.
+
+"---"\-* [^$symbol :] .* ;
+
+-- Since the previous rules all match dashes followed by at least one
+-- character, we also need to match a whole line filled with just dashes.
+
+"--"\-* / { atEOL } ;
+
+-- We need this rule since none of the other single line comment rules
+-- actually match this case.
+
+"-- " / { atEOL } ;
  
  -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
  -- blank lines) until we find a non-whitespace character, then do layout
@@ -199,10 +238,15 @@ $white_no_nl+                             ;
    "{-#" $whitechar* (DEPRECATED|deprecated)
                                         { token ITdeprecated_prag }
    "{-#" $whitechar* (SCC|scc)          { token ITscc_prag }
+  "{-#" $whitechar* (GENERATED|generated)
+                                       { token ITgenerated_prag }
    "{-#" $whitechar* (CORE|core)                { token ITcore_prag }
    "{-#" $whitechar* (UNPACK|unpack)    { token ITunpack_prag }
  
-  "{-#"                                { nested_comment }
+  "{-#" $whitechar* (DOCOPTIONS|docoptions)
+  / { ifExtension haddockEnabled }     { lex_string_prag ITdocOptions }
+
+ "{-#"                                 { nested_comment lexToken }
  
    -- ToDo: should only be valid inside a pragma:
    "#-}"                                { token ITclose_prag}
@@ -218,12 +262,19 @@ $white_no_nl+                             ;
  
  <0,option_prags,glaexts> {
         -- This is to catch things like {-# OPTIONS OPTIONS_HUGS ... 
-  "{-#" $whitechar* $idchar+            { nested_comment }
+  "{-#" $whitechar* $idchar+           { nested_comment lexToken }
  }
  
  -- '0' state: ordinary lexemes
  -- 'glaexts' state: glasgow extensions (postfix '#', etc.)
  
+-- Haddock comments
+
+<0,glaexts> {
+  "-- " $docsym    / { ifExtension haddockEnabled } { multiline_doc_comment }
+  "{-" \ ? $docsym / { ifExtension haddockEnabled } { nested_doc_comment }
+}
+
  -- "special" symbols
  
  <0,glaexts> {
@@ -250,7 +301,6 @@ $white_no_nl+                               ;
  
  <0,glaexts> {
    \? @varid / { ifExtension ipEnabled }        { skip_one_varid ITdupipvarid }
-  \% @varid / { ifExtension ipEnabled } { skip_one_varid ITsplitipvarid }
  }
  
  <glaexts> {
@@ -329,11 +379,6 @@ $white_no_nl+                              ;
  }
  
  {
--- work around bug in Alex 2.0
-#if __GLASGOW_HASKELL__ < 503
-unsafeAt arr i = arr ! i
-#endif
-
  -- -----------------------------------------------------------------------------
  -- The token type
  
@@ -344,6 +389,7 @@ data Token
    | ITdata
    | ITdefault
    | ITderiving
+  | ITderive
    | ITdo
    | ITelse
    | IThiding
@@ -376,7 +422,7 @@ data Token
    | ITccallconv
    | ITdotnet
    | ITmdo
-  | ITiso
+  | ITfamily
  
         -- Pragmas
    | ITinline_prag Bool         -- True <=> INLINE, False <=> NOINLINE
@@ -387,6 +433,7 @@ data Token
    | ITdeprecated_prag
    | ITline_prag
    | ITscc_prag
+  | ITgenerated_prag
    | ITcore_prag                 -- hdaume: core annotations
    | ITunpack_prag
    | ITclose_prag
@@ -441,7 +488,6 @@ data Token
    | ITqconsym (FastString,FastString)
  
    | ITdupipvarid   FastString  -- GHC extension: implicit param: ?x
-  | ITsplitipvarid FastString  -- GHC extension: implicit param: %x
  
    | ITpragma StringBuffer
  
@@ -479,6 +525,14 @@ data Token
  
    | ITunknown String           -- Used when the lexer can't make sense of it
    | ITeof                      -- end of file token
+
+  -- Documentation annotations
+  | ITdocCommentNext  String     -- something beginning '-- |'
+  | ITdocCommentPrev  String     -- something beginning '-- ^'
+  | ITdocCommentNamed String     -- something beginning '-- $'
+  | ITdocSection      Int String -- a section heading
+  | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
+
  #ifdef DEBUG
    deriving Show -- debugging
  #endif
@@ -489,6 +543,7 @@ isSpecial :: Token -> Bool
  -- not as a keyword.
  isSpecial ITas         = True
  isSpecial IThiding     = True
+isSpecial ITderive     = True
  isSpecial ITqualified  = True
  isSpecial ITforall     = True
  isSpecial ITexport     = True
@@ -500,7 +555,7 @@ isSpecial ITunsafe          = True
  isSpecial ITccallconv   = True
  isSpecial ITstdcallconv = True
  isSpecial ITmdo                = True
-isSpecial ITiso                = True
+isSpecial ITfamily     = True
  isSpecial _             = False
  
  -- the bitmap provided as the third component indicates whether the
@@ -519,6 +574,7 @@ reservedWordsFM = listToUFM $
         ( "data",       ITdata,         0 ),     
         ( "default",    ITdefault,      0 ),  
         ( "deriving",   ITderiving,     0 ), 
+       ( "derive",     ITderive,       0 ), 
         ( "do",         ITdo,           0 ),       
         ( "else",       ITelse,         0 ),     
         ( "hiding",     IThiding,       0 ),
@@ -541,7 +597,7 @@ reservedWordsFM = listToUFM $
  
         ( "forall",     ITforall,        bit tvBit),
         ( "mdo",        ITmdo,           bit glaExtsBit),
-       ( "iso",        ITiso,           bit glaExtsBit),
+       ( "family",     ITfamily,        bit tyFamBit),
  
         ( "foreign",    ITforeign,       bit ffiBit),
         ( "export",     ITexport,        bit ffiBit),
@@ -575,8 +631,9 @@ reservedSymsFM = listToUFM $
         ,("-",  ITminus,        0)
         ,("!",  ITbang,         0)
  
-       ,("*",  ITstar,         bit glaExtsBit) -- For data T (a::*) = MkT
-       ,(".",  ITdot,          bit tvBit)      -- For 'forall a . t'
+       ,("*",  ITstar,         bit glaExtsBit .|. 
+                               bit tyFamBit)       -- For data T (a::*) = MkT
+       ,(".",  ITdot,          bit tvBit)          -- For 'forall a . t'
  
         ,("-<", ITlarrowtail,   bit arrowsBit)
         ,(">-", ITrarrowtail,   bit arrowsBit)
@@ -584,7 +641,6 @@ reservedSymsFM = listToUFM $
         ,(">>-",        ITRarrowtail,   bit arrowsBit)
  
  #if __GLASGOW_HASKELL__ >= 605
-       ,("λ", ITlam,          bit glaExtsBit)
         ,("∷",   ITdcolon,       bit glaExtsBit)
         ,("⇒",   ITdarrow,    bit glaExtsBit)
         ,("∀",        ITforall,       bit glaExtsBit)
@@ -634,43 +690,152 @@ pop _span _buf _len = do popLexState; lexToken
  pop_and :: Action -> Action
  pop_and act span buf len = do popLexState; act span buf len
  
-notFollowedBy char _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf /= char
+{-# INLINE nextCharIs #-}
+  -- Does the buffer have a next character, and does that character satisfy
+  -- 'p'?  An exhausted buffer never satisfies the test.
+nextCharIs buf p
+  | atEnd buf = False
+  | otherwise = p (currentChar buf)
+
+notFollowedBy char _ _ _ (AI _ _ buf)
+  -- Alex predicate: holds when the buffer does NOT continue with 'char'.
+  -- End of input counts as "not followed by 'char'", so this is the
+  -- negation of "the next char is 'char'".  Testing "a next char exists
+  -- and differs" instead would wrongly fail at the end of the buffer.
+  = not (nextCharIs buf (== char))
  
  notFollowedBySymbol _ _ _ (AI _ _ buf)
-  = atEnd buf || currentChar buf `notElem` "!#$%&*+./<=>?@\\^|-~"
+  = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
+
+-- We must reject doc comments as being ordinary comments everywhere.
+-- In some cases the doc comment will be selected as the lexeme due to
+-- maximal munch, but not always, because the nested comment rule is
+-- valid in all states, but the doc-comment rules are only valid in
+-- the non-layout states.
+isNormalComment bits _ _ (AI _ _ buf)
+  -- With Haddock lexing on: a "{-" opens an ordinary comment unless it is
+  -- followed, after at most one space, by a doc marker or by '#' (pragma).
+  | haddockEnabled bits = notFollowedByDocOrPragma
+  -- With Haddock lexing off: only "{-#" is excluded.  A "{-" sitting at the
+  -- very end of the buffer must still count as a comment opener (so that
+  -- nested_comment gets to report it as unterminated), hence the negated
+  -- test instead of "a next char exists and is not '#'".
+  | otherwise           = not (nextCharIs buf (== '#'))
+  where
+    notFollowedByDocOrPragma
+       = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
+
+spaceAndP buf p
+  -- Try 'p' at the current position; failing that, try it once more just
+  -- past a single leading space, if the next character is a space.
+  | p buf                   = True
+  | nextCharIs buf (== ' ') = p (snd (nextChar buf))
+  | otherwise               = False
+
+haddockDisabledAnd p bits _ _ (AI _ _ buf)
+  -- Alex predicate: holds when the Haddock bit is clear in the extension
+  -- bitmap and 'p' holds for the given buffer.
+  = not (haddockEnabled bits) && p buf
  
  atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
  
  ifExtension pred bits _ _ _ = pred bits
  
+multiline_doc_comment :: Action
+  -- Lex a "-- <docsym>" documentation comment, which may continue over
+  -- several consecutive "--" lines.  withLexedDocType chooses the token
+  -- constructor from the doc symbol and supplies the 'oneLine' flag.
+multiline_doc_comment span buf _len = withLexedDocType (worker "")
+  where
+    -- 'commentAcc' holds the comment text in reverse order; docCommentEnd
+    -- reverses it when it builds the token.
+    worker commentAcc input docType oneLine = case alexGetChar input of
+      Just ('\n', input') 
+        -- Whenever the comment ends here, it ends at 'input', i.e. before
+        -- the newline, so the newline itself is left unconsumed.
+        | oneLine -> docCommentEnd input commentAcc docType buf span
+        | otherwise -> case checkIfCommentLine input' of
+          -- The following line is another "--" line: record the newline
+          -- and carry on from just after its two dashes.
+          Just input -> worker ('\n':commentAcc) input docType False
+          Nothing -> docCommentEnd input commentAcc docType buf span
+      Just (c, input) -> worker (c:commentAcc) input docType oneLine
+      Nothing -> docCommentEnd input commentAcc docType buf span
+      
+    -- If the line starts (after optional non-newline whitespace) with
+    -- exactly two dashes followed by some character other than '-',
+    -- return the input positioned right after those two dashes;
+    -- otherwise (including end of input) return Nothing.
+    checkIfCommentLine input = check (dropNonNewlineSpace input)
+      where
+        check input = case alexGetChar input of
+          Just ('-', input) -> case alexGetChar input of
+            Just ('-', input) -> case alexGetChar input of
+              -- the third character is only inspected, not consumed
+              Just (c, _) | c /= '-' -> Just input
+              _ -> Nothing
+            _ -> Nothing
+          _ -> Nothing
+
+        -- Skip whitespace but stop at a newline or at end of input.
+        dropNonNewlineSpace input = case alexGetChar input of
+          Just (c, input') 
+            | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
+            | otherwise -> input
+          Nothing -> input
+
  {-
    nested comments require traversing by hand, they can't be parsed
    using regular expressions.
  -}
-nested_comment :: Action
-nested_comment span _str _len = do
+nested_comment :: P (Located Token) -> Action
+  -- Skip the rest of an ordinary nested comment, starting from the current
+  -- input with one "{-" already open, then run 'cont'.  Callers in this
+  -- file pass lexToken, or a continuation that resumes an enclosing doc
+  -- comment.
+nested_comment cont span _str _len = do
    input <- getInput
    go 1 input
-  where go 0 input = do setInput input; lexToken
-       go n input = do
-         case alexGetChar input of
-           Nothing  -> err input
-           Just (c,input) -> do
-             case c of
-               '-' -> do
-                 case alexGetChar input of
-                   Nothing  -> err input
-                   Just ('\125',input) -> go (n-1) input
-                   Just (c,_)          -> go n input
-               '\123' -> do
-                 case alexGetChar input of
-                   Nothing  -> err input
-                   Just ('-',input') -> go (n+1) input'
-                   Just (c,input)    -> go n input
-               c -> go n input
-
-        err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
-
+  where
+    -- 'n' counts the "{-" openers still waiting for their "-}".
+    -- '\123' is '{' and '\125' is '}'.
+    go 0 input = do setInput input; cont
+    go n input = case alexGetChar input of
+      Nothing -> errBrace input span
+      Just ('-',input) -> case alexGetChar input of
+        Nothing  -> errBrace input span
+        Just ('\125',input) -> go (n-1) input
+        -- Not "-}": the character after '-' is only peeked at, so it is
+        -- examined again on the next step (this lets "--}" close).
+        Just (c,_)          -> go n input
+      Just ('\123',input) -> case alexGetChar input of
+        Nothing  -> errBrace input span
+        Just ('-',input) -> go (n+1) input
+        -- Not "{-": likewise, the character after '{' is not consumed.
+        Just (c,_)       -> go n input
+      Just (c,input) -> go n input
+
+nested_doc_comment :: Action
+  -- Lex a "{- <docsym> ... -}" documentation comment.  withLexedDocType
+  -- chooses the token constructor; the Bool it passes is ignored here.
+nested_doc_comment span buf _len = withLexedDocType (go "")
+  where
+    -- 'commentAcc' holds the comment text in reverse order; docCommentEnd
+    -- reverses it.  '\123' is '{' and '\125' is '}'.
+    go commentAcc input docType _ = case alexGetChar input of
+      Nothing -> errBrace input span
+      Just ('-',input) -> case alexGetChar input of
+        Nothing -> errBrace input span
+        -- "-}" closes the doc comment; 'end' and 'buf2' are bound but unused
+        Just ('\125',input@(AI end _ buf2)) ->
+          docCommentEnd input commentAcc docType buf span
+        -- a lone '-': keep it and re-examine the following character
+        Just (c,_) -> go ('-':commentAcc) input docType False
+      Just ('\123', input) -> case alexGetChar input of
+        Nothing  -> errBrace input span
+        -- An ordinary "{-" comment nested inside the doc comment: let
+        -- nested_comment skip it, then resume with the accumulator
+        -- unchanged, so the inner comment's text is not recorded.
+        Just ('-',input) -> do
+          setInput input
+          let cont = do input <- getInput; go commentAcc input docType False
+          nested_comment cont span buf _len
+        -- a lone '{': keep it and re-examine the following character
+        Just (c,_) -> go ('\123':commentAcc) input docType False
+      Just (c,input) -> go (c:commentAcc) input docType False
+
+withLexedDocType lexDocComment = do
+  -- Look at the character just before the current input position to see
+  -- which doc symbol introduced the comment (presumably the last character
+  -- of the lexeme matched by the doc-comment rules).
+  input@(AI _ _ buf) <- getInput
+  case prevChar buf ' ' of
+    -- The Bool handed to lexDocComment is True only for section headings;
+    -- multiline_doc_comment reads it as "stop at the end of this line".
+    '|' -> lexDocComment input ITdocCommentNext False
+    '^' -> lexDocComment input ITdocCommentPrev False
+    '$' -> lexDocComment input ITdocCommentNamed False
+    '*' -> lexDocSection 1 input
+    -- The doc-comment rules only fire after a $docsym, so no other
+    -- character is expected here.  Fail with a descriptive message rather
+    -- than an anonymous pattern-match failure if that ever changes.
+    c   -> error ("withLexedDocType: unexpected doc comment marker " ++ show c)
+ where
+    -- Count the run of '*' characters; the count becomes the Int carried
+    -- by ITdocSection.
+    lexDocSection n input = case alexGetChar input of
+      Just ('*', input) -> lexDocSection (n+1) input
+      Just (c, _) -> lexDocComment input (ITdocSection n) True
+      Nothing -> do setInput input; lexToken -- eof reached, lex it normally
+
+-- docCommentEnd
+-------------------------------------------------------------------------------
+-- This function is quite tricky. We can't just return a new token, we also
+-- need to update the state of the parser. Why? Because the token is longer
+-- than what was lexed by Alex, and the lexToken function doesn't know this, so 
+-- it writes the wrong token length to the parser state. This function is
+-- called afterwards, so it can just update the state. 
+
+-- This is complicated by the fact that Haddock tokens can span multiple lines, 
+-- which is something that the original lexer didn't account for. 
+-- I have added last_line_len in the parser state which represents the length 
+-- of the part of the token that is on the last line. It is now used for layout 
+-- calculation in pushCurrentContext instead of last_len. last_len is, like it 
+-- was before, the full length of the token, and it is now only used for error
+-- messages. /Waern 
+
+docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
+                 SrcSpan -> P (Located Token)
+  -- Finish a doc comment: commit 'input' as the new lexer position, record
+  -- the real extent of the token in the parser state, and return the token
+  -- built from the (reversed) accumulated text.
+docCommentEnd input commentAcc docType buf span = do
+  setInput input
+  span `seq` setLastToken fullSpan tokLen lastLineLen
+  return (L fullSpan (docType (reverse commentAcc)))
+  where
+    AI endLoc endOffs endBuf = input
+    -- from the start of what Alex matched to where the comment really ends
+    fullSpan = mkSrcSpan (srcSpanStart span) endLoc
+    tokLen   = byteDiff buf endBuf
+    -- if the token is longer than the current line offset, only the
+    -- offset is used as the length of the part on the last line
+    lastLineLen
+      | endOffs - tokLen < 0 = endOffs
+      | otherwise            = tokLen
+ 
+errBrace (AI stopLoc _ _) span
+  -- Fail with the "unterminated" message, located from the start of 'span'
+  -- to the location carried by the given input.
+  = failLocMsgP startLoc stopLoc "unterminated `{-'"
+  where
+    startLoc = srcSpanStart span
+ 
  open_brace, close_brace :: Action
  open_brace span _str _len = do 
    ctx <- getContext
@@ -770,22 +935,22 @@ sym con span buf len =
         fs = lexemeToFastString buf len
  
  tok_decimal span buf len 
-  = return (L span (ITinteger  $! parseInteger buf len 10 octDecDigit))
+  = return (L span (ITinteger  $! parseUnsignedInteger buf len 10 octDecDigit))
  
  tok_octal span buf len 
-  = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 8 octDecDigit))
+  = return (L span (ITinteger  $! parseUnsignedInteger (offsetBytes 2 buf) (len-2) 8 octDecDigit))
  
  tok_hexadecimal span buf len 
-  = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 16 hexDigit))
+  = return (L span (ITinteger  $! parseUnsignedInteger (offsetBytes 2 buf) (len-2) 16 hexDigit))
  
  prim_decimal span buf len 
-  = return (L span (ITprimint  $! parseInteger buf (len-1) 10 octDecDigit))
+  = return (L span (ITprimint  $! parseUnsignedInteger buf (len-1) 10 octDecDigit))
  
  prim_octal span buf len 
-  = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 8 octDecDigit))
+  = return (L span (ITprimint  $! parseUnsignedInteger (offsetBytes 2 buf) (len-3) 8 octDecDigit))
  
  prim_hexadecimal span buf len 
-  = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 16 hexDigit))
+  = return (L span (ITprimint  $! parseUnsignedInteger (offsetBytes 2 buf) (len-3) 16 hexDigit))
  
  tok_float        str = ITrational   $! readRational str
  prim_float       str = ITprimfloat  $! readRational str
@@ -857,7 +1022,7 @@ do_layout_left span _buf _len = do
  
  setLine :: Int -> Action
  setLine code span buf len = do
-  let line = parseInteger buf len 10 octDecDigit
+  let line = parseUnsignedInteger buf len 10 octDecDigit
    setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
         -- subtract one: the line number refers to the *following* line
    popLexState
@@ -1137,11 +1302,20 @@ getCharOrFail =  do
         Just (c,i)  -> do setInput i; return c
  
  -- -----------------------------------------------------------------------------
+-- Warnings
+
+warn :: DynFlag -> SDoc -> Action
+  -- Lexer action: file 'warning' against the matched span (addWarning
+  -- decides whether 'option' lets it through), then lex the next token.
+warn option warning span _buf _len =
+    addWarning option msg >> lexToken
+  where
+    msg = mkWarnMsg span alwaysQualify warning
+
+-- -----------------------------------------------------------------------------
  -- The Parse Monad
  
  data LayoutContext
    = NoLayout
    | Layout !Int
+  deriving Show
  
  data ParseResult a
    = POk PState a
@@ -1153,11 +1327,14 @@ data ParseResult a
  
  data PState = PState { 
         buffer     :: StringBuffer,
+    dflags     :: DynFlags,
+    messages   :: Messages,
          last_loc   :: SrcSpan, -- pos of previous token
          last_offs  :: !Int,    -- offset of the previous token from the
                                 -- beginning of  the current line.
                                 -- \t is equal to 8 spaces.
         last_len   :: !Int,     -- len of previous token
+  last_line_len :: !Int,
          loc        :: SrcLoc,   -- current loc (end of prev token + 1)
         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
         context    :: [LayoutContext],
@@ -1209,8 +1386,12 @@ setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
  getSrcLoc :: P SrcLoc
  getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
  
-setLastToken :: SrcSpan -> Int -> P ()
-setLastToken loc len = P $ \s -> POk s{ last_loc=loc, last_len=len } ()
+setLastToken :: SrcSpan -> Int -> Int -> P ()
+  -- Store the span of the latest token together with its full length and
+  -- the length of the part of it that lies on its last line.
+setLastToken loc len line_len = P remember
+  where
+    remember st =
+      POk st{ last_loc = loc, last_len = len, last_line_len = line_len } ()
  
  data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
  
@@ -1238,6 +1419,9 @@ alexGetChar (AI loc ofs s)
         adj_c 
           | c <= '\x06' = non_graphic
           | c <= '\xff' = c
+          -- Alex doesn't handle Unicode, so when a Unicode
+          -- character is encountered we output these values
+          -- with the actual character value hidden in the state.
           | otherwise = 
                 case generalCategory c of
                   UppercaseLetter       -> upper
@@ -1311,6 +1495,8 @@ ipBit      = 6
  tvBit     = 7  -- Scoped type variables enables 'forall' keyword
  bangPatBit = 8 -- Tells the parser to understand bang-patterns
                 -- (doesn't affect the lexer)
+tyFamBit   = 9 -- indexed type families: 'family' keyword and kind sigs
+haddockBit = 10 -- Lex and parse Haddock comments
  
  glaExtsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
  glaExtsEnabled flags = testBit flags glaExtsBit
@@ -1321,20 +1507,27 @@ thEnabled      flags = testBit flags thBit
  ipEnabled      flags = testBit flags ipBit
  tvEnabled      flags = testBit flags tvBit
  bangPatEnabled flags = testBit flags bangPatBit
+tyFamEnabled   flags = testBit flags tyFamBit
+haddockEnabled flags = testBit flags haddockBit
  
  -- PState for parsing options pragmas
  --
  pragState :: StringBuffer -> SrcLoc -> PState
  pragState buf loc  = 
    PState {
-      buffer    = buf,
-      last_loc   = mkSrcSpan loc loc,
-      last_offs  = 0,
-      last_len   = 0,
-      loc        = loc,
-      extsBitmap = 0,
-      context    = [],
-      lex_state  = [bol, option_prags, 0]
+      buffer         = buf,
+      messages      = emptyMessages,
+      -- XXX defaultDynFlags is not right, but we don't have a real
+      -- dflags handy
+      dflags        = defaultDynFlags,
+      last_loc      = mkSrcSpan loc loc,
+      last_offs     = 0,
+      last_len      = 0,
+      last_line_len = 0,
+      loc           = loc,
+      extsBitmap    = 0,
+      context       = [],
+      lex_state     = [bol, option_prags, 0]
      }
  
  
@@ -1343,30 +1536,44 @@ pragState buf loc  =
  mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
  mkPState buf loc flags  = 
    PState {
-      buffer    = buf,
-      last_loc   = mkSrcSpan loc loc,
-      last_offs  = 0,
-      last_len   = 0,
-      loc        = loc,
-      extsBitmap = fromIntegral bitmap,
-      context    = [],
-      lex_state  = [bol, if glaExtsEnabled bitmap then glaexts else 0]
+      buffer         = buf,
+      dflags        = flags,
+      messages      = emptyMessages,
+      last_loc      = mkSrcSpan loc loc,
+      last_offs     = 0,
+      last_len      = 0,
+      last_line_len = 0,
+      loc           = loc,
+      extsBitmap    = fromIntegral bitmap,
+      context       = [],
+      lex_state     = [bol, if glaExtsEnabled bitmap then glaexts else 0]
         -- we begin in the layout state if toplev_layout is set
      }
      where
-      bitmap =     glaExtsBit `setBitIf` dopt Opt_GlasgowExts flags
-              .|. ffiBit     `setBitIf` dopt Opt_FFI         flags
-              .|. parrBit    `setBitIf` dopt Opt_PArr        flags
-              .|. arrowsBit  `setBitIf` dopt Opt_Arrows      flags
-              .|. thBit      `setBitIf` dopt Opt_TH          flags
+      bitmap =     glaExtsBit `setBitIf` dopt Opt_GlasgowExts  flags
+              .|. ffiBit     `setBitIf` dopt Opt_FFI          flags
+              .|. parrBit    `setBitIf` dopt Opt_PArr         flags
+              .|. arrowsBit  `setBitIf` dopt Opt_Arrows       flags
+              .|. thBit      `setBitIf` dopt Opt_TH           flags
                .|. ipBit      `setBitIf` dopt Opt_ImplicitParams flags
                .|. tvBit      `setBitIf` dopt Opt_ScopedTypeVariables flags
                .|. bangPatBit `setBitIf` dopt Opt_BangPatterns flags
+              .|. tyFamBit   `setBitIf` dopt Opt_TypeFamilies flags
+              .|. haddockBit `setBitIf` dopt Opt_Haddock      flags
        --
        setBitIf :: Int -> Bool -> Int
        b `setBitIf` cond | cond      = bit b
                         | otherwise = 0
  
+addWarning :: DynFlag -> WarnMsg -> P ()
+  -- Append 'w' to the warnings held in the parser state, but only when
+  -- 'option' is set in the state's DynFlags.  The errors are left alone.
+addWarning option w = P record
+  where
+    record st@PState{messages=(warns,errs), dflags=flags}
+      = POk st{messages=(warns', errs)} ()
+      where
+        warns' | dopt option flags = warns `snocBag` w
+               | otherwise         = warns
+
+getMessages :: PState -> Messages
+  -- Read the accumulated messages out of a parser state.
+getMessages st = messages st
+
  getContext :: P [LayoutContext]
  getContext = P $ \s@PState{context=ctx} -> POk s ctx
  
@@ -1384,8 +1591,9 @@ popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
  -- This is only used at the outer level of a module when the 'module'
  -- keyword is missing.
  pushCurrentContext :: P ()
-pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_len=len, context=ctx } ->
-  POk s{context = Layout (offs-len) : ctx} ()
+  -- Push a layout context at column (last_offs - last_line_len).
+  -- last_line_len is used rather than last_len so that a token spanning
+  -- several lines (a Haddock comment) contributes only the part that lies
+  -- on its last line.
+pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_line_len=len, context=ctx } -> 
+    POk s{context = Layout (offs-len) : ctx} ()
+--trace ("off: " ++ show offs ++ ", len: " ++ show len) $ POk s{context = Layout (offs-len) : ctx} ()
  
  getOffside :: P Ordering
  getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
@@ -1431,8 +1639,8 @@ lexError str = do
  
  lexer :: (Located Token -> P a) -> P a
  lexer cont = do
-  tok@(L _ tok__) <- lexToken
-  --trace ("token: " ++ show tok__) $ do
+  tok@(L span tok__) <- lexToken
+--  trace ("token: " ++ show tok__) $ do
    cont tok
  
  lexToken :: P (Located Token)
@@ -1442,7 +1650,7 @@ lexToken = do
    exts <- getExts
    case alexScanUser exts inp sc of
      AlexEOF -> do let span = mkSrcSpan loc1 loc1
-                 setLastToken span 0
+                 setLastToken span 0 0
                   return (L span ITeof)
      AlexError (AI loc2 _ buf) -> do 
         reportLexError loc1 loc2 buf "lexical error"
@@ -1450,11 +1658,11 @@ lexToken = do
         setInput inp2
         lexToken
      AlexToken inp2@(AI end _ buf2) len t -> do
-       setInput inp2
-       let span = mkSrcSpan loc1 end
-       let bytes = byteDiff buf buf2
-       span `seq` setLastToken span bytes
-       t span buf bytes
+    setInput inp2
+    let span = mkSrcSpan loc1 end
+    let bytes = byteDiff buf buf2
+    span `seq` setLastToken span bytes bytes
+    t span buf bytes
  
  reportLexError loc1 loc2 buf str
    | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")