X-Git-Url: http://git.megacz.com/?p=ghc-hetmet.git;a=blobdiff_plain;f=compiler%2Fparser%2FLexer.x;h=316f21f1e9a9368082e73207a421aad51703f437;hp=aed9cfb9655d68ce3a42cc291ee97af6b3693566;hb=84923cc7de2a93c22a2f72daf9ac863959efae13;hpb=72264dbcb05c7045dff28aa88b55634fa6c1ddf0

diff --git a/compiler/parser/Lexer.x b/compiler/parser/Lexer.x
index aed9cfb..316f21f 100644
--- a/compiler/parser/Lexer.x
+++ b/compiler/parser/Lexer.x
@@ -27,7 +27,7 @@ module Lexer (
    failLocMsgP, failSpanMsgP, srcParseFail,
    popContext, pushCurrentContext, setLastToken, setSrcLoc,
    getLexState, popLexState, pushLexState,
-   extension, bangPatEnabled
+   extension, glaExtsEnabled, bangPatEnabled
   ) where
 
 #include "HsVersions.h"
@@ -43,10 +43,10 @@ import DynFlags
 import Ctype
 import Util		( maybePrefixMatch, readRational )
 
-import DATA_BITS
-import Data.Char 	( chr )
-import Ratio
---import TRACE
+import Data.Bits
+import Data.Char 	( chr, isSpace )
+import Data.Ratio
+import Debug.Trace
 
 #if __GLASGOW_HASKELL__ >= 605
 import Data.Char 	( GeneralCategory(..), generalCategory, isPrint, isUpper )
@@ -86,6 +86,8 @@ $symchar   = [$symbol \:]
 $nl        = [\n\r]
 $idchar    = [$small $large $digit \']
 
+$docsym    = [\| \^ \* \$]
+
 @varid     = $small $idchar*
 @conid     = $large $idchar*
 
@@ -111,16 +113,48 @@ $white_no_nl+ 				;
 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 -- (this can happen even though pragmas will normally take precedence due to
 -- longest-match, because pragmas aren't valid in every state, but comments
--- are).
-"{-" / { notFollowedBy '#' }		{ nested_comment }
+-- are). We also rule out nested Haddock comments, if the -haddock flag is
+-- set.
+
+"{-" / { isNormalComment } { nested_comment lexToken }
 
 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 -- more dashes followed by a symbol should be parsed as a varsym, so we
 -- have to exclude those.
--- The regex says: "munch all the characters after the dashes, as long as
--- the first one is not a symbol".
-"--"\-* [^$symbol :] .*			;
-"--"\-* / { atEOL }			;
+
+-- Since Haddock comments aren't valid in every state, we need to rule them
+-- out here.  
+
+-- The following two rules match comments that begin with two dashes, but
+-- continue with a different character. The rules test that this character
+-- is not a symbol (in which case we'd have a varsym), and that it's not a
+-- space followed by a Haddock comment symbol (docsym) (in which case we'd
+-- have a Haddock comment). The rules then munch the rest of the line.
+
+"-- " ~$docsym .* ;
+"--" [^$symbol : \ ] .* ;
+
+-- Next, match Haddock comments if no -haddock flag
+
+"-- " $docsym .* / { ifExtension (not . haddockEnabled) } ;
+
+-- Now, when we've matched comments that begin with 2 dashes and continue
+-- with a different character, we need to match comments that begin with three
+-- or more dashes (which clearly can't be Haddock comments). We only need to
+-- make sure that the first non-dash character isn't a symbol, and munch the
+-- rest of the line.
+
+"---"\-* [^$symbol :] .* ;
+
+-- Since the previous rules all match dashes followed by at least one
+-- character, we also need to match a whole line filled with just dashes.
+
+"--"\-* / { atEOL } ;
+
+-- We need this rule since none of the other single line comment rules
+-- actually match this case.
+
+"-- " / { atEOL } ;
 
 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 -- blank lines) until we find a non-whitespace character, then do layout
@@ -199,10 +233,15 @@ $white_no_nl+ 				;
   "{-#" $whitechar* (DEPRECATED|deprecated)
   					{ token ITdeprecated_prag }
   "{-#" $whitechar* (SCC|scc)		{ token ITscc_prag }
+  "{-#" $whitechar* (GENERATED|generated)
+  					{ token ITgenerated_prag }
   "{-#" $whitechar* (CORE|core)		{ token ITcore_prag }
   "{-#" $whitechar* (UNPACK|unpack)	{ token ITunpack_prag }
 
-  "{-#" 				{ nested_comment }
+  "{-#" $whitechar* (DOCOPTIONS|docoptions)
+  / { ifExtension haddockEnabled }     { lex_string_prag ITdocOptions }
+
+ "{-#"                                 { nested_comment lexToken }
 
   -- ToDo: should only be valid inside a pragma:
   "#-}" 				{ token ITclose_prag}
@@ -218,12 +257,19 @@ $white_no_nl+ 				;
 
 <0,option_prags,glaexts> {
 	-- This is to catch things like {-# OPTIONS OPTIONS_HUGS ... 
-  "{-#" $whitechar* $idchar+            { nested_comment }
+  "{-#" $whitechar* $idchar+		{ nested_comment lexToken }
 }
 
 -- '0' state: ordinary lexemes
 -- 'glaexts' state: glasgow extensions (postfix '#', etc.)
 
+-- Haddock comments
+
+<0,glaexts> {
+  "-- " / $docsym    { multiline_doc_comment }
+  "{-" \ ? / $docsym { nested_doc_comment }
+}
+
 -- "special" symbols
 
 <0,glaexts> {
@@ -250,7 +296,6 @@ $white_no_nl+ 				;
 
 <0,glaexts> {
   \? @varid / { ifExtension ipEnabled }	{ skip_one_varid ITdupipvarid }
-  \% @varid / { ifExtension ipEnabled } { skip_one_varid ITsplitipvarid }
 }
 
 <glaexts> {
@@ -344,6 +389,7 @@ data Token
   | ITdata
   | ITdefault
   | ITderiving
+  | ITderived
   | ITdo
   | ITelse
   | IThiding
@@ -377,6 +423,7 @@ data Token
   | ITdotnet
   | ITmdo
   | ITiso
+  | ITfamily
 
 	-- Pragmas
   | ITinline_prag Bool		-- True <=> INLINE, False <=> NOINLINE
@@ -387,6 +434,7 @@ data Token
   | ITdeprecated_prag
   | ITline_prag
   | ITscc_prag
+  | ITgenerated_prag
   | ITcore_prag                 -- hdaume: core annotations
   | ITunpack_prag
   | ITclose_prag
@@ -441,7 +489,6 @@ data Token
   | ITqconsym (FastString,FastString)
 
   | ITdupipvarid   FastString	-- GHC extension: implicit param: ?x
-  | ITsplitipvarid FastString	-- GHC extension: implicit param: %x
 
   | ITpragma StringBuffer
 
@@ -479,6 +526,14 @@ data Token
 
   | ITunknown String		-- Used when the lexer can't make sense of it
   | ITeof			-- end of file token
+
+  -- Documentation annotations
+  | ITdocCommentNext  String     -- something beginning '-- |'
+  | ITdocCommentPrev  String     -- something beginning '-- ^'
+  | ITdocCommentNamed String     -- something beginning '-- $'
+  | ITdocSection      Int String -- a section heading
+  | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
+
 #ifdef DEBUG
   deriving Show -- debugging
 #endif
@@ -489,6 +544,7 @@ isSpecial :: Token -> Bool
 -- not as a keyword.
 isSpecial ITas        	= True
 isSpecial IThiding    	= True
+isSpecial ITderived    	= True
 isSpecial ITqualified 	= True
 isSpecial ITforall    	= True
 isSpecial ITexport    	= True
@@ -501,6 +557,7 @@ isSpecial ITccallconv   = True
 isSpecial ITstdcallconv = True
 isSpecial ITmdo		= True
 isSpecial ITiso		= True
+isSpecial ITfamily	= True
 isSpecial _             = False
 
 -- the bitmap provided as the third component indicates whether the
@@ -519,6 +576,7 @@ reservedWordsFM = listToUFM $
 	( "data",	ITdata, 	0 ),     
 	( "default",	ITdefault, 	0 ),  
 	( "deriving",	ITderiving, 	0 ), 
+	( "derived",	ITderived, 	0 ), 
 	( "do",		ITdo, 		0 ),       
 	( "else",	ITelse, 	0 ),     
 	( "hiding",	IThiding, 	0 ),
@@ -541,7 +599,7 @@ reservedWordsFM = listToUFM $
 
       	( "forall",	ITforall,	 bit tvBit),
 	( "mdo",	ITmdo,		 bit glaExtsBit),
-	( "iso",	ITiso,		 bit glaExtsBit),
+	( "family",	ITfamily,	 bit idxTysBit),
 
 	( "foreign",	ITforeign,	 bit ffiBit),
 	( "export",	ITexport,	 bit ffiBit),
@@ -575,8 +633,9 @@ reservedSymsFM = listToUFM $
        ,("-",	ITminus, 	0)
        ,("!",	ITbang, 	0)
 
-       ,("*",	ITstar,		bit glaExtsBit)	-- For data T (a::*) = MkT
-       ,(".",	ITdot,		bit tvBit)	-- For 'forall a . t'
+       ,("*",	ITstar,		bit glaExtsBit .|. 
+				bit idxTysBit)	    -- For data T (a::*) = MkT
+       ,(".",	ITdot,		bit tvBit)	    -- For 'forall a . t'
 
        ,("-<",	ITlarrowtail,	bit arrowsBit)
        ,(">-",	ITrarrowtail,	bit arrowsBit)
@@ -634,43 +693,148 @@ pop _span _buf _len = do popLexState; lexToken
 pop_and :: Action -> Action
 pop_and act span buf len = do popLexState; act span buf len
 
-notFollowedBy char _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf /= char
+{-# INLINE nextCharIs #-}
+nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
+
+notFollowedBy char _ _ _ (AI _ _ buf) 
+  = nextCharIs buf (/=char)
 
 notFollowedBySymbol _ _ _ (AI _ _ buf)
-  = atEnd buf || currentChar buf `notElem` "!#$%&*+./<=>?@\\^|-~"
+  = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
+
+isNormalComment bits _ _ (AI _ _ buf)
+  | haddockEnabled bits = notFollowedByDocOrPragma
+  | otherwise           = nextCharIs buf (/='#')
+  where 
+    notFollowedByDocOrPragma 
+	= not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
+
+spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
+
+haddockDisabledAnd p bits _ _ (AI _ _ buf)
+  = if haddockEnabled bits then False else (p buf)
 
 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 
 ifExtension pred bits _ _ _ = pred bits
 
+multiline_doc_comment :: Action
+multiline_doc_comment span buf _len = withLexedDocType (worker "")
+  where
+    worker commentAcc input docType oneLine = case alexGetChar input of
+      Just ('\n', input') 
+        | oneLine -> docCommentEnd input commentAcc docType buf span
+        | otherwise -> case checkIfCommentLine input' of
+          Just input -> worker ('\n':commentAcc) input docType False
+          Nothing -> docCommentEnd input commentAcc docType buf span
+      Just (c, input) -> worker (c:commentAcc) input docType oneLine
+      Nothing -> docCommentEnd input commentAcc docType buf span
+      
+    checkIfCommentLine input = check (dropNonNewlineSpace input)
+      where
+        check input = case alexGetChar input of
+          Just ('-', input) -> case alexGetChar input of
+            Just ('-', input) -> case alexGetChar input of
+              Just (c, _) | c /= '-' -> Just input
+              _ -> Nothing
+            _ -> Nothing
+          _ -> Nothing
+
+        dropNonNewlineSpace input = case alexGetChar input of
+          Just (c, input') 
+            | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
+            | otherwise -> input
+          Nothing -> input
+
 {-
   nested comments require traversing by hand, they can't be parsed
   using regular expressions.
 -}
-nested_comment :: Action
-nested_comment span _str _len = do
+nested_comment :: P (Located Token) -> Action
+nested_comment cont span _str _len = do
   input <- getInput
   go 1 input
-  where go 0 input = do setInput input; lexToken
-	go n input = do
-	  case alexGetChar input of
-	    Nothing  -> err input
-	    Just (c,input) -> do
-	      case c of
-	    	'-' -> do
-		  case alexGetChar input of
-		    Nothing  -> err input
-		    Just ('\125',input) -> go (n-1) input
-		    Just (c,_)          -> go n input
-	     	'\123' -> do
-		  case alexGetChar input of
-		    Nothing  -> err input
-		    Just ('-',input') -> go (n+1) input'
-		    Just (c,input)    -> go n input
-	    	c -> go n input
-
-        err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
-
+  where
+    go 0 input = do setInput input; cont
+    go n input = case alexGetChar input of
+      Nothing -> errBrace input span
+      Just ('-',input) -> case alexGetChar input of
+        Nothing  -> errBrace input span
+        Just ('\125',input) -> go (n-1) input
+        Just (c,_)          -> go n input
+      Just ('\123',input) -> case alexGetChar input of
+        Nothing  -> errBrace input span
+        Just ('-',input) -> go (n+1) input
+        Just (c,_)       -> go n input
+      Just (c,input) -> go n input
+
+nested_doc_comment :: Action
+nested_doc_comment span buf _len = withLexedDocType (go "")
+  where
+    go commentAcc input docType _ = case alexGetChar input of
+      Nothing -> errBrace input span
+      Just ('-',input) -> case alexGetChar input of
+        Nothing -> errBrace input span
+        Just ('\125',input@(AI end _ buf2)) ->
+          docCommentEnd input commentAcc docType buf span
+        Just (c,_) -> go ('-':commentAcc) input docType False
+      Just ('\123', input) -> case alexGetChar input of
+        Nothing  -> errBrace input span
+        Just ('-',input) -> do
+          setInput input
+          let cont = do input <- getInput; go commentAcc input docType False
+          nested_comment cont span buf _len
+        Just (c,_) -> go ('\123':commentAcc) input docType False
+      Just (c,input) -> go (c:commentAcc) input docType False
+
+withLexedDocType lexDocComment = do
+  input <- getInput
+  case alexGetChar input of
+    Nothing -> error "Can't happen"
+    Just ('|', input) -> lexDocComment input ITdocCommentNext False
+    Just ('^', input) -> lexDocComment input ITdocCommentPrev False
+    Just ('$', input) -> lexDocComment input ITdocCommentNamed False
+    Just ('*', input) -> lexDocSection 1 input 
+ where 
+    lexDocSection n input = case alexGetChar input of 
+      Just ('*', input) -> lexDocSection (n+1) input
+      Just (c, _) -> lexDocComment input (ITdocSection n) True
+      Nothing -> do setInput input; lexToken -- eof reached, lex it normally
+
+-- docCommentEnd
+-------------------------------------------------------------------------------
+-- This function is quite tricky. We can't just return a new token, we also
+-- need to update the state of the parser. Why? Because the token is longer
+-- than what was lexed by Alex, and the lexToken function doesn't know this, so 
+-- it writes the wrong token length to the parser state. This function is
+-- called afterwards, so it can just update the state. 
+
+-- This is complicated by the fact that Haddock tokens can span multiple lines, 
+-- which is something that the original lexer didn't account for. 
+-- I have added last_line_len in the parser state which represents the length 
+-- of the part of the token that is on the last line. It is now used for layout 
+-- calculation in pushCurrentContext instead of last_len. last_len is, like it 
+-- was before, the full length of the token, and it is now only used for error
+-- messages. /Waern 
+
+docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
+                 SrcSpan -> P (Located Token) 
+docCommentEnd input commentAcc docType buf span = do
+  setInput input
+  let (AI loc last_offs nextBuf) = input
+      comment = reverse commentAcc
+      span' = mkSrcSpan (srcSpanStart span) loc
+      last_len = byteDiff buf nextBuf
+      
+      last_line_len = if (last_offs - last_len < 0) 
+        then last_offs
+        else last_len  
+  
+  span `seq` setLastToken span' last_len last_line_len
+  return (L span' (docType comment))
+ 
+errBrace (AI end _ _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
+ 
 open_brace, close_brace :: Action
 open_brace span _str _len = do 
   ctx <- getContext
@@ -1142,6 +1306,7 @@ getCharOrFail =  do
 data LayoutContext
   = NoLayout
   | Layout !Int
+  deriving Show
 
 data ParseResult a
   = POk PState a
@@ -1158,6 +1323,7 @@ data PState = PState {
 				-- beginning of  the current line.
 				-- \t is equal to 8 spaces.
 	last_len   :: !Int,	-- len of previous token
+  last_line_len :: !Int,
         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
 	extsBitmap :: !Int,	-- bitmap that determines permitted extensions
 	context	   :: [LayoutContext],
@@ -1209,8 +1375,12 @@ setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
 getSrcLoc :: P SrcLoc
 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
 
-setLastToken :: SrcSpan -> Int -> P ()
-setLastToken loc len = P $ \s -> POk s{ last_loc=loc, last_len=len } ()
+setLastToken :: SrcSpan -> Int -> Int -> P ()
+setLastToken loc len line_len = P $ \s -> POk s { 
+  last_loc=loc, 
+  last_len=len,
+  last_line_len=line_len 
+} ()
 
 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
 
@@ -1311,6 +1481,8 @@ ipBit      = 6
 tvBit	   = 7	-- Scoped type variables enables 'forall' keyword
 bangPatBit = 8	-- Tells the parser to understand bang-patterns
 		-- (doesn't affect the lexer)
+idxTysBit  = 9	-- indexed type families: 'family' keyword and kind sigs
+haddockBit = 10 -- Lex and parse Haddock comments
 
 glaExtsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
 glaExtsEnabled flags = testBit flags glaExtsBit
@@ -1321,20 +1493,23 @@ thEnabled      flags = testBit flags thBit
 ipEnabled      flags = testBit flags ipBit
 tvEnabled      flags = testBit flags tvBit
 bangPatEnabled flags = testBit flags bangPatBit
+idxTysEnabled  flags = testBit flags idxTysBit
+haddockEnabled flags = testBit flags haddockBit
 
 -- PState for parsing options pragmas
 --
 pragState :: StringBuffer -> SrcLoc -> PState
 pragState buf loc  = 
   PState {
-      buffer	 = buf,
-      last_loc   = mkSrcSpan loc loc,
-      last_offs  = 0,
-      last_len   = 0,
-      loc        = loc,
-      extsBitmap = 0,
-      context    = [],
-      lex_state  = [bol, option_prags, 0]
+      buffer	      = buf,
+      last_loc      = mkSrcSpan loc loc,
+      last_offs     = 0,
+      last_len      = 0,
+      last_line_len = 0,
+      loc           = loc,
+      extsBitmap    = 0,
+      context       = [],
+      lex_state     = [bol, option_prags, 0]
     }
 
 
@@ -1343,14 +1518,15 @@ pragState buf loc  =
 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
 mkPState buf loc flags  = 
   PState {
-      buffer	 = buf,
-      last_loc   = mkSrcSpan loc loc,
-      last_offs  = 0,
-      last_len   = 0,
-      loc        = loc,
-      extsBitmap = fromIntegral bitmap,
-      context    = [],
-      lex_state  = [bol, if glaExtsEnabled bitmap then glaexts else 0]
+      buffer	      = buf,
+      last_loc      = mkSrcSpan loc loc,
+      last_offs     = 0,
+      last_len      = 0,
+      last_line_len = 0,
+      loc           = loc,
+      extsBitmap    = fromIntegral bitmap,
+      context       = [],
+      lex_state     = [bol, if glaExtsEnabled bitmap then glaexts else 0]
 	-- we begin in the layout state if toplev_layout is set
     }
     where
@@ -1362,6 +1538,8 @@ mkPState buf loc flags  =
 	       .|. ipBit      `setBitIf` dopt Opt_ImplicitParams flags
 	       .|. tvBit      `setBitIf` dopt Opt_ScopedTypeVariables flags
 	       .|. bangPatBit `setBitIf` dopt Opt_BangPatterns flags
+	       .|. idxTysBit  `setBitIf` dopt Opt_IndexedTypes flags
+	       .|. haddockBit `setBitIf` dopt Opt_Haddock     flags
       --
       setBitIf :: Int -> Bool -> Int
       b `setBitIf` cond | cond      = bit b
@@ -1384,8 +1562,9 @@ popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
 -- This is only used at the outer level of a module when the 'module'
 -- keyword is missing.
 pushCurrentContext :: P ()
-pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_len=len, context=ctx } ->
-  POk s{context = Layout (offs-len) : ctx} ()
+pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_line_len=len, context=ctx } -> 
+    POk s{context = Layout (offs-len) : ctx} ()
+--trace ("off: " ++ show offs ++ ", len: " ++ show len) $ POk s{context = Layout (offs-len) : ctx} ()
 
 getOffside :: P Ordering
 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
@@ -1431,8 +1610,8 @@ lexError str = do
 
 lexer :: (Located Token -> P a) -> P a
 lexer cont = do
-  tok@(L _ tok__) <- lexToken
-  --trace ("token: " ++ show tok__) $ do
+  tok@(L span tok__) <- lexToken
+--  trace ("token: " ++ show tok__) $ do
   cont tok
 
 lexToken :: P (Located Token)
@@ -1442,7 +1621,7 @@ lexToken = do
   exts <- getExts
   case alexScanUser exts inp sc of
     AlexEOF -> do let span = mkSrcSpan loc1 loc1
-		  setLastToken span 0
+		  setLastToken span 0 0
 		  return (L span ITeof)
     AlexError (AI loc2 _ buf) -> do 
 	reportLexError loc1 loc2 buf "lexical error"
@@ -1450,11 +1629,11 @@ lexToken = do
 	setInput inp2
 	lexToken
     AlexToken inp2@(AI end _ buf2) len t -> do
-	setInput inp2
-	let span = mkSrcSpan loc1 end
-	let bytes = byteDiff buf buf2
-	span `seq` setLastToken span bytes
-	t span buf bytes
+    setInput inp2
+    let span = mkSrcSpan loc1 end
+    let bytes = byteDiff buf buf2
+    span `seq` setLastToken span bytes bytes
+    t span buf bytes
 
 reportLexError loc1 loc2 buf str
   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")