RichTokenStream support

author Chaddai Fouche <chaddai.fouche@gmail.com>

Thu, 18 Sep 2008 16:52:56 +0000 (16:52 +0000)

committer Chaddai Fouche <chaddai.fouche@gmail.com>

Thu, 18 Sep 2008 16:52:56 +0000 (16:52 +0000)
author Chaddai Fouche <chaddai.fouche@gmail.com>
Thu, 18 Sep 2008 16:52:56 +0000 (16:52 +0000)
committer Chaddai Fouche <chaddai.fouche@gmail.com>
Thu, 18 Sep 2008 16:52:56 +0000 (16:52 +0000)
diff --git a/compiler/main/DynFlags.hs b/compiler/main/DynFlags.hs

index 114a523..ec9bca8 100644 (file)
--- a/compiler/main/DynFlags.hs
+++ b/compiler/main/DynFlags.hs
@@ -296,6 +296,7 @@ data DynFlag
     | Opt_KeepSFiles
     | Opt_KeepRawSFiles
     | Opt_KeepTmpFiles
+   | Opt_KeepRawTokenStream
  
     deriving (Eq, Show)
  
diff --git a/compiler/main/GHC.hs b/compiler/main/GHC.hs

index 472f587..766ed01 100644 (file)
--- a/compiler/main/GHC.hs
+++ b/compiler/main/GHC.hs
@@ -198,6 +198,11 @@ module GHC (
         -- * Exceptions
         GhcException(..), showGhcException,
  
+        -- * Token stream manipulations
+        Token,
+        getTokenStream, getRichTokenStream,
+        showRichTokenStream, addSourceToTokens,
+
         -- * Miscellaneous
         --sessionHscEnv,
         cyclicModuleErr,
@@ -269,13 +274,14 @@ import Bag                ( unitBag, listToBag, emptyBag, isEmptyBag )
  import ErrUtils
  import MonadUtils
  import Util
-import StringBuffer    ( StringBuffer, hGetStringBuffer )
+import StringBuffer    ( StringBuffer, hGetStringBuffer, nextChar )
  import Outputable
  import BasicTypes
  import Maybes          ( expectJust, mapCatMaybes )
  import HaddockParse
  import HaddockLex       ( tokenise )
  import FastString
+import Lexer
  
  import Control.Concurrent
  import System.Directory ( getModificationTime, doesFileExist,
@@ -1717,7 +1723,7 @@ topSortModuleGraph
  --
  --   True:     eliminate the hi-boot nodes, and instead pretend
  --             the a source-import of Foo is an import of Foo
---             The resulting graph has no hi-boot nodes, but can by cyclic
+--             The resulting graph has no hi-boot nodes, but can be cyclic
  
  topSortModuleGraph drop_hs_boot_nodes summaries mb_root_mod
    = map (fmap summaryNodeSummary) $ stronglyConnCompG initial_graph
@@ -2440,12 +2446,85 @@ pprParenSymName a = parenSymOcc (getOccName a) (ppr (getName a))
  -- :browse will use either lm_toplev or inspect lm_interface, depending
  -- on whether the module is interpreted or not.
  
--- This is for reconstructing refactored source code
--- Calls the lexer repeatedly.
--- ToDo: add comment tokens to token stream
-getTokenStream :: Session -> Module -> IO [Located Token]
  #endif
  
+-- Extract the filename, stringbuffer content and dynflags associed to a module
+--
+-- XXX: Explain pre-conditions
+getModuleSourceAndFlags :: GhcMonad m => Module -> m (String, StringBuffer, DynFlags)
+getModuleSourceAndFlags mod = do
+  m <- getModSummary (moduleName mod)
+  case ml_hs_file $ ms_location m of
+    Nothing -> throw $ mkApiErr (text "No source available for module " <+> ppr mod)
+    Just sourceFile -> do
+        source <- liftIO $ hGetStringBuffer sourceFile
+        return (sourceFile, source, ms_hspp_opts m)
+
+
+-- | Return module source as token stream, including comments.
+--
+-- The module must be in the module graph and its source must be available.
+-- Throws a 'HscTypes.SourceError' on parse error.
+getTokenStream :: GhcMonad m => Module -> m [Located Token]
+getTokenStream mod = do
+  (sourceFile, source, flags) <- getModuleSourceAndFlags mod
+  let startLoc = mkSrcLoc (mkFastString sourceFile) 0 0
+  case lexTokenStream source startLoc flags of
+    POk _ ts  -> return ts
+    PFailed span err -> throw $ mkSrcErr (unitBag $ mkPlainErrMsg span err)
+
+-- | Give even more information on the source than 'getTokenStream'
+-- This function allows reconstructing the source completely with
+-- 'showRichTokenStream'.
+getRichTokenStream :: GhcMonad m => Module -> m [(Located Token, String)]
+getRichTokenStream mod = do
+  (sourceFile, source, flags) <- getModuleSourceAndFlags mod
+  let startLoc = mkSrcLoc (mkFastString sourceFile) 0 0
+  case lexTokenStream source startLoc flags of
+    POk _ ts -> return $ addSourceToTokens startLoc source ts
+    PFailed span err -> throw $ mkSrcErr (unitBag $ mkPlainErrMsg span err)
+
+-- | Given a source location and a StringBuffer corresponding to this
+-- location, return a rich token stream with the source associated to the
+-- tokens.
+addSourceToTokens :: SrcLoc -> StringBuffer -> [Located Token]
+                  -> [(Located Token, String)]
+addSourceToTokens _ _ [] = []
+addSourceToTokens loc buf (t@(L span _) : ts)
+    | not (isGoodSrcSpan span) = (t,"") : addSourceToTokens loc buf ts
+    | otherwise = (t,str) : addSourceToTokens newLoc newBuf ts
+    where
+      (newLoc, newBuf, str) = go "" loc buf
+      start = srcSpanStart span
+      end = srcSpanEnd span
+      go acc loc buf | loc < start = go acc nLoc nBuf
+                     | start <= loc && loc < end = go (ch:acc) nLoc nBuf
+                     | otherwise = (loc, buf, reverse acc)
+          where (ch, nBuf) = nextChar buf
+                nLoc = advanceSrcLoc loc ch
+
+
+-- | Take a rich token stream such as produced from 'getRichTokenStream' and
+-- return source code almost identical to the original code (except for
+-- insignificant whitespace.)
+showRichTokenStream :: [(Located Token, String)] -> String
+showRichTokenStream ts = go startLoc ts ""
+    where sourceFile = srcSpanFile (getLoc . fst . head $ ts)
+          startLoc = mkSrcLoc sourceFile 0 0
+          go _ [] = id
+          go loc ((L span _, str):ts)
+              | not (isGoodSrcSpan span) = go loc ts
+              | locLine == tokLine = ((replicate (tokCol - locCol) ' ') ++)
+                                     . (str ++)
+                                     . go tokEnd ts
+              | otherwise = ((replicate (tokLine - locLine) '\n') ++)
+                            . ((replicate tokCol ' ') ++)
+                            . (str ++)
+                            . go tokEnd ts
+              where (locLine, locCol) = (srcLocLine loc, srcLocCol loc)
+                    (tokLine, tokCol) = (srcSpanStartLine span, srcSpanStartCol span)
+                    tokEnd = srcSpanEnd span
+
  -- -----------------------------------------------------------------------------
  -- Interactive evaluation
  
diff --git a/compiler/parser/Lexer.x b/compiler/parser/Lexer.x

index 66f4fe5..613848a 100644 (file)
--- a/compiler/parser/Lexer.x
+++ b/compiler/parser/Lexer.x
@@ -41,7 +41,8 @@ module Lexer (
     popContext, pushCurrentContext, setLastToken, setSrcLoc,
     getLexState, popLexState, pushLexState,
     extension, standaloneDerivingEnabled, bangPatEnabled,
-   addWarning
+   addWarning,
+   lexTokenStream
    ) where
  
  import Bag
@@ -148,12 +149,12 @@ $tab+         { warn Opt_WarnTabs (text "Tab character") }
  -- space followed by a Haddock comment symbol (docsym) (in which case we'd
  -- have a Haddock comment). The rules then munch the rest of the line.
  
-"-- " ~[$docsym \#] .* ;
-"--" [^$symbol : \ ] .* ;
+"-- " ~[$docsym \#] .* { lineCommentToken }
+"--" [^$symbol : \ ] .* { lineCommentToken }
  
  -- Next, match Haddock comments if no -haddock flag
  
-"-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } ;
+"-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } { lineCommentToken }
  
  -- Now, when we've matched comments that begin with 2 dashes and continue
  -- with a different character, we need to match comments that begin with three
@@ -161,17 +162,17 @@ $tab+         { warn Opt_WarnTabs (text "Tab character") }
  -- make sure that the first non-dash character isn't a symbol, and munch the
  -- rest of the line.
  
-"---"\-* [^$symbol :] .* ;
+"---"\-* [^$symbol :] .* { lineCommentToken }
  
  -- Since the previous rules all match dashes followed by at least one
  -- character, we also need to match a whole line filled with just dashes.
  
-"--"\-* / { atEOL } ;
+"--"\-* / { atEOL } { lineCommentToken }
  
  -- We need this rule since none of the other single line comment rules
  -- actually match this case.
  
-"-- " / { atEOL } ;
+"-- " / { atEOL } { lineCommentToken }
  
  -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
  -- blank lines) until we find a non-whitespace character, then do layout
@@ -277,7 +278,7 @@ $tab+         { warn Opt_WarnTabs (text "Tab character") }
  }
  
  <0> {
-  "-- #" .* ;
+  "-- #" .* { lineCommentToken }
  }
  
  <0,option_prags> {
@@ -575,6 +576,8 @@ data Token
    | ITdocSection      Int String -- a section heading
    | ITdocOptions      String     -- doc options (prune, ignore-exports, etc)
    | ITdocOptionsOld   String     -- doc options declared "-- # ..."-style
+  | ITlineComment     String     -- comment starting by "--"
+  | ITblockComment    String     -- comment in {- -}
  
  #ifdef DEBUG
    deriving Show -- debugging
@@ -802,6 +805,11 @@ multiline_doc_comment span buf _len = withLexedDocType (worker "")
              | otherwise -> input
            Nothing -> input
  
+lineCommentToken :: Action
+lineCommentToken span buf len = do
+  b <- extension rawTokenStreamEnabled
+  if b then strtoken ITlineComment span buf len else lexToken
+
  {-
    nested comments require traversing by hand, they can't be parsed
    using regular expressions.
@@ -809,20 +817,24 @@ multiline_doc_comment span buf _len = withLexedDocType (worker "")
  nested_comment :: P (Located Token) -> Action
  nested_comment cont span _str _len = do
    input <- getInput
-  go (1::Int) input
+  go "" (1::Int) input
    where
-    go 0 input = do setInput input; cont
-    go n input = case alexGetChar input of
+    go commentAcc 0 input = do setInput input
+                               b <- extension rawTokenStreamEnabled
+                               if b
+                                 then docCommentEnd input commentAcc ITblockComment _str span
+                                 else cont
+    go commentAcc n input = case alexGetChar input of
        Nothing -> errBrace input span
        Just ('-',input) -> case alexGetChar input of
          Nothing  -> errBrace input span
-        Just ('\125',input) -> go (n-1) input
-        Just (_,_)          -> go n input
+        Just ('\125',input) -> go commentAcc (n-1) input
+        Just (_,_)          -> go ('-':commentAcc) n input
        Just ('\123',input) -> case alexGetChar input of
          Nothing  -> errBrace input span
-        Just ('-',input) -> go (n+1) input
-        Just (_,_)       -> go n input
-      Just (_,input) -> go n input
+        Just ('-',input) -> go ('-':'\123':commentAcc) (n+1) input
+        Just (_,_)       -> go ('\123':commentAcc) n input
+      Just (c,input) -> go (c:commentAcc) n input
  
  nested_doc_comment :: Action
  nested_doc_comment span buf _len = withLexedDocType (go "")
@@ -1596,6 +1608,7 @@ standaloneDerivingBit = 16 -- standalone instance deriving declarations
  transformComprehensionsBit = 17
  qqBit     = 18 -- enable quasiquoting
  inRulePragBit = 19
+rawTokenStreamBit = 20 -- producing a token stream with all comments included
  
  genericsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
  always           _     = True
@@ -1618,6 +1631,7 @@ standaloneDerivingEnabled flags = testBit flags standaloneDerivingBit
  transformComprehensionsEnabled flags = testBit flags transformComprehensionsBit
  qqEnabled        flags = testBit flags qqBit
  inRulePrag       flags = testBit flags inRulePragBit
+rawTokenStreamEnabled flags = testBit flags rawTokenStreamBit
  
  -- PState for parsing options pragmas
  --
@@ -1679,7 +1693,8 @@ mkPState buf loc flags  =
                .|. unicodeSyntaxBit `setBitIf` dopt Opt_UnicodeSyntax flags
                .|. unboxedTuplesBit `setBitIf` dopt Opt_UnboxedTuples flags
                .|. standaloneDerivingBit `setBitIf` dopt Opt_StandaloneDeriving flags
-           .|. transformComprehensionsBit `setBitIf` dopt Opt_TransformListComp flags
+               .|. transformComprehensionsBit `setBitIf` dopt Opt_TransformListComp flags
+               .|. rawTokenStreamBit `setBitIf` dopt Opt_KeepRawTokenStream flags
        --
        setBitIf :: Int -> Bool -> Int
        b `setBitIf` cond | cond      = bit b
@@ -1795,4 +1810,13 @@ reportLexError loc1 loc2 buf str
    if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
      then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
      else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
+
+lexTokenStream :: StringBuffer -> SrcLoc -> DynFlags -> ParseResult [Located Token]
+lexTokenStream buf loc dflags = unP go initState
+    where initState = mkPState buf loc (dopt_set (dopt_unset dflags Opt_Haddock) Opt_KeepRawTokenStream)
+          go = do
+            ltok <- lexer return
+            case ltok of
+              L _ ITeof -> return []
+              _ -> liftM (ltok:) go
  }
author	Chaddai Fouche <chaddai.fouche@gmail.com>
	Thu, 18 Sep 2008 16:52:56 +0000 (16:52 +0000)
committer	Chaddai Fouche <chaddai.fouche@gmail.com>
	Thu, 18 Sep 2008 16:52:56 +0000 (16:52 +0000)
compiler/main/DynFlags.hs		patch \| blob \| history
compiler/main/GHC.hs		patch \| blob \| history
compiler/parser/Lexer.x		patch \| blob \| history