X-Git-Url: http://git.megacz.com/?a=blobdiff_plain;f=compiler%2Fparser%2FLexer.x;h=f06624e77ee654465e3ba4d5b3ea48a1df67ab14;hb=8b4b45b96466be65f4e23c46c20c2199b6ae6c29;hp=4042a9c518edaba2530445a741b9afccfa85549d;hpb=d19a72ea089deab3aa4bb584e69c102daebb1cb4;p=ghc-hetmet.git

diff --git a/compiler/parser/Lexer.x b/compiler/parser/Lexer.x
index 4042a9c..f06624e 100644
--- a/compiler/parser/Lexer.x
+++ b/compiler/parser/Lexer.x
@@ -31,6 +31,8 @@
 -- Note that Alex itself generates code with with some unused bindings and
 -- without type signatures, so removing the flag might not be possible.
 
+{-# OPTIONS_GHC -funbox-strict-fields #-}
+
 module Lexer (
    Token(..), lexer, pragState, mkPState, PState(..),
    P(..), ParseResult(..), getSrcLoc, 
@@ -42,8 +44,6 @@ module Lexer (
    addWarning
   ) where
 
-#include "HsVersions.h"
-
 import Bag
 import ErrUtils
 import Outputable
@@ -62,15 +62,11 @@ import Data.Char 	( chr, ord, isSpace )
 import Data.Ratio
 import Debug.Trace
 
-#if __GLASGOW_HASKELL__ >= 605
-import Data.Char 	( GeneralCategory(..), generalCategory, isPrint, isUpper )
-#else
-import Compat.Unicode	( GeneralCategory(..), generalCategory, isPrint, isUpper )
-#endif
+import Unicode ( GeneralCategory(..), generalCategory, isPrint, isUpper )
 }
 
 $unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
-$whitechar   = [\ \n\r\f\v\xa0 $unispace]
+$whitechar   = [\ \n\r\f\v $unispace]
 $white_no_nl = $whitechar # \n
 $tab         = \t
 
@@ -80,16 +76,16 @@ $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
 $digit     = [$ascdigit $unidigit]
 
 $special   = [\(\)\,\;\[\]\`\{\}]
-$ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~ \xa1-\xbf \xd7 \xf7]
+$ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
 $unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
 
 $unilarge  = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
-$asclarge  = [A-Z \xc0-\xd6 \xd8-\xde]
+$asclarge  = [A-Z]
 $large     = [$asclarge $unilarge]
 
 $unismall  = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
-$ascsmall  = [a-z \xdf-\xf6 \xf8-\xff]
+$ascsmall  = [a-z]
 $small     = [$ascsmall $unismall \_]
 
 $unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
@@ -231,16 +227,8 @@ $tab+         { warn Opt_WarnTabs (text "Tab character") }
    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
    -- with older versions of GHC which generated these.
 
--- We only want RULES pragmas to be picked up when explicit forall
--- syntax is enabled is on, because the contents of the pragma always
--- uses it. If it's not on then we're sure to get a parse error.
--- (ToDo: we should really emit a warning when ignoring pragmas)
--- XXX Now that we can enable this without the -fglasgow-exts hammer,
--- is it better just to let the parse error happen?
-<0>
-  "{-#" $whitechar* (RULES|rules) / { ifExtension explicitForallEnabled } { token ITrules_prag }
-
 <0,option_prags> {
+  "{-#" $whitechar* (RULES|rules)       { token ITrules_prag }
   "{-#" $whitechar* (INLINE|inline)	{ token (ITinline_prag True) }
   "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
   					{ token (ITinline_prag False) }
@@ -252,6 +240,8 @@ $tab+         { warn Opt_WarnTabs (text "Tab character") }
 	$whitechar* (NO(T?)INLINE|no(t?)inline)
 					{ token (ITspec_inline_prag False) }
   "{-#" $whitechar* (SOURCE|source)	{ token ITsource_prag }
+  "{-#" $whitechar* (WARNING|warning)
+  					{ token ITwarning_prag }
   "{-#" $whitechar* (DEPRECATED|deprecated)
   					{ token ITdeprecated_prag }
   "{-#" $whitechar* (SCC|scc)		{ token ITscc_prag }
@@ -260,10 +250,13 @@ $tab+         { warn Opt_WarnTabs (text "Tab character") }
   "{-#" $whitechar* (CORE|core)		{ token ITcore_prag }
   "{-#" $whitechar* (UNPACK|unpack)	{ token ITunpack_prag }
 
- "{-#"                                 { nested_comment lexToken }
+  -- We ignore all these pragmas, but don't generate a warning for them
+  -- CFILES is a hugs-only thing.
+  "{-#" $whitechar* (OPTIONS_HUGS|options_hugs|OPTIONS_NHC98|options_nhc98|OPTIONS_JHC|options_jhc|CFILES|cfiles)
+                    { nested_comment lexToken }
 
   -- ToDo: should only be valid inside a pragma:
-  "#-}" 				{ token ITclose_prag}
+  "#-}" 				{ endPrag }
 }
 
 <option_prags> {
@@ -278,12 +271,18 @@ $tab+         { warn Opt_WarnTabs (text "Tab character") }
 }
 
 <0> {
+  -- In the "0" mode we ignore these pragmas
+  "{-#"  $whitechar* (OPTIONS|options|OPTIONS_GHC|options_ghc|OPTIONS_HADDOCK|options_haddock|LANGUAGE|language|INCLUDE|include)
+                     { nested_comment lexToken }
+}
+
+<0> {
   "-- #" .* ;
 }
 
 <0,option_prags> {
-	-- This is to catch things like {-# OPTIONS OPTIONS_HUGS ... 
-  "{-#" $whitechar* $idchar+		{ nested_comment lexToken }
+  "{-#"  { warnThen Opt_WarnUnrecognisedPragmas (text "Unrecognised pragma")
+                    (nested_comment lexToken) }
 }
 
 -- '0' state: ordinary lexemes
@@ -378,25 +377,29 @@ $tab+         { warn Opt_WarnTabs (text "Tab character") }
 -- when trying to be close to Haskell98
 <0> {
   -- Normal integral literals (:: Num a => a, from Integer)
-  @decimal			{ tok_num positive 0 0 decimal }
-  0[oO] @octal			{ tok_num positive 2 2 octal }
-  0[xX] @hexadecimal		{ tok_num positive 2 2 hexadecimal }
+  @decimal           { tok_num positive 0 0 decimal }
+  0[oO] @octal       { tok_num positive 2 2 octal }
+  0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal }
 
   -- Normal rational literals (:: Fractional a => a, from Rational)
-  @floating_point		{ strtoken tok_float }
+  @floating_point    { strtoken tok_float }
 }
 
 <0> {
-  -- Unboxed ints (:: Int#)
+  -- Unboxed ints (:: Int#) and words (:: Word#)
   -- It's simpler (and faster?) to give separate cases to the negatives,
   -- especially considering octal/hexadecimal prefixes.
-  @decimal \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
-  0[oO] @octal \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
-  0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
-  @negative @decimal \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
-  @negative 0[oO] @octal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
+  @decimal                     \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
+  0[oO] @octal                 \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
+  0[xX] @hexadecimal           \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
+  @negative @decimal           \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
+  @negative 0[oO] @octal       \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
   @negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal }
 
+  @decimal                     \# \# / { ifExtension magicHashEnabled } { tok_primword 0 2 decimal }
+  0[oO] @octal                 \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 octal }
+  0[xX] @hexadecimal           \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 hexadecimal }
+
   -- Unboxed floats and doubles (:: Float#, :: Double#)
   -- prim_{float,double} work with signed literals
   @signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat }
@@ -466,6 +469,7 @@ data Token
   | ITspec_inline_prag Bool	-- SPECIALISE INLINE (or NOINLINE)
   | ITsource_prag
   | ITrules_prag
+  | ITwarning_prag
   | ITdeprecated_prag
   | ITline_prag
   | ITscc_prag
@@ -502,8 +506,8 @@ data Token
   | ITvocurly
   | ITvccurly
   | ITobrack
-  | ITopabrack			-- [:, for parallel arrays with -fparr
-  | ITcpabrack			-- :], for parallel arrays with -fparr
+  | ITopabrack			-- [:, for parallel arrays with -XParr
+  | ITcpabrack			-- :], for parallel arrays with -XParr
   | ITcbrack
   | IToparen
   | ITcparen
@@ -535,6 +539,7 @@ data Token
   | ITprimchar   Char
   | ITprimstring FastString
   | ITprimint    Integer
+  | ITprimword   Integer
   | ITprimfloat  Rational
   | ITprimdouble Rational
 
@@ -636,7 +641,7 @@ reservedWordsFM = listToUFM $
 	( "where",	ITwhere, 	0 ),
 	( "_scc_",	ITscc, 		0 ),		-- ToDo: remove
 
-    ( "forall",	ITforall,	 bit explicitForallBit),
+    ( "forall",	ITforall,	 bit explicitForallBit .|. bit inRulePragBit),
 	( "mdo",	ITmdo,		 bit recursiveDoBit),
 	( "family",	ITfamily,	 bit tyFamBit),
     ( "group",  ITgroup,     bit transformComprehensionsBit),
@@ -679,7 +684,7 @@ reservedSymsFM = listToUFM $
         -- For data T (a::*) = MkT
        ,("*", ITstar, \i -> kindSigsEnabled i || tyFamEnabled i)
         -- For 'forall a . t'
-       ,(".", ITdot, explicitForallEnabled)
+       ,(".", ITdot, \i -> explicitForallEnabled i || inRulePrag i)
 
        ,("-<",  ITlarrowtail, arrowsEnabled)
        ,(">-",  ITrarrowtail, arrowsEnabled)
@@ -852,6 +857,18 @@ withLexedDocType lexDocComment = do
       Just (_,   _)     -> lexDocComment input (ITdocSection n) True
       Nothing -> do setInput input; lexToken -- eof reached, lex it normally
 
+-- RULES pragmas turn on the forall and '.' keywords, and we turn them
+-- off again at the end of the pragma.
+rulePrag :: Action
+rulePrag span buf len = do
+  setExts (.|. inRulePragBit)
+  return (L span ITrules_prag)
+
+endPrag :: Action
+endPrag span buf len = do
+  setExts (.&. complement (bit inRulePragBit))
+  return (L span ITclose_prag)
+
 -- docCommentEnd
 -------------------------------------------------------------------------------
 -- This function is quite tricky. We can't just return a new token, we also
@@ -973,6 +990,7 @@ tok_integral itint transint transbuf translen (radix,char_to_int) span buf len =
 -- some conveniences for use with tok_integral
 tok_num = tok_integral ITinteger
 tok_primint = tok_integral ITprimint
+tok_primword = tok_integral ITprimword positive
 positive = id
 negative = negate
 decimal = (10,octDecDigit)
@@ -1214,7 +1232,7 @@ lex_char c inp = do
       c | isAny c -> do setInput inp; return c
       _other -> lit_error
 
-isAny c | c > '\xff' = isPrint c
+isAny c | c > '\x7f' = isPrint c
 	| otherwise  = is_any c
 
 lex_escape :: P Char
@@ -1238,7 +1256,7 @@ lex_escape = do
 
 	'x'   -> readNum is_hexdigit 16 hexDigit
 	'o'   -> readNum is_octdigit  8 octDecDigit
-	x | is_digit x -> readNum2 is_digit 10 octDecDigit (octDecDigit x)
+	x | is_decdigit x -> readNum2 is_decdigit 10 octDecDigit (octDecDigit x)
 
 	c1 ->  do
 	   i <- getInput
@@ -1373,6 +1391,11 @@ warn option warning srcspan _buf _len = do
     addWarning option srcspan warning
     lexToken
 
+warnThen :: DynFlag -> SDoc -> Action -> Action
+warnThen option warning action srcspan buf len = do
+    addWarning option srcspan warning
+    action srcspan buf len
+
 -- -----------------------------------------------------------------------------
 -- The Parse Monad
 
@@ -1444,6 +1467,9 @@ extension p = P $ \s -> POk s (p $! extsBitmap s)
 getExts :: P Int
 getExts = P $ \s -> POk s (extsBitmap s)
 
+setExts :: (Int -> Int) -> P ()
+setExts f = P $ \s -> POk s{ extsBitmap = f (extsBitmap s) } ()
+
 setSrcLoc :: SrcLoc -> P ()
 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
 
@@ -1482,7 +1508,7 @@ alexGetChar (AI loc ofs s)
 
 	adj_c 
 	  | c <= '\x06' = non_graphic
-	  | c <= '\xff' = c
+	  | c <= '\x7f' = c
           -- Alex doesn't handle Unicode, so when Unicode
           -- character is encoutered we output these values
           -- with the actual character value hidden in the state.
@@ -1492,7 +1518,7 @@ alexGetChar (AI loc ofs s)
 		  LowercaseLetter       -> lower
 		  TitlecaseLetter       -> upper
 		  ModifierLetter        -> other_graphic
-		  OtherLetter           -> other_graphic
+		  OtherLetter           -> lower -- see #1103
 		  NonSpacingMark        -> other_graphic
 		  SpacingCombiningMark  -> other_graphic
 		  EnclosingMark         -> other_graphic
@@ -1546,7 +1572,7 @@ getLexState :: P Int
 getLexState = P $ \s@PState{ lex_state=ls:_ } -> POk s ls
 
 -- for reasons of efficiency, flags indicating language extensions (eg,
--- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
+-- -fglasgow-exts or -XParr) are represented by a bitmap stored in an unboxed
 -- integer
 
 genericsBit, ffiBit, parrBit :: Int
@@ -1561,7 +1587,7 @@ bangPatBit = 8	-- Tells the parser to understand bang-patterns
 		-- (doesn't affect the lexer)
 tyFamBit   = 9	-- indexed type families: 'family' keyword and kind sigs
 haddockBit = 10 -- Lex and parse Haddock comments
-magicHashBit = 11 -- # in both functions and operators
+magicHashBit = 11 -- "#" in both functions and operators
 kindSigsBit = 12 -- Kind signatures on type variables
 recursiveDoBit = 13 -- mdo
 unicodeSyntaxBit = 14 -- the forall symbol, arrow symbols, etc
@@ -1569,6 +1595,7 @@ unboxedTuplesBit = 15 -- (# and #)
 standaloneDerivingBit = 16 -- standalone instance deriving declarations
 transformComprehensionsBit = 17
 qqBit	   = 18 -- enable quasiquoting
+inRulePragBit = 19
 
 genericsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
 always           _     = True
@@ -1590,17 +1617,16 @@ unboxedTuplesEnabled flags = testBit flags unboxedTuplesBit
 standaloneDerivingEnabled flags = testBit flags standaloneDerivingBit
 transformComprehensionsEnabled flags = testBit flags transformComprehensionsBit
 qqEnabled        flags = testBit flags qqBit
+inRulePrag       flags = testBit flags inRulePragBit
 
 -- PState for parsing options pragmas
 --
-pragState :: StringBuffer -> SrcLoc -> PState
-pragState buf loc  = 
+pragState :: DynFlags -> StringBuffer -> SrcLoc -> PState
+pragState dynflags buf loc =
   PState {
-      buffer	      = buf,
+      buffer        = buf,
       messages      = emptyMessages,
-      -- XXX defaultDynFlags is not right, but we don't have a real
-      -- dflags handy
-      dflags        = defaultDynFlags,
+      dflags        = dynflags,
       last_loc      = mkSrcSpan loc loc,
       last_offs     = 0,
       last_len      = 0,
@@ -1705,8 +1731,8 @@ srcParseErr
   -> Message
 srcParseErr buf len
   = hcat [ if null token 
-	     then ptext SLIT("parse error (possibly incorrect indentation)")
-	     else hcat [ptext SLIT("parse error on input "),
+	     then ptext (sLit "parse error (possibly incorrect indentation)")
+	     else hcat [ptext (sLit "parse error on input "),
           	  	char '`', text token, char '\'']
     ]
   where token = lexemeToString (offsetBytes (-len) buf) len