X-Git-Url: http://git.megacz.com/?a=blobdiff_plain;f=compiler%2Fparser%2FLexer.x;h=db48dbe7c6967fdaf42c54c45d0bc91cf881500b;hb=2eb04ca0f8d0ec72b417cddc60672c696b4a3daa;hp=d9a0fb08b0e28e578d7a115ce0995fb14f90ce7b;hpb=317fc69d18eda68fd65f5ba634feafbe4a3923da;p=ghc-hetmet.git

diff --git a/compiler/parser/Lexer.x b/compiler/parser/Lexer.x
index d9a0fb0..db48dbe 100644
--- a/compiler/parser/Lexer.x
+++ b/compiler/parser/Lexer.x
@@ -58,30 +58,30 @@ import Compat.Unicode	( GeneralCategory(..), generalCategory, isPrint, isUpper )
 #endif
 }
 
-$unispace    = \x05
+$unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
 $whitechar   = [\ \n\r\f\v\xa0 $unispace]
 $white_no_nl = $whitechar # \n
 $tab         = \t
 
 $ascdigit  = 0-9
-$unidigit  = \x03
+$unidigit  = \x03 -- Trick Alex into handling Unicode. See alexGetChar.
 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
 $digit     = [$ascdigit $unidigit]
 
 $special   = [\(\)\,\;\[\]\`\{\}]
 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~ \xa1-\xbf \xd7 \xf7]
-$unisymbol = \x04
+$unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
 
-$unilarge  = \x01
+$unilarge  = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
 $asclarge  = [A-Z \xc0-\xd6 \xd8-\xde]
 $large     = [$asclarge $unilarge]
 
-$unismall  = \x02
+$unismall  = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
 $ascsmall  = [a-z \xdf-\xf6 \xf8-\xff]
 $small     = [$ascsmall $unismall \_]
 
-$unigraphic = \x06
+$unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
 
 $octit	   = 0-7
@@ -108,6 +108,11 @@ $docsym    = [\| \^ \* \$]
 
 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 
+-- normal signed numerical literals can only be explicitly negative,
+-- not explicitly positive (contrast @exponent)
+@negative = \-
+@signed = @negative ?
+
 haskell :-
 
 -- everywhere: skip whitespace and comments
@@ -271,8 +276,8 @@ $tab+         { warn Opt_WarnTabs (text "Tab character") }
 -- Haddock comments
 
 <0,glaexts> {
-  "-- " / $docsym    { multiline_doc_comment }
-  "{-" \ ? / $docsym { nested_doc_comment }
+  "-- " $docsym    / { ifExtension haddockEnabled } { multiline_doc_comment }
+  "{-" \ ? $docsym / { ifExtension haddockEnabled } { nested_doc_comment }
 }
 
 -- "special" symbols
@@ -353,22 +358,35 @@ $tab+         { warn Opt_WarnTabs (text "Tab character") }
   @consym			{ consym }
 }
 
+-- For the normal boxed literals we need to be careful
+-- when trying to be close to Haskell98
 <0,glaexts> {
-  @decimal			{ tok_decimal }
-  0[oO] @octal			{ tok_octal }
-  0[xX] @hexadecimal		{ tok_hexadecimal }
+  -- Normal integral literals (:: Num a => a, from Integer)
+  @decimal			{ tok_num positive 0 0 decimal }
+  0[oO] @octal			{ tok_num positive 2 2 octal }
+  0[xX] @hexadecimal		{ tok_num positive 2 2 hexadecimal }
+
+  -- Normal rational literals (:: Fractional a => a, from Rational)
+  @floating_point		{ strtoken tok_float }
 }
 
 <glaexts> {
-  @decimal \#			{ prim_decimal }
-  0[oO] @octal \#		{ prim_octal }
-  0[xX] @hexadecimal \#		{ prim_hexadecimal }
+  -- Unboxed ints (:: Int#)
+  -- It's simpler (and faster?) to give separate cases to the negatives,
+  -- especially considering octal/hexadecimal prefixes.
+  @decimal \#			{ tok_primint positive 0 1 decimal }
+  0[oO] @octal \#		{ tok_primint positive 2 3 octal }
+  0[xX] @hexadecimal \#		{ tok_primint positive 2 3 hexadecimal }
+  @negative @decimal \#			{ tok_primint negative 1 2 decimal }
+  @negative 0[oO] @octal \#		{ tok_primint negative 3 4 octal }
+  @negative 0[xX] @hexadecimal \#	{ tok_primint negative 3 4 hexadecimal }
+
+  -- Unboxed floats and doubles (:: Float#, :: Double#)
+  -- prim_{float,double} work with signed literals
+  @signed @floating_point \#		{ init_strtoken 1 tok_primfloat }
+  @signed @floating_point \# \#		{ init_strtoken 2 tok_primdouble }
 }
 
-<0,glaexts> @floating_point		{ strtoken tok_float }
-<glaexts>   @floating_point \#		{ init_strtoken 1 prim_float }
-<glaexts>   @floating_point \# \#	{ init_strtoken 2 prim_double }
-
 -- Strings and chars are lexed by hand-written code.  The reason is
 -- that even if we recognise the string or char here in the regex
 -- lexer, we would still have to parse the string afterward in order
@@ -597,7 +615,7 @@ reservedWordsFM = listToUFM $
 
       	( "forall",	ITforall,	 bit tvBit),
 	( "mdo",	ITmdo,		 bit glaExtsBit),
-	( "family",	ITfamily,	 bit idxTysBit),
+	( "family",	ITfamily,	 bit tyFamBit),
 
 	( "foreign",	ITforeign,	 bit ffiBit),
 	( "export",	ITexport,	 bit ffiBit),
@@ -632,7 +650,7 @@ reservedSymsFM = listToUFM $
        ,("!",	ITbang, 	0)
 
        ,("*",	ITstar,		bit glaExtsBit .|. 
-				bit idxTysBit)	    -- For data T (a::*) = MkT
+				bit tyFamBit)	    -- For data T (a::*) = MkT
        ,(".",	ITdot,		bit tvBit)	    -- For 'forall a . t'
 
        ,("-<",	ITlarrowtail,	bit arrowsBit)
@@ -646,7 +664,7 @@ reservedSymsFM = listToUFM $
        ,("â",	ITforall,	bit glaExtsBit)
        ,("â",   ITrarrow,	bit glaExtsBit)
        ,("â",   ITlarrow,	bit glaExtsBit)
-       ,("?", 	ITdotdot,	bit glaExtsBit)
+       ,("â¯", 	ITdotdot,	bit glaExtsBit)
         -- ToDo: ideally, â and â· should be "specials", so that they cannot
         -- form part of a large operator.  This would let us have a better
         -- syntax for kinds: Éâ·*â* would be a legal kind signature. (maybe).
@@ -699,12 +717,17 @@ notFollowedBy char _ _ _ (AI _ _ buf)
 notFollowedBySymbol _ _ _ (AI _ _ buf)
   = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
 
+-- We must reject doc comments as being ordinary comments everywhere.
+-- In some cases the doc comment will be selected as the lexeme due to
+-- maximal munch, but not always, because the nested comment rule is
+-- valid in all states, but the doc-comment rules are only valid in
+-- the non-layout states.
 isNormalComment bits _ _ (AI _ _ buf)
   | haddockEnabled bits = notFollowedByDocOrPragma
   | otherwise           = nextCharIs buf (/='#')
-  where 
-    notFollowedByDocOrPragma 
-	= not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
+  where
+    notFollowedByDocOrPragma
+       = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
 
 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
 
@@ -785,13 +808,12 @@ nested_doc_comment span buf _len = withLexedDocType (go "")
       Just (c,input) -> go (c:commentAcc) input docType False
 
 withLexedDocType lexDocComment = do
-  input <- getInput
-  case alexGetChar input of
-    Nothing -> error "Can't happen"
-    Just ('|', input) -> lexDocComment input ITdocCommentNext False
-    Just ('^', input) -> lexDocComment input ITdocCommentPrev False
-    Just ('$', input) -> lexDocComment input ITdocCommentNamed False
-    Just ('*', input) -> lexDocSection 1 input 
+  input@(AI _ _ buf) <- getInput
+  case prevChar buf ' ' of
+    '|' -> lexDocComment input ITdocCommentNext False
+    '^' -> lexDocComment input ITdocCommentPrev False
+    '$' -> lexDocComment input ITdocCommentNamed False
+    '*' -> lexDocSection 1 input 
  where 
     lexDocSection n input = case alexGetChar input of 
       Just ('*', input) -> lexDocSection (n+1) input
@@ -930,27 +952,29 @@ sym con span buf len =
   where
 	fs = lexemeToFastString buf len
 
-tok_decimal span buf len 
-  = return (L span (ITinteger  $! parseInteger buf len 10 octDecDigit))
-
-tok_octal span buf len 
-  = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 8 octDecDigit))
-
-tok_hexadecimal span buf len 
-  = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 16 hexDigit))
-
-prim_decimal span buf len 
-  = return (L span (ITprimint  $! parseInteger buf (len-1) 10 octDecDigit))
-
-prim_octal span buf len 
-  = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 8 octDecDigit))
-
-prim_hexadecimal span buf len 
-  = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 16 hexDigit))
-
+-- Variations on the integral numeric literal.
+tok_integral :: (Integer -> Token)
+     -> (Integer -> Integer)
+ --    -> (StringBuffer -> StringBuffer) -> (Int -> Int)
+     -> Int -> Int
+     -> (Integer, (Char->Int)) -> Action
+tok_integral itint transint transbuf translen (radix,char_to_int) span buf len =
+  return $ L span $ itint $! transint $ parseUnsignedInteger
+     (offsetBytes transbuf buf) (subtract translen len) radix char_to_int
+
+-- some conveniences for use with tok_integral
+tok_num = tok_integral ITinteger
+tok_primint = tok_integral ITprimint
+positive = id
+negative = negate
+decimal = (10,octDecDigit)
+octal = (8,octDecDigit)
+hexadecimal = (16,hexDigit)
+
+-- readRational can understand negative rationals, exponents, everything.
 tok_float        str = ITrational   $! readRational str
-prim_float       str = ITprimfloat  $! readRational str
-prim_double      str = ITprimdouble $! readRational str
+tok_primfloat    str = ITprimfloat  $! readRational str
+tok_primdouble   str = ITprimdouble $! readRational str
 
 -- -----------------------------------------------------------------------------
 -- Layout processing
@@ -1018,7 +1042,7 @@ do_layout_left span _buf _len = do
 
 setLine :: Int -> Action
 setLine code span buf len = do
-  let line = parseInteger buf len 10 octDecDigit
+  let line = parseUnsignedInteger buf len 10 octDecDigit
   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
 	-- subtract one: the line number refers to the *following* line
   popLexState
@@ -1415,6 +1439,9 @@ alexGetChar (AI loc ofs s)
 	adj_c 
 	  | c <= '\x06' = non_graphic
 	  | c <= '\xff' = c
+          -- Alex doesn't handle Unicode, so when Unicode
+          -- character is encoutered we output these values
+          -- with the actual character value hidden in the state.
 	  | otherwise = 
 		case generalCategory c of
 		  UppercaseLetter       -> upper
@@ -1488,7 +1515,7 @@ ipBit      = 6
 tvBit	   = 7	-- Scoped type variables enables 'forall' keyword
 bangPatBit = 8	-- Tells the parser to understand bang-patterns
 		-- (doesn't affect the lexer)
-idxTysBit  = 9	-- indexed type families: 'family' keyword and kind sigs
+tyFamBit   = 9	-- indexed type families: 'family' keyword and kind sigs
 haddockBit = 10 -- Lex and parse Haddock comments
 
 glaExtsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
@@ -1500,7 +1527,7 @@ thEnabled      flags = testBit flags thBit
 ipEnabled      flags = testBit flags ipBit
 tvEnabled      flags = testBit flags tvBit
 bangPatEnabled flags = testBit flags bangPatBit
-idxTysEnabled  flags = testBit flags idxTysBit
+tyFamEnabled   flags = testBit flags tyFamBit
 haddockEnabled flags = testBit flags haddockBit
 
 -- PState for parsing options pragmas
@@ -1543,16 +1570,16 @@ mkPState buf loc flags  =
 	-- we begin in the layout state if toplev_layout is set
     }
     where
-      bitmap =     glaExtsBit `setBitIf` dopt Opt_GlasgowExts flags
-	       .|. ffiBit     `setBitIf` dopt Opt_FFI         flags
-	       .|. parrBit    `setBitIf` dopt Opt_PArr        flags
-	       .|. arrowsBit  `setBitIf` dopt Opt_Arrows      flags
-	       .|. thBit      `setBitIf` dopt Opt_TH          flags
+      bitmap =     glaExtsBit `setBitIf` dopt Opt_GlasgowExts  flags
+	       .|. ffiBit     `setBitIf` dopt Opt_FFI          flags
+	       .|. parrBit    `setBitIf` dopt Opt_PArr         flags
+	       .|. arrowsBit  `setBitIf` dopt Opt_Arrows       flags
+	       .|. thBit      `setBitIf` dopt Opt_TH           flags
 	       .|. ipBit      `setBitIf` dopt Opt_ImplicitParams flags
 	       .|. tvBit      `setBitIf` dopt Opt_ScopedTypeVariables flags
 	       .|. bangPatBit `setBitIf` dopt Opt_BangPatterns flags
-	       .|. idxTysBit  `setBitIf` dopt Opt_IndexedTypes flags
-	       .|. haddockBit `setBitIf` dopt Opt_Haddock     flags
+	       .|. tyFamBit   `setBitIf` dopt Opt_TypeFamilies flags
+	       .|. haddockBit `setBitIf` dopt Opt_Haddock      flags
       --
       setBitIf :: Int -> Bool -> Int
       b `setBitIf` cond | cond      = bit b