X-Git-Url: http://git.megacz.com/?a=blobdiff_plain;f=compiler%2Fparser%2FLexer.x;h=613848ade970d5c77e824f9c10b21b83779c8961;hb=36104d7a0d66df895c8275e3aa7cfe35a322ff04;hp=9ab4190b3d2a9dcc9cabee00c523d39b3f21e88b;hpb=fc621cd805289e81be950560fbfa9d30c298934c;p=ghc-hetmet.git diff --git a/compiler/parser/Lexer.x b/compiler/parser/Lexer.x index 9ab4190..613848a 100644 --- a/compiler/parser/Lexer.x +++ b/compiler/parser/Lexer.x @@ -31,6 +31,8 @@ -- Note that Alex itself generates code with with some unused bindings and -- without type signatures, so removing the flag might not be possible. +{-# OPTIONS_GHC -funbox-strict-fields #-} + module Lexer ( Token(..), lexer, pragState, mkPState, PState(..), P(..), ParseResult(..), getSrcLoc, @@ -39,7 +41,8 @@ module Lexer ( popContext, pushCurrentContext, setLastToken, setSrcLoc, getLexState, popLexState, pushLexState, extension, standaloneDerivingEnabled, bangPatEnabled, - addWarning + addWarning, + lexTokenStream ) where import Bag @@ -60,15 +63,11 @@ import Data.Char ( chr, ord, isSpace ) import Data.Ratio import Debug.Trace -#if __GLASGOW_HASKELL__ >= 605 -import Data.Char ( GeneralCategory(..), generalCategory, isPrint, isUpper ) -#else -import Compat.Unicode ( GeneralCategory(..), generalCategory, isPrint, isUpper ) -#endif +import Unicode ( GeneralCategory(..), generalCategory, isPrint, isUpper ) } $unispace = \x05 -- Trick Alex into handling Unicode. See alexGetChar. -$whitechar = [\ \n\r\f\v\xa0 $unispace] +$whitechar = [\ \n\r\f\v $unispace] $white_no_nl = $whitechar # \n $tab = \t @@ -78,16 +77,16 @@ $decdigit = $ascdigit -- for now, should really be $digit (ToDo) $digit = [$ascdigit $unidigit] $special = [\(\)\,\;\[\]\`\{\}] -$ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~ \xa1-\xbf \xd7 \xf7] +$ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~] $unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar. $symbol = [$ascsymbol $unisymbol] # [$special \_\:\"\'] $unilarge = \x01 -- Trick Alex into handling Unicode. See alexGetChar. -$asclarge = [A-Z \xc0-\xd6 \xd8-\xde] +$asclarge = [A-Z] $large = [$asclarge $unilarge] $unismall = \x02 -- Trick Alex into handling Unicode. See alexGetChar. -$ascsmall = [a-z \xdf-\xf6 \xf8-\xff] +$ascsmall = [a-z] $small = [$ascsmall $unismall \_] $unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar. @@ -150,12 +149,12 @@ $tab+ { warn Opt_WarnTabs (text "Tab character") } -- space followed by a Haddock comment symbol (docsym) (in which case we'd -- have a Haddock comment). The rules then munch the rest of the line. -"-- " ~[$docsym \#] .* ; -"--" [^$symbol : \ ] .* ; +"-- " ~[$docsym \#] .* { lineCommentToken } +"--" [^$symbol : \ ] .* { lineCommentToken } -- Next, match Haddock comments if no -haddock flag -"-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } ; +"-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } { lineCommentToken } -- Now, when we've matched comments that begin with 2 dashes and continue -- with a different character, we need to match comments that begin with three @@ -163,17 +162,17 @@ $tab+ { warn Opt_WarnTabs (text "Tab character") } -- make sure that the first non-dash character isn't a symbol, and munch the -- rest of the line. -"---"\-* [^$symbol :] .* ; +"---"\-* [^$symbol :] .* { lineCommentToken } -- Since the previous rules all match dashes followed by at least one -- character, we also need to match a whole line filled with just dashes. 
-"--"\-* / { atEOL } ; +"--"\-* / { atEOL } { lineCommentToken } -- We need this rule since none of the other single line comment rules -- actually match this case. -"-- " / { atEOL } ; +"-- " / { atEOL } { lineCommentToken } -- 'bol' state: beginning of a line. Slurp up all the whitespace (including -- blank lines) until we find a non-whitespace character, then do layout @@ -229,16 +228,8 @@ $tab+ { warn Opt_WarnTabs (text "Tab character") } -- NOTE: accept -} at the end of a LINE pragma, for compatibility -- with older versions of GHC which generated these. --- We only want RULES pragmas to be picked up when explicit forall --- syntax is enabled is on, because the contents of the pragma always --- uses it. If it's not on then we're sure to get a parse error. --- (ToDo: we should really emit a warning when ignoring pragmas) --- XXX Now that we can enable this without the -fglasgow-exts hammer, --- is it better just to let the parse error happen? -<0> - "{-#" $whitechar* (RULES|rules) / { ifExtension explicitForallEnabled } { token ITrules_prag } - <0,option_prags> { + "{-#" $whitechar* (RULES|rules) { token ITrules_prag } "{-#" $whitechar* (INLINE|inline) { token (ITinline_prag True) } "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline) { token (ITinline_prag False) } @@ -250,6 +241,8 @@ $tab+ { warn Opt_WarnTabs (text "Tab character") } $whitechar* (NO(T?)INLINE|no(t?)inline) { token (ITspec_inline_prag False) } "{-#" $whitechar* (SOURCE|source) { token ITsource_prag } + "{-#" $whitechar* (WARNING|warning) + { token ITwarning_prag } "{-#" $whitechar* (DEPRECATED|deprecated) { token ITdeprecated_prag } "{-#" $whitechar* (SCC|scc) { token ITscc_prag } @@ -258,10 +251,13 @@ $tab+ { warn Opt_WarnTabs (text "Tab character") } "{-#" $whitechar* (CORE|core) { token ITcore_prag } "{-#" $whitechar* (UNPACK|unpack) { token ITunpack_prag } - "{-#" { nested_comment lexToken } + -- We ignore all these pragmas, but don't generate a warning for them + -- CFILES is a hugs-only thing. + "{-#" $whitechar* (OPTIONS_HUGS|options_hugs|OPTIONS_NHC98|options_nhc98|OPTIONS_JHC|options_jhc|CFILES|cfiles) + { nested_comment lexToken } -- ToDo: should only be valid inside a pragma: - "#-}" { token ITclose_prag} + "#-}" { endPrag } } { @@ -276,12 +272,18 @@ $tab+ { warn Opt_WarnTabs (text "Tab character") } } <0> { - "-- #" .* ; + -- In the "0" mode we ignore these pragmas + "{-#" $whitechar* (OPTIONS|options|OPTIONS_GHC|options_ghc|OPTIONS_HADDOCK|options_haddock|LANGUAGE|language|INCLUDE|include) + { nested_comment lexToken } +} + +<0> { + "-- #" .* { lineCommentToken } } <0,option_prags> { - -- This is to catch things like {-# OPTIONS OPTIONS_HUGS ... 
- "{-#" $whitechar* $idchar+ { nested_comment lexToken } + "{-#" { warnThen Opt_WarnUnrecognisedPragmas (text "Unrecognised pragma") + (nested_comment lexToken) } } -- '0' state: ordinary lexemes @@ -376,25 +378,29 @@ $tab+ { warn Opt_WarnTabs (text "Tab character") } -- when trying to be close to Haskell98 <0> { -- Normal integral literals (:: Num a => a, from Integer) - @decimal { tok_num positive 0 0 decimal } - 0[oO] @octal { tok_num positive 2 2 octal } - 0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal } + @decimal { tok_num positive 0 0 decimal } + 0[oO] @octal { tok_num positive 2 2 octal } + 0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal } -- Normal rational literals (:: Fractional a => a, from Rational) - @floating_point { strtoken tok_float } + @floating_point { strtoken tok_float } } <0> { - -- Unboxed ints (:: Int#) + -- Unboxed ints (:: Int#) and words (:: Word#) -- It's simpler (and faster?) to give separate cases to the negatives, -- especially considering octal/hexadecimal prefixes. - @decimal \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal } - 0[oO] @octal \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal } - 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal } - @negative @decimal \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal } - @negative 0[oO] @octal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal } + @decimal \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal } + 0[oO] @octal \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal } + 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal } + @negative @decimal \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal } + @negative 0[oO] @octal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal } @negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal } + @decimal \# \# / { ifExtension magicHashEnabled } { tok_primword 0 2 decimal } + 0[oO] @octal \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 octal } + 0[xX] @hexadecimal \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 hexadecimal } + -- Unboxed floats and doubles (:: Float#, :: Double#) -- prim_{float,double} work with signed literals @signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat } @@ -464,6 +470,7 @@ data Token | ITspec_inline_prag Bool -- SPECIALISE INLINE (or NOINLINE) | ITsource_prag | ITrules_prag + | ITwarning_prag | ITdeprecated_prag | ITline_prag | ITscc_prag @@ -500,8 +507,8 @@ data Token | ITvocurly | ITvccurly | ITobrack - | ITopabrack -- [:, for parallel arrays with -fparr - | ITcpabrack -- :], for parallel arrays with -fparr + | ITopabrack -- [:, for parallel arrays with -XParr + | ITcpabrack -- :], for parallel arrays with -XParr | ITcbrack | IToparen | ITcparen @@ -533,6 +540,7 @@ data Token | ITprimchar Char | ITprimstring FastString | ITprimint Integer + | ITprimword Integer | ITprimfloat Rational | ITprimdouble Rational @@ -568,6 +576,8 @@ data Token | ITdocSection Int String -- a section heading | ITdocOptions String -- doc options (prune, ignore-exports, etc) | ITdocOptionsOld String -- doc options declared "-- # ..."-style + | ITlineComment String -- comment starting by "--" + | ITblockComment String -- comment in {- -} #ifdef DEBUG deriving Show -- debugging @@ 
-634,7 +644,7 @@ reservedWordsFM = listToUFM $ ( "where", ITwhere, 0 ), ( "_scc_", ITscc, 0 ), -- ToDo: remove - ( "forall", ITforall, bit explicitForallBit), + ( "forall", ITforall, bit explicitForallBit .|. bit inRulePragBit), ( "mdo", ITmdo, bit recursiveDoBit), ( "family", ITfamily, bit tyFamBit), ( "group", ITgroup, bit transformComprehensionsBit), @@ -677,7 +687,7 @@ reservedSymsFM = listToUFM $ -- For data T (a::*) = MkT ,("*", ITstar, \i -> kindSigsEnabled i || tyFamEnabled i) -- For 'forall a . t' - ,(".", ITdot, explicitForallEnabled) + ,(".", ITdot, \i -> explicitForallEnabled i || inRulePrag i) ,("-<", ITlarrowtail, arrowsEnabled) ,(">-", ITrarrowtail, arrowsEnabled) @@ -795,6 +805,11 @@ multiline_doc_comment span buf _len = withLexedDocType (worker "") | otherwise -> input Nothing -> input +lineCommentToken :: Action +lineCommentToken span buf len = do + b <- extension rawTokenStreamEnabled + if b then strtoken ITlineComment span buf len else lexToken + {- nested comments require traversing by hand, they can't be parsed using regular expressions. @@ -802,20 +817,24 @@ multiline_doc_comment span buf _len = withLexedDocType (worker "") nested_comment :: P (Located Token) -> Action nested_comment cont span _str _len = do input <- getInput - go (1::Int) input + go "" (1::Int) input where - go 0 input = do setInput input; cont - go n input = case alexGetChar input of + go commentAcc 0 input = do setInput input + b <- extension rawTokenStreamEnabled + if b + then docCommentEnd input commentAcc ITblockComment _str span + else cont + go commentAcc n input = case alexGetChar input of Nothing -> errBrace input span Just ('-',input) -> case alexGetChar input of Nothing -> errBrace input span - Just ('\125',input) -> go (n-1) input - Just (_,_) -> go n input + Just ('\125',input) -> go commentAcc (n-1) input + Just (_,_) -> go ('-':commentAcc) n input Just ('\123',input) -> case alexGetChar input of Nothing -> errBrace input span - Just ('-',input) -> go (n+1) input - Just (_,_) -> go n input - Just (_,input) -> go n input + Just ('-',input) -> go ('-':'\123':commentAcc) (n+1) input + Just (_,_) -> go ('\123':commentAcc) n input + Just (c,input) -> go (c:commentAcc) n input nested_doc_comment :: Action nested_doc_comment span buf _len = withLexedDocType (go "") @@ -850,6 +869,18 @@ withLexedDocType lexDocComment = do Just (_, _) -> lexDocComment input (ITdocSection n) True Nothing -> do setInput input; lexToken -- eof reached, lex it normally +-- RULES pragmas turn on the forall and '.' keywords, and we turn them +-- off again at the end of the pragma. +rulePrag :: Action +rulePrag span buf len = do + setExts (.|. inRulePragBit) + return (L span ITrules_prag) + +endPrag :: Action +endPrag span buf len = do + setExts (.&. complement (bit inRulePragBit)) + return (L span ITclose_prag) + -- docCommentEnd ------------------------------------------------------------------------------- -- This function is quite tricky. 
We can't just return a new token, we also @@ -971,6 +1002,7 @@ tok_integral itint transint transbuf translen (radix,char_to_int) span buf len = -- some conveniences for use with tok_integral tok_num = tok_integral ITinteger tok_primint = tok_integral ITprimint +tok_primword = tok_integral ITprimword positive positive = id negative = negate decimal = (10,octDecDigit) @@ -1212,7 +1244,7 @@ lex_char c inp = do c | isAny c -> do setInput inp; return c _other -> lit_error -isAny c | c > '\xff' = isPrint c +isAny c | c > '\x7f' = isPrint c | otherwise = is_any c lex_escape :: P Char @@ -1236,7 +1268,7 @@ lex_escape = do 'x' -> readNum is_hexdigit 16 hexDigit 'o' -> readNum is_octdigit 8 octDecDigit - x | is_digit x -> readNum2 is_digit 10 octDecDigit (octDecDigit x) + x | is_decdigit x -> readNum2 is_decdigit 10 octDecDigit (octDecDigit x) c1 -> do i <- getInput @@ -1371,6 +1403,11 @@ warn option warning srcspan _buf _len = do addWarning option srcspan warning lexToken +warnThen :: DynFlag -> SDoc -> Action -> Action +warnThen option warning action srcspan buf len = do + addWarning option srcspan warning + action srcspan buf len + -- ----------------------------------------------------------------------------- -- The Parse Monad @@ -1442,6 +1479,9 @@ extension p = P $ \s -> POk s (p $! extsBitmap s) getExts :: P Int getExts = P $ \s -> POk s (extsBitmap s) +setExts :: (Int -> Int) -> P () +setExts f = P $ \s -> POk s{ extsBitmap = f (extsBitmap s) } () + setSrcLoc :: SrcLoc -> P () setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} () @@ -1480,7 +1520,7 @@ alexGetChar (AI loc ofs s) adj_c | c <= '\x06' = non_graphic - | c <= '\xff' = c + | c <= '\x7f' = c -- Alex doesn't handle Unicode, so when Unicode -- character is encoutered we output these values -- with the actual character value hidden in the state. 
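-- [Editorial sketch, not part of the patch] The hunks around here depend on the
-- "trick Alex into handling Unicode" scheme described in the comments above:
-- Alex's generated DFA only knows the byte range it was built for, so any
-- character above '\x7f' is replaced by one of the sentinel pseudo-characters
-- \x01..\x06 ($unilarge .. $unigraphic) chosen from its general category, while
-- the real character is kept in the lexer state.  A minimal standalone
-- illustration of that classification follows; the helper name
-- 'classifyUnicode' is hypothetical, and the real 'adj_c' below covers many
-- more categories than this sketch does.

import Data.Char (GeneralCategory(..), generalCategory, isPrint)

classifyUnicode :: Char -> Char
classifyUnicode c = case generalCategory c of
  UppercaseLetter -> '\x01'   -- $unilarge
  TitlecaseLetter -> '\x01'
  LowercaseLetter -> '\x02'   -- $unismall
  OtherLetter     -> '\x02'   -- treated as small, see #1103
  DecimalNumber   -> '\x03'   -- $unidigit
  MathSymbol      -> '\x04'   -- $unisymbol
  Space           -> '\x05'   -- $unispace
  _ | isPrint c   -> '\x06'   -- $unigraphic
    | otherwise   -> c        -- left alone; the lexer reports the error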
@@ -1490,7 +1530,7 @@ alexGetChar (AI loc ofs s) LowercaseLetter -> lower TitlecaseLetter -> upper ModifierLetter -> other_graphic - OtherLetter -> other_graphic + OtherLetter -> lower -- see #1103 NonSpacingMark -> other_graphic SpacingCombiningMark -> other_graphic EnclosingMark -> other_graphic @@ -1544,7 +1584,7 @@ getLexState :: P Int getLexState = P $ \s@PState{ lex_state=ls:_ } -> POk s ls -- for reasons of efficiency, flags indicating language extensions (eg, --- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed +-- -fglasgow-exts or -XParr) are represented by a bitmap stored in an unboxed -- integer genericsBit, ffiBit, parrBit :: Int @@ -1559,7 +1599,7 @@ bangPatBit = 8 -- Tells the parser to understand bang-patterns -- (doesn't affect the lexer) tyFamBit = 9 -- indexed type families: 'family' keyword and kind sigs haddockBit = 10 -- Lex and parse Haddock comments -magicHashBit = 11 -- # in both functions and operators +magicHashBit = 11 -- "#" in both functions and operators kindSigsBit = 12 -- Kind signatures on type variables recursiveDoBit = 13 -- mdo unicodeSyntaxBit = 14 -- the forall symbol, arrow symbols, etc @@ -1567,6 +1607,8 @@ unboxedTuplesBit = 15 -- (# and #) standaloneDerivingBit = 16 -- standalone instance deriving declarations transformComprehensionsBit = 17 qqBit = 18 -- enable quasiquoting +inRulePragBit = 19 +rawTokenStreamBit = 20 -- producing a token stream with all comments included genericsEnabled, ffiEnabled, parrEnabled :: Int -> Bool always _ = True @@ -1588,17 +1630,17 @@ unboxedTuplesEnabled flags = testBit flags unboxedTuplesBit standaloneDerivingEnabled flags = testBit flags standaloneDerivingBit transformComprehensionsEnabled flags = testBit flags transformComprehensionsBit qqEnabled flags = testBit flags qqBit +inRulePrag flags = testBit flags inRulePragBit +rawTokenStreamEnabled flags = testBit flags rawTokenStreamBit -- PState for parsing options pragmas -- -pragState :: StringBuffer -> SrcLoc -> PState -pragState buf loc = +pragState :: DynFlags -> StringBuffer -> SrcLoc -> PState +pragState dynflags buf loc = PState { - buffer = buf, + buffer = buf, messages = emptyMessages, - -- XXX defaultDynFlags is not right, but we don't have a real - -- dflags handy - dflags = defaultDynFlags, + dflags = dynflags, last_loc = mkSrcSpan loc loc, last_offs = 0, last_len = 0, @@ -1637,6 +1679,7 @@ mkPState buf loc flags = .|. qqBit `setBitIf` dopt Opt_QuasiQuotes flags .|. ipBit `setBitIf` dopt Opt_ImplicitParams flags .|. explicitForallBit `setBitIf` dopt Opt_ScopedTypeVariables flags + .|. explicitForallBit `setBitIf` dopt Opt_LiberalTypeSynonyms flags .|. explicitForallBit `setBitIf` dopt Opt_PolymorphicComponents flags .|. explicitForallBit `setBitIf` dopt Opt_ExistentialQuantification flags .|. explicitForallBit `setBitIf` dopt Opt_Rank2Types flags @@ -1650,7 +1693,8 @@ mkPState buf loc flags = .|. unicodeSyntaxBit `setBitIf` dopt Opt_UnicodeSyntax flags .|. unboxedTuplesBit `setBitIf` dopt Opt_UnboxedTuples flags .|. standaloneDerivingBit `setBitIf` dopt Opt_StandaloneDeriving flags - .|. transformComprehensionsBit `setBitIf` dopt Opt_TransformListComp flags + .|. transformComprehensionsBit `setBitIf` dopt Opt_TransformListComp flags + .|. 
rawTokenStreamBit `setBitIf` dopt Opt_KeepRawTokenStream flags -- setBitIf :: Int -> Bool -> Int b `setBitIf` cond | cond = bit b @@ -1766,4 +1810,13 @@ reportLexError loc1 loc2 buf str if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar# then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)") else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c) + +lexTokenStream :: StringBuffer -> SrcLoc -> DynFlags -> ParseResult [Located Token] +lexTokenStream buf loc dflags = unP go initState + where initState = mkPState buf loc (dopt_set (dopt_unset dflags Opt_Haddock) Opt_KeepRawTokenStream) + go = do + ltok <- lexer return + case ltok of + L _ ITeof -> return [] + _ -> liftM (ltok:) go }
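-- [Editorial usage sketch, not part of the patch] One way a client could drive
-- the new 'lexTokenStream' entry point added above.  Because 'initState' forces
-- Opt_KeepRawTokenStream on (and turns Opt_Haddock off), the resulting stream
-- also carries the new ITlineComment and ITblockComment tokens, which the
-- ordinary lexer path discards.  The flat module names reflect the GHC source
-- tree of this era, and 'dumpTokenCount' plus the way the DynFlags are obtained
-- are assumptions for illustration only.

import Lexer        (lexTokenStream, ParseResult(..))
import StringBuffer (hGetStringBuffer)
import SrcLoc       (mkSrcLoc)
import FastString   (mkFastString)
import DynFlags     (DynFlags)

dumpTokenCount :: DynFlags -> FilePath -> IO ()
dumpTokenCount dflags file = do
  buf <- hGetStringBuffer file                -- read the source into a StringBuffer
  let loc = mkSrcLoc (mkFastString file) 1 0  -- lexing starts at line 1, column 0
  case lexTokenStream buf loc dflags of
    POk _ toks  -> putStrLn (file ++ ": " ++ show (length toks)
                             ++ " tokens (comments included)")
    PFailed _ _ -> putStrLn (file ++ ": lexical error")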