-- Note that Alex itself generates code with with some unused bindings and
-- without type signatures, so removing the flag might not be possible.
+{-# OPTIONS_GHC -funbox-strict-fields #-}
+
module Lexer (
Token(..), lexer, pragState, mkPState, PState(..),
P(..), ParseResult(..), getSrcLoc,
addWarning
) where
-#include "HsVersions.h"
-
import Bag
import ErrUtils
import Outputable
import Data.Ratio
import Debug.Trace
-#if __GLASGOW_HASKELL__ >= 605
-import Data.Char ( GeneralCategory(..), generalCategory, isPrint, isUpper )
-#else
-import Compat.Unicode ( GeneralCategory(..), generalCategory, isPrint, isUpper )
-#endif
+import Unicode ( GeneralCategory(..), generalCategory, isPrint, isUpper )
}
$unispace = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
-$whitechar = [\ \n\r\f\v\xa0 $unispace]
+$whitechar = [\ \n\r\f\v $unispace]
$white_no_nl = $whitechar # \n
$tab = \t
$digit = [$ascdigit $unidigit]
$special = [\(\)\,\;\[\]\`\{\}]
-$ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~ \xa1-\xbf \xd7 \xf7]
+$ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
$unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
$symbol = [$ascsymbol $unisymbol] # [$special \_\:\"\']
$unilarge = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
-$asclarge = [A-Z \xc0-\xd6 \xd8-\xde]
+$asclarge = [A-Z]
$large = [$asclarge $unilarge]
$unismall = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
-$ascsmall = [a-z \xdf-\xf6 \xf8-\xff]
+$ascsmall = [a-z]
$small = [$ascsmall $unismall \_]
$unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
-- NOTE: accept -} at the end of a LINE pragma, for compatibility
-- with older versions of GHC which generated these.
--- We only want RULES pragmas to be picked up when explicit forall
--- syntax is enabled is on, because the contents of the pragma always
--- uses it. If it's not on then we're sure to get a parse error.
--- (ToDo: we should really emit a warning when ignoring pragmas)
--- XXX Now that we can enable this without the -fglasgow-exts hammer,
--- is it better just to let the parse error happen?
-<0>
- "{-#" $whitechar* (RULES|rules) / { ifExtension explicitForallEnabled } { token ITrules_prag }
-
<0,option_prags> {
+ "{-#" $whitechar* (RULES|rules) { token ITrules_prag }
"{-#" $whitechar* (INLINE|inline) { token (ITinline_prag True) }
"{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
{ token (ITinline_prag False) }
$whitechar* (NO(T?)INLINE|no(t?)inline)
{ token (ITspec_inline_prag False) }
"{-#" $whitechar* (SOURCE|source) { token ITsource_prag }
+ "{-#" $whitechar* (WARNING|warning)
+ { token ITwarning_prag }
"{-#" $whitechar* (DEPRECATED|deprecated)
{ token ITdeprecated_prag }
"{-#" $whitechar* (SCC|scc) { token ITscc_prag }
"{-#" $whitechar* (CORE|core) { token ITcore_prag }
"{-#" $whitechar* (UNPACK|unpack) { token ITunpack_prag }
- "{-#" { nested_comment lexToken }
+ -- We ignore all these pragmas, but don't generate a warning for them
+ -- CFILES is a hugs-only thing.
+ "{-#" $whitechar* (OPTIONS_HUGS|options_hugs|OPTIONS_NHC98|options_nhc98|OPTIONS_JHC|options_jhc|CFILES|cfiles)
+ { nested_comment lexToken }
-- ToDo: should only be valid inside a pragma:
- "#-}" { token ITclose_prag}
+ "#-}" { endPrag }
}
<option_prags> {
}
<0> {
+ -- In the "0" mode we ignore these pragmas
+ "{-#" $whitechar* (OPTIONS|options|OPTIONS_GHC|options_ghc|OPTIONS_HADDOCK|options_haddock|LANGUAGE|language|INCLUDE|include)
+ { nested_comment lexToken }
+}
+
+<0> {
"-- #" .* ;
}
<0,option_prags> {
- -- This is to catch things like {-# OPTIONS OPTIONS_HUGS ...
- "{-#" $whitechar* $idchar+ { nested_comment lexToken }
+ "{-#" { warnThen Opt_WarnUnrecognisedPragmas (text "Unrecognised pragma")
+ (nested_comment lexToken) }
}
-- '0' state: ordinary lexemes
-- when trying to be close to Haskell98
<0> {
-- Normal integral literals (:: Num a => a, from Integer)
- @decimal { tok_num positive 0 0 decimal }
- 0[oO] @octal { tok_num positive 2 2 octal }
- 0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal }
+ @decimal { tok_num positive 0 0 decimal }
+ 0[oO] @octal { tok_num positive 2 2 octal }
+ 0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal }
-- Normal rational literals (:: Fractional a => a, from Rational)
- @floating_point { strtoken tok_float }
+ @floating_point { strtoken tok_float }
}
<0> {
- -- Unboxed ints (:: Int#)
+ -- Unboxed ints (:: Int#) and words (:: Word#)
-- It's simpler (and faster?) to give separate cases to the negatives,
-- especially considering octal/hexadecimal prefixes.
- @decimal \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
- 0[oO] @octal \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
- 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
- @negative @decimal \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
- @negative 0[oO] @octal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
+ @decimal \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
+ 0[oO] @octal \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
+ 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
+ @negative @decimal \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
+ @negative 0[oO] @octal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
@negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal }
+ @decimal \# \# / { ifExtension magicHashEnabled } { tok_primword 0 2 decimal }
+ 0[oO] @octal \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 octal }
+ 0[xX] @hexadecimal \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 hexadecimal }
+
-- Unboxed floats and doubles (:: Float#, :: Double#)
-- prim_{float,double} work with signed literals
@signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat }
| ITspec_inline_prag Bool -- SPECIALISE INLINE (or NOINLINE)
| ITsource_prag
| ITrules_prag
+ | ITwarning_prag
| ITdeprecated_prag
| ITline_prag
| ITscc_prag
| ITvocurly
| ITvccurly
| ITobrack
- | ITopabrack -- [:, for parallel arrays with -fparr
- | ITcpabrack -- :], for parallel arrays with -fparr
+ | ITopabrack -- [:, for parallel arrays with -XParr
+ | ITcpabrack -- :], for parallel arrays with -XParr
| ITcbrack
| IToparen
| ITcparen
| ITprimchar Char
| ITprimstring FastString
| ITprimint Integer
+ | ITprimword Integer
| ITprimfloat Rational
| ITprimdouble Rational
( "where", ITwhere, 0 ),
( "_scc_", ITscc, 0 ), -- ToDo: remove
- ( "forall", ITforall, bit explicitForallBit),
+ ( "forall", ITforall, bit explicitForallBit .|. bit inRulePragBit),
( "mdo", ITmdo, bit recursiveDoBit),
( "family", ITfamily, bit tyFamBit),
( "group", ITgroup, bit transformComprehensionsBit),
-- For data T (a::*) = MkT
,("*", ITstar, \i -> kindSigsEnabled i || tyFamEnabled i)
-- For 'forall a . t'
- ,(".", ITdot, explicitForallEnabled)
+ ,(".", ITdot, \i -> explicitForallEnabled i || inRulePrag i)
,("-<", ITlarrowtail, arrowsEnabled)
,(">-", ITrarrowtail, arrowsEnabled)
Just (_, _) -> lexDocComment input (ITdocSection n) True
Nothing -> do setInput input; lexToken -- eof reached, lex it normally
+-- RULES pragmas turn on the forall and '.' keywords, and we turn them
+-- off again at the end of the pragma.
+rulePrag :: Action
+rulePrag span buf len = do
+ setExts (.|. inRulePragBit)
+ return (L span ITrules_prag)
+
+endPrag :: Action
+endPrag span buf len = do
+ setExts (.&. complement (bit inRulePragBit))
+ return (L span ITclose_prag)
+
-- docCommentEnd
-------------------------------------------------------------------------------
-- This function is quite tricky. We can't just return a new token, we also
-- some conveniences for use with tok_integral
tok_num = tok_integral ITinteger
tok_primint = tok_integral ITprimint
+tok_primword = tok_integral ITprimword positive
positive = id
negative = negate
decimal = (10,octDecDigit)
c | isAny c -> do setInput inp; return c
_other -> lit_error
-isAny c | c > '\xff' = isPrint c
+isAny c | c > '\x7f' = isPrint c
| otherwise = is_any c
lex_escape :: P Char
'x' -> readNum is_hexdigit 16 hexDigit
'o' -> readNum is_octdigit 8 octDecDigit
- x | is_digit x -> readNum2 is_digit 10 octDecDigit (octDecDigit x)
+ x | is_decdigit x -> readNum2 is_decdigit 10 octDecDigit (octDecDigit x)
c1 -> do
i <- getInput
addWarning option srcspan warning
lexToken
+warnThen :: DynFlag -> SDoc -> Action -> Action
+warnThen option warning action srcspan buf len = do
+ addWarning option srcspan warning
+ action srcspan buf len
+
-- -----------------------------------------------------------------------------
-- The Parse Monad
getExts :: P Int
getExts = P $ \s -> POk s (extsBitmap s)
+setExts :: (Int -> Int) -> P ()
+setExts f = P $ \s -> POk s{ extsBitmap = f (extsBitmap s) } ()
+
setSrcLoc :: SrcLoc -> P ()
setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
adj_c
| c <= '\x06' = non_graphic
- | c <= '\xff' = c
+ | c <= '\x7f' = c
-- Alex doesn't handle Unicode, so when Unicode
-- character is encoutered we output these values
-- with the actual character value hidden in the state.
LowercaseLetter -> lower
TitlecaseLetter -> upper
ModifierLetter -> other_graphic
- OtherLetter -> other_graphic
+ OtherLetter -> lower -- see #1103
NonSpacingMark -> other_graphic
SpacingCombiningMark -> other_graphic
EnclosingMark -> other_graphic
getLexState = P $ \s@PState{ lex_state=ls:_ } -> POk s ls
-- for reasons of efficiency, flags indicating language extensions (eg,
--- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
+-- -fglasgow-exts or -XParr) are represented by a bitmap stored in an unboxed
-- integer
genericsBit, ffiBit, parrBit :: Int
-- (doesn't affect the lexer)
tyFamBit = 9 -- indexed type families: 'family' keyword and kind sigs
haddockBit = 10 -- Lex and parse Haddock comments
-magicHashBit = 11 -- # in both functions and operators
+magicHashBit = 11 -- "#" in both functions and operators
kindSigsBit = 12 -- Kind signatures on type variables
recursiveDoBit = 13 -- mdo
unicodeSyntaxBit = 14 -- the forall symbol, arrow symbols, etc
standaloneDerivingBit = 16 -- standalone instance deriving declarations
transformComprehensionsBit = 17
qqBit = 18 -- enable quasiquoting
+inRulePragBit = 19
genericsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
always _ = True
standaloneDerivingEnabled flags = testBit flags standaloneDerivingBit
transformComprehensionsEnabled flags = testBit flags transformComprehensionsBit
qqEnabled flags = testBit flags qqBit
+inRulePrag flags = testBit flags inRulePragBit
-- PState for parsing options pragmas
--
-pragState :: StringBuffer -> SrcLoc -> PState
-pragState buf loc =
+pragState :: DynFlags -> StringBuffer -> SrcLoc -> PState
+pragState dynflags buf loc =
PState {
- buffer = buf,
+ buffer = buf,
messages = emptyMessages,
- -- XXX defaultDynFlags is not right, but we don't have a real
- -- dflags handy
- dflags = defaultDynFlags,
+ dflags = dynflags,
last_loc = mkSrcSpan loc loc,
last_offs = 0,
last_len = 0,
.|. qqBit `setBitIf` dopt Opt_QuasiQuotes flags
.|. ipBit `setBitIf` dopt Opt_ImplicitParams flags
.|. explicitForallBit `setBitIf` dopt Opt_ScopedTypeVariables flags
+ .|. explicitForallBit `setBitIf` dopt Opt_LiberalTypeSynonyms flags
.|. explicitForallBit `setBitIf` dopt Opt_PolymorphicComponents flags
.|. explicitForallBit `setBitIf` dopt Opt_ExistentialQuantification flags
.|. explicitForallBit `setBitIf` dopt Opt_Rank2Types flags
-> Message
srcParseErr buf len
= hcat [ if null token
- then ptext SLIT("parse error (possibly incorrect indentation)")
- else hcat [ptext SLIT("parse error on input "),
+ then ptext (sLit "parse error (possibly incorrect indentation)")
+ else hcat [ptext (sLit "parse error on input "),
char '`', text token, char '\'']
]
where token = lexemeToString (offsetBytes (-len) buf) len