-- Note that Alex itself generates code with with some unused bindings and
-- without type signatures, so removing the flag might not be possible.
+{-# OPTIONS_GHC -funbox-strict-fields #-}
+
module Lexer (
Token(..), lexer, pragState, mkPState, PState(..),
P(..), ParseResult(..), getSrcLoc,
addWarning
) where
-#include "HsVersions.h"
-
import Bag
import ErrUtils
import Outputable
import Data.Ratio
import Debug.Trace
-#if __GLASGOW_HASKELL__ >= 605
-import Data.Char ( GeneralCategory(..), generalCategory, isPrint, isUpper )
-#else
-import Compat.Unicode ( GeneralCategory(..), generalCategory, isPrint, isUpper )
-#endif
+import Unicode ( GeneralCategory(..), generalCategory, isPrint, isUpper )
}
$unispace = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
-$whitechar = [\ \n\r\f\v\xa0 $unispace]
+$whitechar = [\ \n\r\f\v $unispace]
$white_no_nl = $whitechar # \n
$tab = \t
$digit = [$ascdigit $unidigit]
$special = [\(\)\,\;\[\]\`\{\}]
-$ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~ \xa1-\xbf \xd7 \xf7]
+$ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
$unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
$symbol = [$ascsymbol $unisymbol] # [$special \_\:\"\']
$unilarge = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
-$asclarge = [A-Z \xc0-\xd6 \xd8-\xde]
+$asclarge = [A-Z]
$large = [$asclarge $unilarge]
$unismall = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
-$ascsmall = [a-z \xdf-\xf6 \xf8-\xff]
+$ascsmall = [a-z]
$small = [$ascsmall $unismall \_]
$unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
$whitechar* (NO(T?)INLINE|no(t?)inline)
{ token (ITspec_inline_prag False) }
"{-#" $whitechar* (SOURCE|source) { token ITsource_prag }
+ "{-#" $whitechar* (WARNING|warning)
+ { token ITwarning_prag }
"{-#" $whitechar* (DEPRECATED|deprecated)
{ token ITdeprecated_prag }
"{-#" $whitechar* (SCC|scc) { token ITscc_prag }
-- when trying to be close to Haskell98
<0> {
-- Normal integral literals (:: Num a => a, from Integer)
- @decimal { tok_num positive 0 0 decimal }
- 0[oO] @octal { tok_num positive 2 2 octal }
- 0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal }
+ @decimal { tok_num positive 0 0 decimal }
+ 0[oO] @octal { tok_num positive 2 2 octal }
+ 0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal }
-- Normal rational literals (:: Fractional a => a, from Rational)
- @floating_point { strtoken tok_float }
+ @floating_point { strtoken tok_float }
}
<0> {
- -- Unboxed ints (:: Int#)
+ -- Unboxed ints (:: Int#) and words (:: Word#)
-- It's simpler (and faster?) to give separate cases to the negatives,
-- especially considering octal/hexadecimal prefixes.
- @decimal \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
- 0[oO] @octal \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
- 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
- @negative @decimal \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
- @negative 0[oO] @octal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
+ @decimal \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
+ 0[oO] @octal \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
+ 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
+ @negative @decimal \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
+ @negative 0[oO] @octal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
@negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal }
+ @decimal \# \# / { ifExtension magicHashEnabled } { tok_primword 0 2 decimal }
+ 0[oO] @octal \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 octal }
+ 0[xX] @hexadecimal \# \# / { ifExtension magicHashEnabled } { tok_primword 2 4 hexadecimal }
+
-- Unboxed floats and doubles (:: Float#, :: Double#)
-- prim_{float,double} work with signed literals
@signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat }
| ITspec_inline_prag Bool -- SPECIALISE INLINE (or NOINLINE)
| ITsource_prag
| ITrules_prag
+ | ITwarning_prag
| ITdeprecated_prag
| ITline_prag
| ITscc_prag
| ITvocurly
| ITvccurly
| ITobrack
- | ITopabrack -- [:, for parallel arrays with -fparr
- | ITcpabrack -- :], for parallel arrays with -fparr
+ | ITopabrack -- [:, for parallel arrays with -XParr
+ | ITcpabrack -- :], for parallel arrays with -XParr
| ITcbrack
| IToparen
| ITcparen
| ITprimchar Char
| ITprimstring FastString
| ITprimint Integer
+ | ITprimword Integer
| ITprimfloat Rational
| ITprimdouble Rational
-- some conveniences for use with tok_integral
tok_num = tok_integral ITinteger
tok_primint = tok_integral ITprimint
+tok_primword = tok_integral ITprimword positive
positive = id
negative = negate
decimal = (10,octDecDigit)
c | isAny c -> do setInput inp; return c
_other -> lit_error
-isAny c | c > '\xff' = isPrint c
+isAny c | c > '\x7f' = isPrint c
| otherwise = is_any c
lex_escape :: P Char
'x' -> readNum is_hexdigit 16 hexDigit
'o' -> readNum is_octdigit 8 octDecDigit
- x | is_digit x -> readNum2 is_digit 10 octDecDigit (octDecDigit x)
+ x | is_decdigit x -> readNum2 is_decdigit 10 octDecDigit (octDecDigit x)
c1 -> do
i <- getInput
adj_c
| c <= '\x06' = non_graphic
- | c <= '\xff' = c
+ | c <= '\x7f' = c
-- Alex doesn't handle Unicode, so when Unicode
-- character is encoutered we output these values
-- with the actual character value hidden in the state.
LowercaseLetter -> lower
TitlecaseLetter -> upper
ModifierLetter -> other_graphic
- OtherLetter -> other_graphic
+ OtherLetter -> lower -- see #1103
NonSpacingMark -> other_graphic
SpacingCombiningMark -> other_graphic
EnclosingMark -> other_graphic
getLexState = P $ \s@PState{ lex_state=ls:_ } -> POk s ls
-- for reasons of efficiency, flags indicating language extensions (eg,
--- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
+-- -fglasgow-exts or -XParr) are represented by a bitmap stored in an unboxed
-- integer
genericsBit, ffiBit, parrBit :: Int
-- (doesn't affect the lexer)
tyFamBit = 9 -- indexed type families: 'family' keyword and kind sigs
haddockBit = 10 -- Lex and parse Haddock comments
-magicHashBit = 11 -- # in both functions and operators
+magicHashBit = 11 -- "#" in both functions and operators
kindSigsBit = 12 -- Kind signatures on type variables
recursiveDoBit = 13 -- mdo
unicodeSyntaxBit = 14 -- the forall symbol, arrow symbols, etc
-- PState for parsing options pragmas
--
-pragState :: StringBuffer -> SrcLoc -> PState
-pragState buf loc =
+pragState :: DynFlags -> StringBuffer -> SrcLoc -> PState
+pragState dynflags buf loc =
PState {
- buffer = buf,
+ buffer = buf,
messages = emptyMessages,
- -- XXX defaultDynFlags is not right, but we don't have a real
- -- dflags handy
- dflags = defaultDynFlags,
+ dflags = dynflags,
last_loc = mkSrcSpan loc loc,
last_offs = 0,
last_len = 0,
-> Message
srcParseErr buf len
= hcat [ if null token
- then ptext SLIT("parse error (possibly incorrect indentation)")
- else hcat [ptext SLIT("parse error on input "),
+ then ptext (sLit "parse error (possibly incorrect indentation)")
+ else hcat [ptext (sLit "parse error on input "),
char '`', text token, char '\'']
]
where token = lexemeToString (offsetBytes (-len) buf) len