1 -----------------------------------------------------------------------------
2 -- (c) The University of Glasgow, 2006
6 -- This is a combination of an Alex-generated lexer from a regex
7 -- definition, with some hand-coded bits.
9 -- Completely accurate information about token-spans within the source
10 -- file is maintained. Every token has a start and end SrcLoc attached to it.
12 -----------------------------------------------------------------------------
16 -- - parsing integers is a bit slow
17 -- - readRational is a bit slow
19 -- Known bugs, that were also in the previous version:
20 -- - M... should be 3 tokens, not 1.
21 -- - pragma-end should be only valid in a pragma
25 -- The above warning supression flag is a temporary kludge.
26 -- While working on this module you are encouraged to remove it and fix
27 -- any warnings in the module. See
28 -- http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
31 -- Note that Alex itself generates code with with some unused bindings and
32 -- without type signatures, so removing the flag might not be possible.
35 Token(..), lexer, pragState, mkPState, PState(..),
36 P(..), ParseResult(..), getSrcLoc,
37 failLocMsgP, failSpanMsgP, srcParseFail,
39 popContext, pushCurrentContext, setLastToken, setSrcLoc,
40 getLexState, popLexState, pushLexState,
41 extension, standaloneDerivingEnabled, bangPatEnabled,
55 import Util ( maybePrefixMatch, readRational )
59 import Data.Char ( chr, ord, isSpace )
63 #if __GLASGOW_HASKELL__ >= 605
64 import Data.Char ( GeneralCategory(..), generalCategory, isPrint, isUpper )
66 import Compat.Unicode ( GeneralCategory(..), generalCategory, isPrint, isUpper )
70 $unispace = \x05 -- Trick Alex into handling Unicode. See alexGetChar.
71 $whitechar = [\ \n\r\f\v\xa0 $unispace]
72 $white_no_nl = $whitechar # \n
76 $unidigit = \x03 -- Trick Alex into handling Unicode. See alexGetChar.
77 $decdigit = $ascdigit -- for now, should really be $digit (ToDo)
78 $digit = [$ascdigit $unidigit]
80 $special = [\(\)\,\;\[\]\`\{\}]
81 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~ \xa1-\xbf \xd7 \xf7]
82 $unisymbol = \x04 -- Trick Alex into handling Unicode. See alexGetChar.
83 $symbol = [$ascsymbol $unisymbol] # [$special \_\:\"\']
85 $unilarge = \x01 -- Trick Alex into handling Unicode. See alexGetChar.
86 $asclarge = [A-Z \xc0-\xd6 \xd8-\xde]
87 $large = [$asclarge $unilarge]
89 $unismall = \x02 -- Trick Alex into handling Unicode. See alexGetChar.
90 $ascsmall = [a-z \xdf-\xf6 \xf8-\xff]
91 $small = [$ascsmall $unismall \_]
93 $unigraphic = \x06 -- Trick Alex into handling Unicode. See alexGetChar.
94 $graphic = [$small $large $symbol $digit $special $unigraphic \:\"\']
97 $hexit = [$decdigit A-F a-f]
98 $symchar = [$symbol \:]
100 $idchar = [$small $large $digit \']
102 $docsym = [\| \^ \* \$]
104 @varid = $small $idchar*
105 @conid = $large $idchar*
107 @varsym = $symbol $symchar*
108 @consym = \: $symchar*
110 @decimal = $decdigit+
112 @hexadecimal = $hexit+
113 @exponent = [eE] [\-\+]? @decimal
115 -- we support the hierarchical module name extension:
118 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
120 -- normal signed numerical literals can only be explicitly negative,
121 -- not explicitly positive (contrast @exponent)
123 @signed = @negative ?
127 -- everywhere: skip whitespace and comments
129 $tab+ { warn Opt_WarnTabs (text "Tab character") }
131 -- Everywhere: deal with nested comments. We explicitly rule out
132 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
133 -- (this can happen even though pragmas will normally take precedence due to
134 -- longest-match, because pragmas aren't valid in every state, but comments
135 -- are). We also rule out nested Haddock comments, if the -haddock flag is
138 "{-" / { isNormalComment } { nested_comment lexToken }
140 -- Single-line comments are a bit tricky. Haskell 98 says that two or
141 -- more dashes followed by a symbol should be parsed as a varsym, so we
142 -- have to exclude those.
144 -- Since Haddock comments aren't valid in every state, we need to rule them
147 -- The following two rules match comments that begin with two dashes, but
148 -- continue with a different character. The rules test that this character
149 -- is not a symbol (in which case we'd have a varsym), and that it's not a
150 -- space followed by a Haddock comment symbol (docsym) (in which case we'd
151 -- have a Haddock comment). The rules then munch the rest of the line.
153 "-- " ~[$docsym \#] .* ;
154 "--" [^$symbol : \ ] .* ;
156 -- Next, match Haddock comments if no -haddock flag
158 "-- " [$docsym \#] .* / { ifExtension (not . haddockEnabled) } ;
160 -- Now, when we've matched comments that begin with 2 dashes and continue
161 -- with a different character, we need to match comments that begin with three
162 -- or more dashes (which clearly can't be Haddock comments). We only need to
163 -- make sure that the first non-dash character isn't a symbol, and munch the
166 "---"\-* [^$symbol :] .* ;
168 -- Since the previous rules all match dashes followed by at least one
169 -- character, we also need to match a whole line filled with just dashes.
171 "--"\-* / { atEOL } ;
173 -- We need this rule since none of the other single line comment rules
174 -- actually match this case.
178 -- 'bol' state: beginning of a line. Slurp up all the whitespace (including
179 -- blank lines) until we find a non-whitespace character, then do layout
182 -- One slight wibble here: what if the line begins with {-#? In
183 -- theory, we have to lex the pragma to see if it's one we recognise,
184 -- and if it is, then we backtrack and do_bol, otherwise we treat it
185 -- as a nested comment. We don't bother with this: if the line begins
186 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
189 ^\# (line)? { begin line_prag1 }
190 ^\# pragma .* \n ; -- GCC 3.3 CPP generated, apparently
191 ^\# \! .* \n ; -- #!, for scripts
195 -- after a layout keyword (let, where, do, of), we begin a new layout
196 -- context if the curly brace is missing.
197 -- Careful! This stuff is quite delicate.
198 <layout, layout_do> {
199 \{ / { notFollowedBy '-' } { pop_and open_brace }
200 -- we might encounter {-# here, but {- has been handled already
202 ^\# (line)? { begin line_prag1 }
205 -- do is treated in a subtly different way, see new_layout_context
206 <layout> () { new_layout_context True }
207 <layout_do> () { new_layout_context False }
209 -- after a new layout context which was found to be to the left of the
210 -- previous context, we have generated a '{' token, and we now need to
211 -- generate a matching '}' token.
212 <layout_left> () { do_layout_left }
214 <0,option_prags> \n { begin bol }
216 "{-#" $whitechar* (line|LINE) { begin line_prag2 }
218 -- single-line line pragmas, of the form
219 -- # <line> "<file>" <extra-stuff> \n
220 <line_prag1> $decdigit+ { setLine line_prag1a }
221 <line_prag1a> \" [$graphic \ ]* \" { setFile line_prag1b }
222 <line_prag1b> .* { pop }
224 -- Haskell-style line pragmas, of the form
225 -- {-# LINE <line> "<file>" #-}
226 <line_prag2> $decdigit+ { setLine line_prag2a }
227 <line_prag2a> \" [$graphic \ ]* \" { setFile line_prag2b }
228 <line_prag2b> "#-}"|"-}" { pop }
229 -- NOTE: accept -} at the end of a LINE pragma, for compatibility
230 -- with older versions of GHC which generated these.
232 -- We only want RULES pragmas to be picked up when explicit forall
233 -- syntax is enabled is on, because the contents of the pragma always
234 -- uses it. If it's not on then we're sure to get a parse error.
235 -- (ToDo: we should really emit a warning when ignoring pragmas)
236 -- XXX Now that we can enable this without the -fglasgow-exts hammer,
237 -- is it better just to let the parse error happen?
239 "{-#" $whitechar* (RULES|rules) / { ifExtension explicitForallEnabled } { token ITrules_prag }
242 "{-#" $whitechar* (INLINE|inline) { token (ITinline_prag True) }
243 "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
244 { token (ITinline_prag False) }
245 "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
246 { token ITspec_prag }
247 "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
248 $whitechar* (INLINE|inline) { token (ITspec_inline_prag True) }
249 "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
250 $whitechar* (NO(T?)INLINE|no(t?)inline)
251 { token (ITspec_inline_prag False) }
252 "{-#" $whitechar* (SOURCE|source) { token ITsource_prag }
253 "{-#" $whitechar* (DEPRECATED|deprecated)
254 { token ITdeprecated_prag }
255 "{-#" $whitechar* (SCC|scc) { token ITscc_prag }
256 "{-#" $whitechar* (GENERATED|generated)
257 { token ITgenerated_prag }
258 "{-#" $whitechar* (CORE|core) { token ITcore_prag }
259 "{-#" $whitechar* (UNPACK|unpack) { token ITunpack_prag }
261 "{-#" { nested_comment lexToken }
263 -- ToDo: should only be valid inside a pragma:
264 "#-}" { token ITclose_prag}
268 "{-#" $whitechar* (OPTIONS|options) { lex_string_prag IToptions_prag }
269 "{-#" $whitechar* (OPTIONS_GHC|options_ghc)
270 { lex_string_prag IToptions_prag }
271 "{-#" $whitechar* (OPTIONS_HADDOCK|options_haddock)
272 { lex_string_prag ITdocOptions }
273 "-- #" { multiline_doc_comment }
274 "{-#" $whitechar* (LANGUAGE|language) { token ITlanguage_prag }
275 "{-#" $whitechar* (INCLUDE|include) { lex_string_prag ITinclude_prag }
283 -- This is to catch things like {-# OPTIONS OPTIONS_HUGS ...
284 "{-#" $whitechar* $idchar+ { nested_comment lexToken }
287 -- '0' state: ordinary lexemes
292 "-- " $docsym / { ifExtension haddockEnabled } { multiline_doc_comment }
293 "{-" \ ? $docsym / { ifExtension haddockEnabled } { nested_doc_comment }
299 "[:" / { ifExtension parrEnabled } { token ITopabrack }
300 ":]" / { ifExtension parrEnabled } { token ITcpabrack }
304 "[|" / { ifExtension thEnabled } { token ITopenExpQuote }
305 "[e|" / { ifExtension thEnabled } { token ITopenExpQuote }
306 "[p|" / { ifExtension thEnabled } { token ITopenPatQuote }
307 "[d|" / { ifExtension thEnabled } { layout_token ITopenDecQuote }
308 "[t|" / { ifExtension thEnabled } { token ITopenTypQuote }
309 "|]" / { ifExtension thEnabled } { token ITcloseQuote }
310 \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
311 "$(" / { ifExtension thEnabled } { token ITparenEscape }
313 "[$" @varid "|" / { ifExtension qqEnabled }
314 { lex_quasiquote_tok }
318 "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
319 { special IToparenbar }
320 "|)" / { ifExtension arrowsEnabled } { special ITcparenbar }
324 \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
328 "(#" / { ifExtension unboxedTuplesEnabled `alexAndPred` notFollowedBySymbol }
329 { token IToubxparen }
330 "#)" / { ifExtension unboxedTuplesEnabled }
331 { token ITcubxparen }
335 "{|" / { ifExtension genericsEnabled } { token ITocurlybar }
336 "|}" / { ifExtension genericsEnabled } { token ITccurlybar }
340 \( { special IToparen }
341 \) { special ITcparen }
342 \[ { special ITobrack }
343 \] { special ITcbrack }
344 \, { special ITcomma }
345 \; { special ITsemi }
346 \` { special ITbackquote }
353 @qual @varid { idtoken qvarid }
354 @qual @conid { idtoken qconid }
356 @conid { idtoken conid }
360 @qual @varid "#"+ / { ifExtension magicHashEnabled } { idtoken qvarid }
361 @qual @conid "#"+ / { ifExtension magicHashEnabled } { idtoken qconid }
362 @varid "#"+ / { ifExtension magicHashEnabled } { varid }
363 @conid "#"+ / { ifExtension magicHashEnabled } { idtoken conid }
369 @qual @varsym { idtoken qvarsym }
370 @qual @consym { idtoken qconsym }
375 -- For the normal boxed literals we need to be careful
376 -- when trying to be close to Haskell98
378 -- Normal integral literals (:: Num a => a, from Integer)
379 @decimal { tok_num positive 0 0 decimal }
380 0[oO] @octal { tok_num positive 2 2 octal }
381 0[xX] @hexadecimal { tok_num positive 2 2 hexadecimal }
383 -- Normal rational literals (:: Fractional a => a, from Rational)
384 @floating_point { strtoken tok_float }
388 -- Unboxed ints (:: Int#)
389 -- It's simpler (and faster?) to give separate cases to the negatives,
390 -- especially considering octal/hexadecimal prefixes.
391 @decimal \# / { ifExtension magicHashEnabled } { tok_primint positive 0 1 decimal }
392 0[oO] @octal \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 octal }
393 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint positive 2 3 hexadecimal }
394 @negative @decimal \# / { ifExtension magicHashEnabled } { tok_primint negative 1 2 decimal }
395 @negative 0[oO] @octal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 octal }
396 @negative 0[xX] @hexadecimal \# / { ifExtension magicHashEnabled } { tok_primint negative 3 4 hexadecimal }
398 -- Unboxed floats and doubles (:: Float#, :: Double#)
399 -- prim_{float,double} work with signed literals
400 @signed @floating_point \# / { ifExtension magicHashEnabled } { init_strtoken 1 tok_primfloat }
401 @signed @floating_point \# \# / { ifExtension magicHashEnabled } { init_strtoken 2 tok_primdouble }
404 -- Strings and chars are lexed by hand-written code. The reason is
405 -- that even if we recognise the string or char here in the regex
406 -- lexer, we would still have to parse the string afterward in order
407 -- to convert it to a String.
410 \" { lex_string_tok }
414 -- -----------------------------------------------------------------------------
418 = ITas -- Haskell keywords
442 | ITscc -- ToDo: remove (we use {-# SCC "..." #-} now)
444 | ITforall -- GHC extension keywords
462 | ITinline_prag Bool -- True <=> INLINE, False <=> NOINLINE
463 | ITspec_prag -- SPECIALISE
464 | ITspec_inline_prag Bool -- SPECIALISE INLINE (or NOINLINE)
471 | ITcore_prag -- hdaume: core annotations
474 | IToptions_prag String
475 | ITinclude_prag String
478 | ITdotdot -- reserved symbols
494 | ITbiglam -- GHC-extension symbols
496 | ITocurly -- special symbols
498 | ITocurlybar -- {|, for type applications
499 | ITccurlybar -- |}, for type applications
503 | ITopabrack -- [:, for parallel arrays with -fparr
504 | ITcpabrack -- :], for parallel arrays with -fparr
515 | ITvarid FastString -- identifiers
517 | ITvarsym FastString
518 | ITconsym FastString
519 | ITqvarid (FastString,FastString)
520 | ITqconid (FastString,FastString)
521 | ITqvarsym (FastString,FastString)
522 | ITqconsym (FastString,FastString)
524 | ITdupipvarid FastString -- GHC extension: implicit param: ?x
526 | ITpragma StringBuffer
529 | ITstring FastString
531 | ITrational Rational
534 | ITprimstring FastString
536 | ITprimfloat Rational
537 | ITprimdouble Rational
539 -- MetaHaskell extension tokens
540 | ITopenExpQuote -- [| or [e|
541 | ITopenPatQuote -- [p|
542 | ITopenDecQuote -- [d|
543 | ITopenTypQuote -- [t|
545 | ITidEscape FastString -- $x
546 | ITparenEscape -- $(
549 | ITquasiQuote (FastString,FastString,SrcSpan) -- [:...|...|]
551 -- Arrow notation extension
558 | ITLarrowtail -- -<<
559 | ITRarrowtail -- >>-
561 | ITunknown String -- Used when the lexer can't make sense of it
562 | ITeof -- end of file token
564 -- Documentation annotations
565 | ITdocCommentNext String -- something beginning '-- |'
566 | ITdocCommentPrev String -- something beginning '-- ^'
567 | ITdocCommentNamed String -- something beginning '-- $'
568 | ITdocSection Int String -- a section heading
569 | ITdocOptions String -- doc options (prune, ignore-exports, etc)
570 | ITdocOptionsOld String -- doc options declared "-- # ..."-style
573 deriving Show -- debugging
577 isSpecial :: Token -> Bool
578 -- If we see M.x, where x is a keyword, but
579 -- is special, we treat is as just plain M.x,
581 isSpecial ITas = True
582 isSpecial IThiding = True
583 isSpecial ITqualified = True
584 isSpecial ITforall = True
585 isSpecial ITexport = True
586 isSpecial ITlabel = True
587 isSpecial ITdynamic = True
588 isSpecial ITsafe = True
589 isSpecial ITthreadsafe = True
590 isSpecial ITunsafe = True
591 isSpecial ITccallconv = True
592 isSpecial ITstdcallconv = True
593 isSpecial ITmdo = True
594 isSpecial ITfamily = True
595 isSpecial ITgroup = True
596 isSpecial ITby = True
597 isSpecial ITusing = True
601 -- the bitmap provided as the third component indicates whether the
602 -- corresponding extension keyword is valid under the extension options
603 -- provided to the compiler; if the extension corresponding to *any* of the
604 -- bits set in the bitmap is enabled, the keyword is valid (this setup
605 -- facilitates using a keyword in two different extensions that can be
606 -- activated independently)
608 reservedWordsFM = listToUFM $
609 map (\(x, y, z) -> (mkFastString x, (y, z)))
610 [( "_", ITunderscore, 0 ),
612 ( "case", ITcase, 0 ),
613 ( "class", ITclass, 0 ),
614 ( "data", ITdata, 0 ),
615 ( "default", ITdefault, 0 ),
616 ( "deriving", ITderiving, 0 ),
618 ( "else", ITelse, 0 ),
619 ( "hiding", IThiding, 0 ),
621 ( "import", ITimport, 0 ),
623 ( "infix", ITinfix, 0 ),
624 ( "infixl", ITinfixl, 0 ),
625 ( "infixr", ITinfixr, 0 ),
626 ( "instance", ITinstance, 0 ),
628 ( "module", ITmodule, 0 ),
629 ( "newtype", ITnewtype, 0 ),
631 ( "qualified", ITqualified, 0 ),
632 ( "then", ITthen, 0 ),
633 ( "type", ITtype, 0 ),
634 ( "where", ITwhere, 0 ),
635 ( "_scc_", ITscc, 0 ), -- ToDo: remove
637 ( "forall", ITforall, bit explicitForallBit),
638 ( "mdo", ITmdo, bit recursiveDoBit),
639 ( "family", ITfamily, bit tyFamBit),
640 ( "group", ITgroup, bit transformComprehensionsBit),
641 ( "by", ITby, bit transformComprehensionsBit),
642 ( "using", ITusing, bit transformComprehensionsBit),
644 ( "foreign", ITforeign, bit ffiBit),
645 ( "export", ITexport, bit ffiBit),
646 ( "label", ITlabel, bit ffiBit),
647 ( "dynamic", ITdynamic, bit ffiBit),
648 ( "safe", ITsafe, bit ffiBit),
649 ( "threadsafe", ITthreadsafe, bit ffiBit),
650 ( "unsafe", ITunsafe, bit ffiBit),
651 ( "stdcall", ITstdcallconv, bit ffiBit),
652 ( "ccall", ITccallconv, bit ffiBit),
653 ( "dotnet", ITdotnet, bit ffiBit),
655 ( "rec", ITrec, bit arrowsBit),
656 ( "proc", ITproc, bit arrowsBit)
659 reservedSymsFM :: UniqFM (Token, Int -> Bool)
660 reservedSymsFM = listToUFM $
661 map (\ (x,y,z) -> (mkFastString x,(y,z)))
662 [ ("..", ITdotdot, always)
663 -- (:) is a reserved op, meaning only list cons
664 ,(":", ITcolon, always)
665 ,("::", ITdcolon, always)
666 ,("=", ITequal, always)
667 ,("\\", ITlam, always)
668 ,("|", ITvbar, always)
669 ,("<-", ITlarrow, always)
670 ,("->", ITrarrow, always)
672 ,("~", ITtilde, always)
673 ,("=>", ITdarrow, always)
674 ,("-", ITminus, always)
675 ,("!", ITbang, always)
677 -- For data T (a::*) = MkT
678 ,("*", ITstar, \i -> kindSigsEnabled i || tyFamEnabled i)
679 -- For 'forall a . t'
680 ,(".", ITdot, explicitForallEnabled)
682 ,("-<", ITlarrowtail, arrowsEnabled)
683 ,(">-", ITrarrowtail, arrowsEnabled)
684 ,("-<<", ITLarrowtail, arrowsEnabled)
685 ,(">>-", ITRarrowtail, arrowsEnabled)
687 #if __GLASGOW_HASKELL__ >= 605
688 ,("∷", ITdcolon, unicodeSyntaxEnabled)
689 ,("⇒", ITdarrow, unicodeSyntaxEnabled)
690 ,("∀", ITforall, \i -> unicodeSyntaxEnabled i &&
691 explicitForallEnabled i)
692 ,("→", ITrarrow, unicodeSyntaxEnabled)
693 ,("←", ITlarrow, unicodeSyntaxEnabled)
694 ,("⋯", ITdotdot, unicodeSyntaxEnabled)
695 -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
696 -- form part of a large operator. This would let us have a better
697 -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
701 -- -----------------------------------------------------------------------------
704 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
706 special :: Token -> Action
707 special tok span _buf _len = return (L span tok)
709 token, layout_token :: Token -> Action
710 token t span _buf _len = return (L span t)
711 layout_token t span _buf _len = pushLexState layout >> return (L span t)
713 idtoken :: (StringBuffer -> Int -> Token) -> Action
714 idtoken f span buf len = return (L span $! (f buf len))
716 skip_one_varid :: (FastString -> Token) -> Action
717 skip_one_varid f span buf len
718 = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
720 strtoken :: (String -> Token) -> Action
721 strtoken f span buf len =
722 return (L span $! (f $! lexemeToString buf len))
724 init_strtoken :: Int -> (String -> Token) -> Action
725 -- like strtoken, but drops the last N character(s)
726 init_strtoken drop f span buf len =
727 return (L span $! (f $! lexemeToString buf (len-drop)))
729 begin :: Int -> Action
730 begin code _span _str _len = do pushLexState code; lexToken
733 pop _span _buf _len = do popLexState; lexToken
735 pop_and :: Action -> Action
736 pop_and act span buf len = do popLexState; act span buf len
738 {-# INLINE nextCharIs #-}
739 nextCharIs buf p = not (atEnd buf) && p (currentChar buf)
741 notFollowedBy char _ _ _ (AI _ _ buf)
742 = nextCharIs buf (/=char)
744 notFollowedBySymbol _ _ _ (AI _ _ buf)
745 = nextCharIs buf (`notElem` "!#$%&*+./<=>?@\\^|-~")
747 -- We must reject doc comments as being ordinary comments everywhere.
748 -- In some cases the doc comment will be selected as the lexeme due to
749 -- maximal munch, but not always, because the nested comment rule is
750 -- valid in all states, but the doc-comment rules are only valid in
751 -- the non-layout states.
752 isNormalComment bits _ _ (AI _ _ buf)
753 | haddockEnabled bits = notFollowedByDocOrPragma
754 | otherwise = nextCharIs buf (/='#')
756 notFollowedByDocOrPragma
757 = not $ spaceAndP buf (`nextCharIs` (`elem` "|^*$#"))
759 spaceAndP buf p = p buf || nextCharIs buf (==' ') && p (snd (nextChar buf))
762 haddockDisabledAnd p bits _ _ (AI _ _ buf)
763 = if haddockEnabled bits then False else (p buf)
766 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
768 ifExtension pred bits _ _ _ = pred bits
770 multiline_doc_comment :: Action
771 multiline_doc_comment span buf _len = withLexedDocType (worker "")
773 worker commentAcc input docType oneLine = case alexGetChar input of
775 | oneLine -> docCommentEnd input commentAcc docType buf span
776 | otherwise -> case checkIfCommentLine input' of
777 Just input -> worker ('\n':commentAcc) input docType False
778 Nothing -> docCommentEnd input commentAcc docType buf span
779 Just (c, input) -> worker (c:commentAcc) input docType oneLine
780 Nothing -> docCommentEnd input commentAcc docType buf span
782 checkIfCommentLine input = check (dropNonNewlineSpace input)
784 check input = case alexGetChar input of
785 Just ('-', input) -> case alexGetChar input of
786 Just ('-', input) -> case alexGetChar input of
787 Just (c, _) | c /= '-' -> Just input
792 dropNonNewlineSpace input = case alexGetChar input of
794 | isSpace c && c /= '\n' -> dropNonNewlineSpace input'
799 nested comments require traversing by hand, they can't be parsed
800 using regular expressions.
802 nested_comment :: P (Located Token) -> Action
803 nested_comment cont span _str _len = do
807 go 0 input = do setInput input; cont
808 go n input = case alexGetChar input of
809 Nothing -> errBrace input span
810 Just ('-',input) -> case alexGetChar input of
811 Nothing -> errBrace input span
812 Just ('\125',input) -> go (n-1) input
813 Just (_,_) -> go n input
814 Just ('\123',input) -> case alexGetChar input of
815 Nothing -> errBrace input span
816 Just ('-',input) -> go (n+1) input
817 Just (_,_) -> go n input
818 Just (_,input) -> go n input
820 nested_doc_comment :: Action
821 nested_doc_comment span buf _len = withLexedDocType (go "")
823 go commentAcc input docType _ = case alexGetChar input of
824 Nothing -> errBrace input span
825 Just ('-',input) -> case alexGetChar input of
826 Nothing -> errBrace input span
827 Just ('\125',input) ->
828 docCommentEnd input commentAcc docType buf span
829 Just (_,_) -> go ('-':commentAcc) input docType False
830 Just ('\123', input) -> case alexGetChar input of
831 Nothing -> errBrace input span
832 Just ('-',input) -> do
834 let cont = do input <- getInput; go commentAcc input docType False
835 nested_comment cont span buf _len
836 Just (_,_) -> go ('\123':commentAcc) input docType False
837 Just (c,input) -> go (c:commentAcc) input docType False
839 withLexedDocType lexDocComment = do
840 input@(AI _ _ buf) <- getInput
841 case prevChar buf ' ' of
842 '|' -> lexDocComment input ITdocCommentNext False
843 '^' -> lexDocComment input ITdocCommentPrev False
844 '$' -> lexDocComment input ITdocCommentNamed False
845 '*' -> lexDocSection 1 input
846 '#' -> lexDocComment input ITdocOptionsOld False
848 lexDocSection n input = case alexGetChar input of
849 Just ('*', input) -> lexDocSection (n+1) input
850 Just (_, _) -> lexDocComment input (ITdocSection n) True
851 Nothing -> do setInput input; lexToken -- eof reached, lex it normally
854 -------------------------------------------------------------------------------
855 -- This function is quite tricky. We can't just return a new token, we also
856 -- need to update the state of the parser. Why? Because the token is longer
857 -- than what was lexed by Alex, and the lexToken function doesn't know this, so
858 -- it writes the wrong token length to the parser state. This function is
859 -- called afterwards, so it can just update the state.
861 -- This is complicated by the fact that Haddock tokens can span multiple lines,
862 -- which is something that the original lexer didn't account for.
863 -- I have added last_line_len in the parser state which represents the length
864 -- of the part of the token that is on the last line. It is now used for layout
865 -- calculation in pushCurrentContext instead of last_len. last_len is, like it
866 -- was before, the full length of the token, and it is now only used for error
869 docCommentEnd :: AlexInput -> String -> (String -> Token) -> StringBuffer ->
870 SrcSpan -> P (Located Token)
871 docCommentEnd input commentAcc docType buf span = do
873 let (AI loc last_offs nextBuf) = input
874 comment = reverse commentAcc
875 span' = mkSrcSpan (srcSpanStart span) loc
876 last_len = byteDiff buf nextBuf
878 last_line_len = if (last_offs - last_len < 0)
882 span `seq` setLastToken span' last_len last_line_len
883 return (L span' (docType comment))
885 errBrace (AI end _ _) span = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
887 open_brace, close_brace :: Action
888 open_brace span _str _len = do
890 setContext (NoLayout:ctx)
891 return (L span ITocurly)
892 close_brace span _str _len = do
894 return (L span ITccurly)
896 qvarid buf len = ITqvarid $! splitQualName buf len
897 qconid buf len = ITqconid $! splitQualName buf len
899 splitQualName :: StringBuffer -> Int -> (FastString,FastString)
900 -- takes a StringBuffer and a length, and returns the module name
901 -- and identifier parts of a qualified name. Splits at the *last* dot,
902 -- because of hierarchical module names.
903 splitQualName orig_buf len = split orig_buf orig_buf
906 | orig_buf `byteDiff` buf >= len = done dot_buf
907 | c == '.' = found_dot buf'
908 | otherwise = split buf' dot_buf
910 (c,buf') = nextChar buf
912 -- careful, we might get names like M....
913 -- so, if the character after the dot is not upper-case, this is
914 -- the end of the qualifier part.
915 found_dot buf -- buf points after the '.'
916 | isUpper c = split buf' buf
917 | otherwise = done buf
919 (c,buf') = nextChar buf
922 (lexemeToFastString orig_buf (qual_size - 1),
923 lexemeToFastString dot_buf (len - qual_size))
925 qual_size = orig_buf `byteDiff` dot_buf
929 case lookupUFM reservedWordsFM fs of
930 Just (keyword,0) -> do
932 return (L span keyword)
933 Just (keyword,exts) -> do
934 b <- extension (\i -> exts .&. i /= 0)
935 if b then do maybe_layout keyword
936 return (L span keyword)
937 else return (L span (ITvarid fs))
938 _other -> return (L span (ITvarid fs))
940 fs = lexemeToFastString buf len
942 conid buf len = ITconid fs
943 where fs = lexemeToFastString buf len
945 qvarsym buf len = ITqvarsym $! splitQualName buf len
946 qconsym buf len = ITqconsym $! splitQualName buf len
948 varsym = sym ITvarsym
949 consym = sym ITconsym
951 sym con span buf len =
952 case lookupUFM reservedSymsFM fs of
953 Just (keyword,exts) -> do
955 if b then return (L span keyword)
956 else return (L span $! con fs)
957 _other -> return (L span $! con fs)
959 fs = lexemeToFastString buf len
961 -- Variations on the integral numeric literal.
962 tok_integral :: (Integer -> Token)
963 -> (Integer -> Integer)
964 -- -> (StringBuffer -> StringBuffer) -> (Int -> Int)
966 -> (Integer, (Char->Int)) -> Action
967 tok_integral itint transint transbuf translen (radix,char_to_int) span buf len =
968 return $ L span $ itint $! transint $ parseUnsignedInteger
969 (offsetBytes transbuf buf) (subtract translen len) radix char_to_int
971 -- some conveniences for use with tok_integral
972 tok_num = tok_integral ITinteger
973 tok_primint = tok_integral ITprimint
976 decimal = (10,octDecDigit)
977 octal = (8,octDecDigit)
978 hexadecimal = (16,hexDigit)
980 -- readRational can understand negative rationals, exponents, everything.
981 tok_float str = ITrational $! readRational str
982 tok_primfloat str = ITprimfloat $! readRational str
983 tok_primdouble str = ITprimdouble $! readRational str
985 -- -----------------------------------------------------------------------------
988 -- we're at the first token on a line, insert layout tokens if necessary
990 do_bol span _str _len = do
994 --trace "layout: inserting '}'" $ do
996 -- do NOT pop the lex state, we might have a ';' to insert
997 return (L span ITvccurly)
999 --trace "layout: inserting ';'" $ do
1001 return (L span ITsemi)
1006 -- certain keywords put us in the "layout" state, where we might
1007 -- add an opening curly brace.
1008 maybe_layout ITdo = pushLexState layout_do
1009 maybe_layout ITmdo = pushLexState layout_do
1010 maybe_layout ITof = pushLexState layout
1011 maybe_layout ITlet = pushLexState layout
1012 maybe_layout ITwhere = pushLexState layout
1013 maybe_layout ITrec = pushLexState layout
1014 maybe_layout _ = return ()
1016 -- Pushing a new implicit layout context. If the indentation of the
1017 -- next token is not greater than the previous layout context, then
1018 -- Haskell 98 says that the new layout context should be empty; that is
1019 -- the lexer must generate {}.
1021 -- We are slightly more lenient than this: when the new context is started
1022 -- by a 'do', then we allow the new context to be at the same indentation as
1023 -- the previous context. This is what the 'strict' argument is for.
1025 new_layout_context strict span _buf _len = do
1027 (AI _ offset _) <- getInput
1030 Layout prev_off : _ |
1031 (strict && prev_off >= offset ||
1032 not strict && prev_off > offset) -> do
1033 -- token is indented to the left of the previous context.
1034 -- we must generate a {} sequence now.
1035 pushLexState layout_left
1036 return (L span ITvocurly)
1038 setContext (Layout offset : ctx)
1039 return (L span ITvocurly)
1041 do_layout_left span _buf _len = do
1043 pushLexState bol -- we must be at the start of a line
1044 return (L span ITvccurly)
1046 -- -----------------------------------------------------------------------------
1049 setLine :: Int -> Action
1050 setLine code span buf len = do
1051 let line = parseUnsignedInteger buf len 10 octDecDigit
1052 setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
1053 -- subtract one: the line number refers to the *following* line
1058 setFile :: Int -> Action
1059 setFile code span buf len = do
1060 let file = lexemeToFastString (stepOn buf) (len-2)
1061 setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
1067 -- -----------------------------------------------------------------------------
1068 -- Options, includes and language pragmas.
1070 lex_string_prag :: (String -> Token) -> Action
1071 lex_string_prag mkTok span _buf _len
1072 = do input <- getInput
1076 return (L (mkSrcSpan start end) tok)
1078 = if isString input "#-}"
1079 then do setInput input
1080 return (mkTok (reverse acc))
1081 else case alexGetChar input of
1082 Just (c,i) -> go (c:acc) i
1083 Nothing -> err input
1084 isString _ [] = True
1086 = case alexGetChar i of
1087 Just (c,i') | c == x -> isString i' xs
1089 err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
1092 -- -----------------------------------------------------------------------------
1095 -- This stuff is horrible. I hates it.
1097 lex_string_tok :: Action
1098 lex_string_tok span _buf _len = do
1099 tok <- lex_string ""
1101 return (L (mkSrcSpan (srcSpanStart span) end) tok)
1103 lex_string :: String -> P Token
1106 case alexGetChar' i of
1107 Nothing -> lit_error
1111 magicHash <- extension magicHashEnabled
1115 case alexGetChar' i of
1119 then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
1120 else let s' = mkZFastString (reverse s) in
1121 return (ITprimstring s')
1122 -- mkZFastString is a hack to avoid encoding the
1123 -- string in UTF-8. We just want the exact bytes.
1125 return (ITstring (mkFastString (reverse s)))
1127 return (ITstring (mkFastString (reverse s)))
1130 | Just ('&',i) <- next -> do
1131 setInput i; lex_string s
1132 | Just (c,i) <- next, is_space c -> do
1133 setInput i; lex_stringgap s
1134 where next = alexGetChar' i
1140 lex_stringgap s = do
1143 '\\' -> lex_string s
1144 c | is_space c -> lex_stringgap s
1148 lex_char_tok :: Action
1149 -- Here we are basically parsing character literals, such as 'x' or '\n'
1150 -- but, when Template Haskell is on, we additionally spot
1151 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
1152 -- but WIHTOUT CONSUMING the x or T part (the parser does that).
1153 -- So we have to do two characters of lookahead: when we see 'x we need to
1154 -- see if there's a trailing quote
1155 lex_char_tok span _buf _len = do -- We've seen '
1156 i1 <- getInput -- Look ahead to first character
1157 let loc = srcSpanStart span
1158 case alexGetChar' i1 of
1159 Nothing -> lit_error
1161 Just ('\'', i2@(AI end2 _ _)) -> do -- We've seen ''
1162 th_exts <- extension thEnabled
1165 return (L (mkSrcSpan loc end2) ITtyQuote)
1168 Just ('\\', i2@(AI _end2 _ _)) -> do -- We've seen 'backslash
1170 lit_ch <- lex_escape
1171 mc <- getCharOrFail -- Trailing quote
1172 if mc == '\'' then finish_char_tok loc lit_ch
1173 else do setInput i2; lit_error
1175 Just (c, i2@(AI _end2 _ _))
1176 | not (isAny c) -> lit_error
1179 -- We've seen 'x, where x is a valid character
1180 -- (i.e. not newline etc) but not a quote or backslash
1181 case alexGetChar' i2 of -- Look ahead one more character
1182 Nothing -> lit_error
1183 Just ('\'', i3) -> do -- We've seen 'x'
1185 finish_char_tok loc c
1186 _other -> do -- We've seen 'x not followed by quote
1187 -- If TH is on, just parse the quote only
1188 th_exts <- extension thEnabled
1189 let (AI end _ _) = i1
1190 if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1191 else do setInput i2; lit_error
1193 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1194 finish_char_tok loc ch -- We've already seen the closing quote
1195 -- Just need to check for trailing #
1196 = do magicHash <- extension magicHashEnabled
1197 i@(AI end _ _) <- getInput
1198 if magicHash then do
1199 case alexGetChar' i of
1200 Just ('#',i@(AI end _ _)) -> do
1202 return (L (mkSrcSpan loc end) (ITprimchar ch))
1204 return (L (mkSrcSpan loc end) (ITchar ch))
1206 return (L (mkSrcSpan loc end) (ITchar ch))
1208 lex_char :: Char -> AlexInput -> P Char
1211 '\\' -> do setInput inp; lex_escape
1212 c | isAny c -> do setInput inp; return c
1215 isAny c | c > '\xff' = isPrint c
1216 | otherwise = is_any c
1218 lex_escape :: P Char
1232 '^' -> do c <- getCharOrFail
1233 if c >= '@' && c <= '_'
1234 then return (chr (ord c - ord '@'))
1237 'x' -> readNum is_hexdigit 16 hexDigit
1238 'o' -> readNum is_octdigit 8 octDecDigit
1239 x | is_digit x -> readNum2 is_digit 10 octDecDigit (octDecDigit x)
1243 case alexGetChar' i of
1244 Nothing -> lit_error
1246 case alexGetChar' i2 of
1247 Nothing -> do setInput i2; lit_error
1249 let str = [c1,c2,c3] in
1250 case [ (c,rest) | (p,c) <- silly_escape_chars,
1251 Just rest <- [maybePrefixMatch p str] ] of
1252 (escape_char,[]):_ -> do
1255 (escape_char,_:_):_ -> do
1260 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1261 readNum is_digit base conv = do
1265 then readNum2 is_digit base conv (conv c)
1266 else do setInput i; lit_error
1268 readNum2 is_digit base conv i = do
1271 where read i input = do
1272 case alexGetChar' input of
1273 Just (c,input') | is_digit c -> do
1274 read (i*base + conv c) input'
1276 if i >= 0 && i <= 0x10FFFF
1277 then do setInput input; return (chr i)
1280 silly_escape_chars = [
1317 -- before calling lit_error, ensure that the current input is pointing to
1318 -- the position of the error in the buffer. This is so that we can report
1319 -- a correct location to the user, but also so we can detect UTF-8 decoding
1320 -- errors if they occur.
1321 lit_error = lexError "lexical error in string/character literal"
1323 getCharOrFail :: P Char
1326 case alexGetChar' i of
1327 Nothing -> lexError "unexpected end-of-file in string/character literal"
1328 Just (c,i) -> do setInput i; return c
1330 -- -----------------------------------------------------------------------------
1333 lex_quasiquote_tok :: Action
1334 lex_quasiquote_tok span buf len = do
1335 let quoter = reverse $ takeWhile (/= '$')
1336 $ reverse $ lexemeToString buf (len - 1)
1337 quoteStart <- getSrcLoc
1338 quote <- lex_quasiquote ""
1340 return (L (mkSrcSpan (srcSpanStart span) end)
1341 (ITquasiQuote (mkFastString quoter,
1342 mkFastString (reverse quote),
1343 mkSrcSpan quoteStart end)))
1345 lex_quasiquote :: String -> P String
1346 lex_quasiquote s = do
1348 case alexGetChar' i of
1349 Nothing -> lit_error
1352 | Just ('|',i) <- next -> do
1353 setInput i; lex_quasiquote ('|' : s)
1354 | Just (']',i) <- next -> do
1355 setInput i; lex_quasiquote (']' : s)
1356 where next = alexGetChar' i
1359 | Just (']',i) <- next -> do
1360 setInput i; return s
1361 where next = alexGetChar' i
1364 setInput i; lex_quasiquote (c : s)
1366 -- -----------------------------------------------------------------------------
1369 warn :: DynFlag -> SDoc -> Action
1370 warn option warning srcspan _buf _len = do
1371 addWarning option srcspan warning
1374 -- -----------------------------------------------------------------------------
1385 SrcSpan -- The start and end of the text span related to
1386 -- the error. Might be used in environments which can
1387 -- show this span, e.g. by highlighting it.
1388 Message -- The error message
1390 data PState = PState {
1391 buffer :: StringBuffer,
1393 messages :: Messages,
1394 last_loc :: SrcSpan, -- pos of previous token
1395 last_offs :: !Int, -- offset of the previous token from the
1396 -- beginning of the current line.
1397 -- \t is equal to 8 spaces.
1398 last_len :: !Int, -- len of previous token
1399 last_line_len :: !Int,
1400 loc :: SrcLoc, -- current loc (end of prev token + 1)
1401 extsBitmap :: !Int, -- bitmap that determines permitted extensions
1402 context :: [LayoutContext],
1405 -- last_loc and last_len are used when generating error messages,
1406 -- and in pushCurrentContext only. Sigh, if only Happy passed the
1407 -- current token to happyError, we could at least get rid of last_len.
1408 -- Getting rid of last_loc would require finding another way to
1409 -- implement pushCurrentContext (which is only called from one place).
1411 newtype P a = P { unP :: PState -> ParseResult a }
1413 instance Monad P where
1419 returnP a = a `seq` (P $ \s -> POk s a)
1421 thenP :: P a -> (a -> P b) -> P b
1422 (P m) `thenP` k = P $ \ s ->
1424 POk s1 a -> (unP (k a)) s1
1425 PFailed span err -> PFailed span err
1427 failP :: String -> P a
1428 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1430 failMsgP :: String -> P a
1431 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1433 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1434 failLocMsgP loc1 loc2 str = P $ \_ -> PFailed (mkSrcSpan loc1 loc2) (text str)
1436 failSpanMsgP :: SrcSpan -> SDoc -> P a
1437 failSpanMsgP span msg = P $ \_ -> PFailed span msg
1439 extension :: (Int -> Bool) -> P Bool
1440 extension p = P $ \s -> POk s (p $! extsBitmap s)
1443 getExts = P $ \s -> POk s (extsBitmap s)
1445 setSrcLoc :: SrcLoc -> P ()
1446 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1448 getSrcLoc :: P SrcLoc
1449 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1451 setLastToken :: SrcSpan -> Int -> Int -> P ()
1452 setLastToken loc len line_len = P $ \s -> POk s {
1455 last_line_len=line_len
1458 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1460 alexInputPrevChar :: AlexInput -> Char
1461 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1463 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1464 alexGetChar (AI loc ofs s)
1466 | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1467 --trace (show (ord c)) $
1468 Just (adj_c, (AI loc' ofs' s'))
1469 where (c,s') = nextChar s
1470 loc' = advanceSrcLoc loc c
1471 ofs' = advanceOffs c ofs
1479 other_graphic = '\x6'
1482 | c <= '\x06' = non_graphic
1484 -- Alex doesn't handle Unicode, so when Unicode
1485 -- character is encoutered we output these values
1486 -- with the actual character value hidden in the state.
1488 case generalCategory c of
1489 UppercaseLetter -> upper
1490 LowercaseLetter -> lower
1491 TitlecaseLetter -> upper
1492 ModifierLetter -> other_graphic
1493 OtherLetter -> other_graphic
1494 NonSpacingMark -> other_graphic
1495 SpacingCombiningMark -> other_graphic
1496 EnclosingMark -> other_graphic
1497 DecimalNumber -> digit
1498 LetterNumber -> other_graphic
1499 OtherNumber -> other_graphic
1500 ConnectorPunctuation -> other_graphic
1501 DashPunctuation -> other_graphic
1502 OpenPunctuation -> other_graphic
1503 ClosePunctuation -> other_graphic
1504 InitialQuote -> other_graphic
1505 FinalQuote -> other_graphic
1506 OtherPunctuation -> other_graphic
1507 MathSymbol -> symbol
1508 CurrencySymbol -> symbol
1509 ModifierSymbol -> symbol
1510 OtherSymbol -> symbol
1512 _other -> non_graphic
1514 -- This version does not squash unicode characters, it is used when
1516 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1517 alexGetChar' (AI loc ofs s)
1519 | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1520 --trace (show (ord c)) $
1521 Just (c, (AI loc' ofs' s'))
1522 where (c,s') = nextChar s
1523 loc' = advanceSrcLoc loc c
1524 ofs' = advanceOffs c ofs
1526 advanceOffs :: Char -> Int -> Int
1527 advanceOffs '\n' _ = 0
1528 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1529 advanceOffs _ offs = offs + 1
1531 getInput :: P AlexInput
1532 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1534 setInput :: AlexInput -> P ()
1535 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1537 pushLexState :: Int -> P ()
1538 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1540 popLexState :: P Int
1541 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1543 getLexState :: P Int
1544 getLexState = P $ \s@PState{ lex_state=ls:_ } -> POk s ls
1546 -- for reasons of efficiency, flags indicating language extensions (eg,
1547 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1550 genericsBit, ffiBit, parrBit :: Int
1551 genericsBit = 0 -- {| and |}
1557 explicitForallBit = 7 -- the 'forall' keyword and '.' symbol
1558 bangPatBit = 8 -- Tells the parser to understand bang-patterns
1559 -- (doesn't affect the lexer)
1560 tyFamBit = 9 -- indexed type families: 'family' keyword and kind sigs
1561 haddockBit = 10 -- Lex and parse Haddock comments
1562 magicHashBit = 11 -- # in both functions and operators
1563 kindSigsBit = 12 -- Kind signatures on type variables
1564 recursiveDoBit = 13 -- mdo
1565 unicodeSyntaxBit = 14 -- the forall symbol, arrow symbols, etc
1566 unboxedTuplesBit = 15 -- (# and #)
1567 standaloneDerivingBit = 16 -- standalone instance deriving declarations
1568 transformComprehensionsBit = 17
1569 qqBit = 18 -- enable quasiquoting
1571 genericsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
1573 genericsEnabled flags = testBit flags genericsBit
1574 ffiEnabled flags = testBit flags ffiBit
1575 parrEnabled flags = testBit flags parrBit
1576 arrowsEnabled flags = testBit flags arrowsBit
1577 thEnabled flags = testBit flags thBit
1578 ipEnabled flags = testBit flags ipBit
1579 explicitForallEnabled flags = testBit flags explicitForallBit
1580 bangPatEnabled flags = testBit flags bangPatBit
1581 tyFamEnabled flags = testBit flags tyFamBit
1582 haddockEnabled flags = testBit flags haddockBit
1583 magicHashEnabled flags = testBit flags magicHashBit
1584 kindSigsEnabled flags = testBit flags kindSigsBit
1585 recursiveDoEnabled flags = testBit flags recursiveDoBit
1586 unicodeSyntaxEnabled flags = testBit flags unicodeSyntaxBit
1587 unboxedTuplesEnabled flags = testBit flags unboxedTuplesBit
1588 standaloneDerivingEnabled flags = testBit flags standaloneDerivingBit
1589 transformComprehensionsEnabled flags = testBit flags transformComprehensionsBit
1590 qqEnabled flags = testBit flags qqBit
1592 -- PState for parsing options pragmas
1594 pragState :: StringBuffer -> SrcLoc -> PState
1598 messages = emptyMessages,
1599 -- XXX defaultDynFlags is not right, but we don't have a real
1601 dflags = defaultDynFlags,
1602 last_loc = mkSrcSpan loc loc,
1609 lex_state = [bol, option_prags, 0]
1613 -- create a parse state
1615 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1616 mkPState buf loc flags =
1620 messages = emptyMessages,
1621 last_loc = mkSrcSpan loc loc,
1626 extsBitmap = fromIntegral bitmap,
1628 lex_state = [bol, 0]
1629 -- we begin in the layout state if toplev_layout is set
1632 bitmap = genericsBit `setBitIf` dopt Opt_Generics flags
1633 .|. ffiBit `setBitIf` dopt Opt_ForeignFunctionInterface flags
1634 .|. parrBit `setBitIf` dopt Opt_PArr flags
1635 .|. arrowsBit `setBitIf` dopt Opt_Arrows flags
1636 .|. thBit `setBitIf` dopt Opt_TemplateHaskell flags
1637 .|. qqBit `setBitIf` dopt Opt_QuasiQuotes flags
1638 .|. ipBit `setBitIf` dopt Opt_ImplicitParams flags
1639 .|. explicitForallBit `setBitIf` dopt Opt_ScopedTypeVariables flags
1640 .|. explicitForallBit `setBitIf` dopt Opt_PolymorphicComponents flags
1641 .|. explicitForallBit `setBitIf` dopt Opt_ExistentialQuantification flags
1642 .|. explicitForallBit `setBitIf` dopt Opt_Rank2Types flags
1643 .|. explicitForallBit `setBitIf` dopt Opt_RankNTypes flags
1644 .|. bangPatBit `setBitIf` dopt Opt_BangPatterns flags
1645 .|. tyFamBit `setBitIf` dopt Opt_TypeFamilies flags
1646 .|. haddockBit `setBitIf` dopt Opt_Haddock flags
1647 .|. magicHashBit `setBitIf` dopt Opt_MagicHash flags
1648 .|. kindSigsBit `setBitIf` dopt Opt_KindSignatures flags
1649 .|. recursiveDoBit `setBitIf` dopt Opt_RecursiveDo flags
1650 .|. unicodeSyntaxBit `setBitIf` dopt Opt_UnicodeSyntax flags
1651 .|. unboxedTuplesBit `setBitIf` dopt Opt_UnboxedTuples flags
1652 .|. standaloneDerivingBit `setBitIf` dopt Opt_StandaloneDeriving flags
1653 .|. transformComprehensionsBit `setBitIf` dopt Opt_TransformListComp flags
1655 setBitIf :: Int -> Bool -> Int
1656 b `setBitIf` cond | cond = bit b
1659 addWarning :: DynFlag -> SrcSpan -> SDoc -> P ()
1660 addWarning option srcspan warning
1661 = P $ \s@PState{messages=(ws,es), dflags=d} ->
1662 let warning' = mkWarnMsg srcspan alwaysQualify warning
1663 ws' = if dopt option d then ws `snocBag` warning' else ws
1664 in POk s{messages=(ws', es)} ()
1666 getMessages :: PState -> Messages
1667 getMessages PState{messages=ms} = ms
1669 getContext :: P [LayoutContext]
1670 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1672 setContext :: [LayoutContext] -> P ()
1673 setContext ctx = P $ \s -> POk s{context=ctx} ()
1676 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1677 last_len = len, last_loc = last_loc }) ->
1679 (_:tl) -> POk s{ context = tl } ()
1680 [] -> PFailed last_loc (srcParseErr buf len)
1682 -- Push a new layout context at the indentation of the last token read.
1683 -- This is only used at the outer level of a module when the 'module'
1684 -- keyword is missing.
1685 pushCurrentContext :: P ()
1686 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_line_len=len, context=ctx } ->
1687 POk s{context = Layout (offs-len) : ctx} ()
1688 --trace ("off: " ++ show offs ++ ", len: " ++ show len) $ POk s{context = Layout (offs-len) : ctx} ()
1690 getOffside :: P Ordering
1691 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1692 let ord = case stk of
1693 (Layout n:_) -> compare offs n
1697 -- ---------------------------------------------------------------------------
1698 -- Construct a parse error
1701 :: StringBuffer -- current buffer (placed just after the last token)
1702 -> Int -- length of the previous token
1705 = hcat [ if null token
1706 then ptext (sLit "parse error (possibly incorrect indentation)")
1707 else hcat [ptext (sLit "parse error on input "),
1708 char '`', text token, char '\'']
1710 where token = lexemeToString (offsetBytes (-len) buf) len
1712 -- Report a parse failure, giving the span of the previous token as
1713 -- the location of the error. This is the entry point for errors
1714 -- detected during parsing.
1716 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1717 last_loc = last_loc } ->
1718 PFailed last_loc (srcParseErr buf len)
1720 -- A lexical error is reported at a particular position in the source file,
1721 -- not over a token range.
1722 lexError :: String -> P a
1725 (AI end _ buf) <- getInput
1726 reportLexError loc end buf str
1728 -- -----------------------------------------------------------------------------
1729 -- This is the top-level function: called from the parser each time a
1730 -- new token is to be read from the input.
1732 lexer :: (Located Token -> P a) -> P a
1734 tok@(L _span _tok__) <- lexToken
1735 -- trace ("token: " ++ show tok__) $ do
1738 lexToken :: P (Located Token)
1740 inp@(AI loc1 _ buf) <- getInput
1743 case alexScanUser exts inp sc of
1745 let span = mkSrcSpan loc1 loc1
1746 setLastToken span 0 0
1747 return (L span ITeof)
1748 AlexError (AI loc2 _ buf) ->
1749 reportLexError loc1 loc2 buf "lexical error"
1750 AlexSkip inp2 _ -> do
1753 AlexToken inp2@(AI end _ buf2) _ t -> do
1755 let span = mkSrcSpan loc1 end
1756 let bytes = byteDiff buf buf2
1757 span `seq` setLastToken span bytes bytes
1760 reportLexError loc1 loc2 buf str
1761 | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
1764 c = fst (nextChar buf)
1766 if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
1767 then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
1768 else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)