1 -----------------------------------------------------------------------------
2 -- (c) The University of Glasgow, 2003
6 -- This is a combination of an Alex-generated lexer from a regex
7 -- definition, with some hand-coded bits.
9 -- Completely accurate information about token-spans within the source
10 -- file is maintained. Every token has a start and end SrcLoc attached to it.
12 -----------------------------------------------------------------------------
16 -- - parsing integers is a bit slow
17 -- - readRational is a bit slow
19 -- Known bugs, that were also in the previous version:
20 -- - M... should be 3 tokens, not 1.
21 -- - pragma-end should be only valid in a pragma
25 Token(..), lexer, mkPState, PState(..),
26 P(..), ParseResult(..), getSrcLoc,
27 failLocMsgP, failSpanMsgP, srcParseFail,
28 popContext, pushCurrentContext, setLastToken, setSrcLoc,
29 getLexState, popLexState, pushLexState
32 #include "HsVersions.h"
34 import ErrUtils ( Message )
43 import Util ( maybePrefixMatch, readRational )
51 $whitechar = [\ \t\n\r\f\v\xa0]
52 $white_no_nl = $whitechar # \n
56 $digit = [$ascdigit $unidigit]
58 $special = [\(\)\,\;\[\]\`\{\}]
59 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~]
61 $symbol = [$ascsymbol $unisymbol] # [$special \_\:\"\']
64 $asclarge = [A-Z \xc0-\xd6 \xd8-\xde]
65 $large = [$asclarge $unilarge]
68 $ascsmall = [a-z \xdf-\xf6 \xf8-\xff]
69 $small = [$ascsmall $unismall \_]
71 $graphic = [$small $large $symbol $digit $special \:\"\']
74 $hexit = [$digit A-F a-f]
75 $symchar = [$symbol \:]
77 $idchar = [$small $large $digit \']
79 @varid = $small $idchar*
80 @conid = $large $idchar*
82 @varsym = $symbol $symchar*
83 @consym = \: $symchar*
87 @hexadecimal = $hexit+
88 @exponent = [eE] [\-\+]? @decimal
90 -- we support the hierarchical module name extension:
93 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
97 -- everywhere: skip whitespace and comments
100 -- Everywhere: deal with nested comments. We explicitly rule out
101 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
102 -- (this can happen even though pragmas will normally take precedence due to
103 -- longest-match, because pragmas aren't valid in every state, but comments
105 "{-" / { notFollowedBy '#' } { nested_comment }
107 -- Single-line comments are a bit tricky. Haskell 98 says that two or
108 -- more dashes followed by a symbol should be parsed as a varsym, so we
109 -- have to exclude those.
110 -- The regex says: "munch all the characters after the dashes, as long as
111 -- the first one is not a symbol".
112 "--"\-* [^$symbol :] .* ;
113 "--"\-* / { atEOL } ;
115 -- 'bol' state: beginning of a line. Slurp up all the whitespace (including
116 -- blank lines) until we find a non-whitespace character, then do layout
119 -- One slight wibble here: what if the line begins with {-#? In
120 -- theory, we have to lex the pragma to see if it's one we recognise,
121 -- and if it is, then we backtrack and do_bol, otherwise we treat it
122 -- as a nested comment. We don't bother with this: if the line begins
123 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
126 ^\# (line)? { begin line_prag1 }
127 ^\# pragma .* \n ; -- GCC 3.3 CPP generated, apparently
128 ^\# \! .* \n ; -- #!, for scripts
132 -- after a layout keyword (let, where, do, of), we begin a new layout
133 -- context if the curly brace is missing.
134 -- Careful! This stuff is quite delicate.
135 <layout, layout_do> {
136 \{ / { notFollowedBy '-' } { pop_and open_brace }
137 -- we might encounter {-# here, but {- has been handled already
139 ^\# (line)? { begin line_prag1 }
142 -- do is treated in a subtly different way, see new_layout_context
143 <layout> () { new_layout_context True }
144 <layout_do> () { new_layout_context False }
146 -- after a new layout context which was found to be to the left of the
147 -- previous context, we have generated a '{' token, and we now need to
148 -- generate a matching '}' token.
149 <layout_left> () { do_layout_left }
151 <0,glaexts> \n { begin bol }
153 "{-#" $whitechar* (line|LINE) { begin line_prag2 }
155 -- single-line line pragmas, of the form
156 -- # <line> "<file>" <extra-stuff> \n
157 <line_prag1> $digit+ { setLine line_prag1a }
158 <line_prag1a> \" [$graphic \ ]* \" { setFile line_prag1b }
159 <line_prag1b> .* { pop }
161 -- Haskell-style line pragmas, of the form
162 -- {-# LINE <line> "<file>" #-}
163 <line_prag2> $digit+ { setLine line_prag2a }
164 <line_prag2a> \" [$graphic \ ]* \" { setFile line_prag2b }
165 <line_prag2b> "#-}"|"-}" { pop }
166 -- NOTE: accept -} at the end of a LINE pragma, for compatibility
167 -- with older versions of GHC which generated these.
169 -- We only want RULES pragmas to be picked up when -fglasgow-exts
170 -- is on, because the contents of the pragma is always written using
171 -- glasgow-exts syntax (using forall etc.), so if glasgow exts are not
172 -- enabled, we're sure to get a parse error.
173 -- (ToDo: we should really emit a warning when ignoring pragmas)
175 "{-#" $whitechar* (RULES|rules) { token ITrules_prag }
178 "{-#" $whitechar* (INLINE|inline) { token (ITinline_prag True) }
179 "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
180 { token (ITinline_prag False) }
181 "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
182 { token ITspec_prag }
183 "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
184 $whitechar* (INLINE|inline) { token (ITspec_inline_prag True) }
185 "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
186 $whitechar* (NO(T?)INLINE|no(t?)inline)
187 { token (ITspec_inline_prag False) }
188 "{-#" $whitechar* (SOURCE|source) { token ITsource_prag }
189 "{-#" $whitechar* (DEPRECATED|deprecated)
190 { token ITdeprecated_prag }
191 "{-#" $whitechar* (SCC|scc) { token ITscc_prag }
192 "{-#" $whitechar* (CORE|core) { token ITcore_prag }
193 "{-#" $whitechar* (UNPACK|unpack) { token ITunpack_prag }
195 "{-#" { nested_comment }
197 -- ToDo: should only be valid inside a pragma:
198 "#-}" { token ITclose_prag}
202 -- '0' state: ordinary lexemes
203 -- 'glaexts' state: glasgow extensions (postfix '#', etc.)
208 "[:" / { ifExtension parrEnabled } { token ITopabrack }
209 ":]" / { ifExtension parrEnabled } { token ITcpabrack }
213 "[|" / { ifExtension thEnabled } { token ITopenExpQuote }
214 "[e|" / { ifExtension thEnabled } { token ITopenExpQuote }
215 "[p|" / { ifExtension thEnabled } { token ITopenPatQuote }
216 "[d|" / { ifExtension thEnabled } { layout_token ITopenDecQuote }
217 "[t|" / { ifExtension thEnabled } { token ITopenTypQuote }
218 "|]" / { ifExtension thEnabled } { token ITcloseQuote }
219 \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
220 "$(" / { ifExtension thEnabled } { token ITparenEscape }
224 "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
225 { special IToparenbar }
226 "|)" / { ifExtension arrowsEnabled } { special ITcparenbar }
230 \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
231 \% @varid / { ifExtension ipEnabled } { skip_one_varid ITsplitipvarid }
235 "(#" / { notFollowedBySymbol } { token IToubxparen }
236 "#)" { token ITcubxparen }
237 "{|" { token ITocurlybar }
238 "|}" { token ITccurlybar }
242 \( { special IToparen }
243 \) { special ITcparen }
244 \[ { special ITobrack }
245 \] { special ITcbrack }
246 \, { special ITcomma }
247 \; { special ITsemi }
248 \` { special ITbackquote }
255 @qual @varid { check_qvarid }
256 @qual @conid { idtoken qconid }
258 @conid { idtoken conid }
261 -- after an illegal qvarid, such as 'M.let',
262 -- we back up and try again in the bad_qvarid state:
264 @conid { pop_and (idtoken conid) }
265 @qual @conid { pop_and (idtoken qconid) }
269 @qual @varid "#"+ { idtoken qvarid }
270 @qual @conid "#"+ { idtoken qconid }
271 @varid "#"+ { varid }
272 @conid "#"+ { idtoken conid }
278 @qual @varsym { idtoken qvarsym }
279 @qual @consym { idtoken qconsym }
285 @decimal { tok_decimal }
286 0[oO] @octal { tok_octal }
287 0[xX] @hexadecimal { tok_hexadecimal }
291 @decimal \# { prim_decimal }
292 0[oO] @octal \# { prim_octal }
293 0[xX] @hexadecimal \# { prim_hexadecimal }
296 <0,glaexts> @floating_point { strtoken tok_float }
297 <glaexts> @floating_point \# { init_strtoken 1 prim_float }
298 <glaexts> @floating_point \# \# { init_strtoken 2 prim_double }
300 -- Strings and chars are lexed by hand-written code. The reason is
301 -- that even if we recognise the string or char here in the regex
302 -- lexer, we would still have to parse the string afterward in order
303 -- to convert it to a String.
306 \" { lex_string_tok }
310 -- work around bug in Alex 2.0
311 #if __GLASGOW_HASKELL__ < 503
312 unsafeAt arr i = arr ! i
315 -- -----------------------------------------------------------------------------
319 = ITas -- Haskell keywords
343 | ITscc -- ToDo: remove (we use {-# SCC "..." #-} now)
345 | ITforall -- GHC extension keywords
359 | ITinline_prag Bool -- True <=> INLINE, False <=> NOINLINE
360 | ITspec_prag -- SPECIALISE
361 | ITspec_inline_prag Bool -- SPECIALISE INLINE (or NOINLINE)
367 | ITcore_prag -- hdaume: core annotations
371 | ITdotdot -- reserved symbols
387 | ITbiglam -- GHC-extension symbols
389 | ITocurly -- special symbols
391 | ITocurlybar -- {|, for type applications
392 | ITccurlybar -- |}, for type applications
396 | ITopabrack -- [:, for parallel arrays with -fparr
397 | ITcpabrack -- :], for parallel arrays with -fparr
408 | ITvarid FastString -- identifiers
410 | ITvarsym FastString
411 | ITconsym FastString
412 | ITqvarid (FastString,FastString)
413 | ITqconid (FastString,FastString)
414 | ITqvarsym (FastString,FastString)
415 | ITqconsym (FastString,FastString)
417 | ITdupipvarid FastString -- GHC extension: implicit param: ?x
418 | ITsplitipvarid FastString -- GHC extension: implicit param: %x
420 | ITpragma StringBuffer
423 | ITstring FastString
425 | ITrational Rational
428 | ITprimstring FastString
430 | ITprimfloat Rational
431 | ITprimdouble Rational
433 -- MetaHaskell extension tokens
434 | ITopenExpQuote -- [| or [e|
435 | ITopenPatQuote -- [p|
436 | ITopenDecQuote -- [d|
437 | ITopenTypQuote -- [t|
439 | ITidEscape FastString -- $x
440 | ITparenEscape -- $(
444 -- Arrow notation extension
451 | ITLarrowtail -- -<<
452 | ITRarrowtail -- >>-
454 | ITunknown String -- Used when the lexer can't make sense of it
455 | ITeof -- end of file token
457 deriving Show -- debugging
460 isSpecial :: Token -> Bool
461 -- If we see M.x, where x is a keyword, but
462 -- is special, we treat is as just plain M.x,
464 isSpecial ITas = True
465 isSpecial IThiding = True
466 isSpecial ITqualified = True
467 isSpecial ITforall = True
468 isSpecial ITexport = True
469 isSpecial ITlabel = True
470 isSpecial ITdynamic = True
471 isSpecial ITsafe = True
472 isSpecial ITthreadsafe = True
473 isSpecial ITunsafe = True
474 isSpecial ITccallconv = True
475 isSpecial ITstdcallconv = True
476 isSpecial ITmdo = True
479 -- the bitmap provided as the third component indicates whether the
480 -- corresponding extension keyword is valid under the extension options
481 -- provided to the compiler; if the extension corresponding to *any* of the
482 -- bits set in the bitmap is enabled, the keyword is valid (this setup
483 -- facilitates using a keyword in two different extensions that can be
484 -- activated independently)
486 reservedWordsFM = listToUFM $
487 map (\(x, y, z) -> (mkFastString x, (y, z)))
488 [( "_", ITunderscore, 0 ),
490 ( "case", ITcase, 0 ),
491 ( "class", ITclass, 0 ),
492 ( "data", ITdata, 0 ),
493 ( "default", ITdefault, 0 ),
494 ( "deriving", ITderiving, 0 ),
496 ( "else", ITelse, 0 ),
497 ( "hiding", IThiding, 0 ),
499 ( "import", ITimport, 0 ),
501 ( "infix", ITinfix, 0 ),
502 ( "infixl", ITinfixl, 0 ),
503 ( "infixr", ITinfixr, 0 ),
504 ( "instance", ITinstance, 0 ),
506 ( "module", ITmodule, 0 ),
507 ( "newtype", ITnewtype, 0 ),
509 ( "qualified", ITqualified, 0 ),
510 ( "then", ITthen, 0 ),
511 ( "type", ITtype, 0 ),
512 ( "where", ITwhere, 0 ),
513 ( "_scc_", ITscc, 0 ), -- ToDo: remove
515 ( "forall", ITforall, bit tvBit),
516 ( "mdo", ITmdo, bit glaExtsBit),
518 ( "foreign", ITforeign, bit ffiBit),
519 ( "export", ITexport, bit ffiBit),
520 ( "label", ITlabel, bit ffiBit),
521 ( "dynamic", ITdynamic, bit ffiBit),
522 ( "safe", ITsafe, bit ffiBit),
523 ( "threadsafe", ITthreadsafe, bit ffiBit),
524 ( "unsafe", ITunsafe, bit ffiBit),
525 ( "stdcall", ITstdcallconv, bit ffiBit),
526 ( "ccall", ITccallconv, bit ffiBit),
527 ( "dotnet", ITdotnet, bit ffiBit),
529 ( "rec", ITrec, bit arrowsBit),
530 ( "proc", ITproc, bit arrowsBit)
533 reservedSymsFM = listToUFM $
534 map (\ (x,y,z) -> (mkFastString x,(y,z)))
535 [ ("..", ITdotdot, 0)
536 ,(":", ITcolon, 0) -- (:) is a reserved op,
537 -- meaning only list cons
550 ,("*", ITstar, bit glaExtsBit) -- For data T (a::*) = MkT
551 ,(".", ITdot, bit tvBit) -- For 'forall a . t'
553 ,("-<", ITlarrowtail, bit arrowsBit)
554 ,(">-", ITrarrowtail, bit arrowsBit)
555 ,("-<<", ITLarrowtail, bit arrowsBit)
556 ,(">>-", ITRarrowtail, bit arrowsBit)
559 -- -----------------------------------------------------------------------------
562 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
564 special :: Token -> Action
565 special tok span _buf len = return (L span tok)
567 token, layout_token :: Token -> Action
568 token t span buf len = return (L span t)
569 layout_token t span buf len = pushLexState layout >> return (L span t)
571 idtoken :: (StringBuffer -> Int -> Token) -> Action
572 idtoken f span buf len = return (L span $! (f buf len))
574 skip_one_varid :: (FastString -> Token) -> Action
575 skip_one_varid f span buf len
576 = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
578 strtoken :: (String -> Token) -> Action
579 strtoken f span buf len =
580 return (L span $! (f $! lexemeToString buf len))
582 init_strtoken :: Int -> (String -> Token) -> Action
583 -- like strtoken, but drops the last N character(s)
584 init_strtoken drop f span buf len =
585 return (L span $! (f $! lexemeToString buf (len-drop)))
587 begin :: Int -> Action
588 begin code _span _str _len = do pushLexState code; lexToken
591 pop _span _buf _len = do popLexState; lexToken
593 pop_and :: Action -> Action
594 pop_and act span buf len = do popLexState; act span buf len
596 notFollowedBy char _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf /= char
598 notFollowedBySymbol _ _ _ (AI _ _ buf)
599 = atEnd buf || currentChar buf `notElem` "!#$%&*+./<=>?@\\^|-~"
601 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
603 ifExtension pred bits _ _ _ = pred bits
606 nested comments require traversing by hand, they can't be parsed
607 using regular expressions.
609 nested_comment :: Action
610 nested_comment span _str _len = do
613 where go 0 input = do setInput input; lexToken
615 case alexGetChar input of
620 case alexGetChar input of
622 Just ('\125',input) -> go (n-1) input
623 Just (c,_) -> go n input
625 case alexGetChar input of
627 Just ('-',input') -> go (n+1) input'
628 Just (c,input) -> go n input
631 err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
633 open_brace, close_brace :: Action
634 open_brace span _str _len = do
636 setContext (NoLayout:ctx)
637 return (L span ITocurly)
638 close_brace span _str _len = do
640 return (L span ITccurly)
642 -- We have to be careful not to count M.<varid> as a qualified name
643 -- when <varid> is a keyword. We hack around this by catching
644 -- the offending tokens afterward, and re-lexing in a different state.
645 check_qvarid span buf len = do
646 case lookupUFM reservedWordsFM var of
648 | not (isSpecial keyword) ->
652 b <- extension (\i -> exts .&. i /= 0)
655 _other -> return token
657 (mod,var) = splitQualName buf len
658 token = L span (ITqvarid (mod,var))
661 (AI _ offs _) <- getInput
662 setInput (AI (srcSpanStart span) (offs-len) buf)
663 pushLexState bad_qvarid
666 qvarid buf len = ITqvarid $! splitQualName buf len
667 qconid buf len = ITqconid $! splitQualName buf len
669 splitQualName :: StringBuffer -> Int -> (FastString,FastString)
670 -- takes a StringBuffer and a length, and returns the module name
671 -- and identifier parts of a qualified name. Splits at the *last* dot,
672 -- because of hierarchical module names.
673 splitQualName orig_buf len = split orig_buf 0 0
676 | n == len = done dot_off
677 | lookAhead buf n == '.' = split2 buf n (n+1)
678 | otherwise = split buf dot_off (n+1)
680 -- careful, we might get names like M....
681 -- so, if the character after the dot is not upper-case, this is
682 -- the end of the qualifier part.
684 | isUpper (lookAhead buf n) = split buf dot_off (n+1)
685 | otherwise = done dot_off
688 (lexemeToFastString orig_buf dot_off,
689 lexemeToFastString (stepOnBy (dot_off+1) orig_buf) (len - dot_off -1))
692 case lookupUFM reservedWordsFM fs of
693 Just (keyword,0) -> do
695 return (L span keyword)
696 Just (keyword,exts) -> do
697 b <- extension (\i -> exts .&. i /= 0)
698 if b then do maybe_layout keyword
699 return (L span keyword)
700 else return (L span (ITvarid fs))
701 _other -> return (L span (ITvarid fs))
703 fs = lexemeToFastString buf len
705 conid buf len = ITconid fs
706 where fs = lexemeToFastString buf len
708 qvarsym buf len = ITqvarsym $! splitQualName buf len
709 qconsym buf len = ITqconsym $! splitQualName buf len
711 varsym = sym ITvarsym
712 consym = sym ITconsym
714 sym con span buf len =
715 case lookupUFM reservedSymsFM fs of
716 Just (keyword,0) -> return (L span keyword)
717 Just (keyword,exts) -> do
718 b <- extension (\i -> exts .&. i /= 0)
719 if b then return (L span keyword)
720 else return (L span $! con fs)
721 _other -> return (L span $! con fs)
723 fs = lexemeToFastString buf len
725 tok_decimal span buf len
726 = return (L span (ITinteger $! parseInteger buf len 10 octDecDigit))
728 tok_octal span buf len
729 = return (L span (ITinteger $! parseInteger (stepOnBy 2 buf) (len-2) 8 octDecDigit))
731 tok_hexadecimal span buf len
732 = return (L span (ITinteger $! parseInteger (stepOnBy 2 buf) (len-2) 16 hexDigit))
734 prim_decimal span buf len
735 = return (L span (ITprimint $! parseInteger buf (len-1) 10 octDecDigit))
737 prim_octal span buf len
738 = return (L span (ITprimint $! parseInteger (stepOnBy 2 buf) (len-3) 8 octDecDigit))
740 prim_hexadecimal span buf len
741 = return (L span (ITprimint $! parseInteger (stepOnBy 2 buf) (len-3) 16 hexDigit))
743 tok_float str = ITrational $! readRational str
744 prim_float str = ITprimfloat $! readRational str
745 prim_double str = ITprimdouble $! readRational str
747 -- -----------------------------------------------------------------------------
750 -- we're at the first token on a line, insert layout tokens if necessary
752 do_bol span _str _len = do
756 --trace "layout: inserting '}'" $ do
758 -- do NOT pop the lex state, we might have a ';' to insert
759 return (L span ITvccurly)
761 --trace "layout: inserting ';'" $ do
763 return (L span ITsemi)
768 -- certain keywords put us in the "layout" state, where we might
769 -- add an opening curly brace.
770 maybe_layout ITdo = pushLexState layout_do
771 maybe_layout ITmdo = pushLexState layout_do
772 maybe_layout ITof = pushLexState layout
773 maybe_layout ITlet = pushLexState layout
774 maybe_layout ITwhere = pushLexState layout
775 maybe_layout ITrec = pushLexState layout
776 maybe_layout _ = return ()
778 -- Pushing a new implicit layout context. If the indentation of the
779 -- next token is not greater than the previous layout context, then
780 -- Haskell 98 says that the new layout context should be empty; that is
781 -- the lexer must generate {}.
783 -- We are slightly more lenient than this: when the new context is started
784 -- by a 'do', then we allow the new context to be at the same indentation as
785 -- the previous context. This is what the 'strict' argument is for.
787 new_layout_context strict span _buf _len = do
789 (AI _ offset _) <- getInput
792 Layout prev_off : _ |
793 (strict && prev_off >= offset ||
794 not strict && prev_off > offset) -> do
795 -- token is indented to the left of the previous context.
796 -- we must generate a {} sequence now.
797 pushLexState layout_left
798 return (L span ITvocurly)
800 setContext (Layout offset : ctx)
801 return (L span ITvocurly)
803 do_layout_left span _buf _len = do
805 pushLexState bol -- we must be at the start of a line
806 return (L span ITvccurly)
808 -- -----------------------------------------------------------------------------
811 setLine :: Int -> Action
812 setLine code span buf len = do
813 let line = parseInteger buf len 10 octDecDigit
814 setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
815 -- subtract one: the line number refers to the *following* line
820 setFile :: Int -> Action
821 setFile code span buf len = do
822 let file = lexemeToFastString (stepOn buf) (len-2)
823 setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
828 -- -----------------------------------------------------------------------------
831 -- This stuff is horrible. I hates it.
833 lex_string_tok :: Action
834 lex_string_tok span buf len = do
837 return (L (mkSrcSpan (srcSpanStart span) end) tok)
839 lex_string :: String -> P Token
842 case alexGetChar i of
847 glaexts <- extension glaExtsEnabled
851 case alexGetChar i of
855 then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
856 else let s' = mkFastStringNarrow (reverse s) in
857 -- always a narrow string/byte array
858 return (ITprimstring s')
860 return (ITstring (mkFastString (reverse s)))
862 return (ITstring (mkFastString (reverse s)))
865 | Just ('&',i) <- next -> do
866 setInput i; lex_string s
867 | Just (c,i) <- next, is_space c -> do
868 setInput i; lex_stringgap s
869 where next = alexGetChar i
879 c | is_space c -> lex_stringgap s
883 lex_char_tok :: Action
884 -- Here we are basically parsing character literals, such as 'x' or '\n'
885 -- but, when Template Haskell is on, we additionally spot
886 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
887 -- but WIHTOUT CONSUMING the x or T part (the parser does that).
888 -- So we have to do two characters of lookahead: when we see 'x we need to
889 -- see if there's a trailing quote
890 lex_char_tok span buf len = do -- We've seen '
891 i1 <- getInput -- Look ahead to first character
892 let loc = srcSpanStart span
893 case alexGetChar i1 of
896 Just ('\'', i2@(AI end2 _ _)) -> do -- We've seen ''
897 th_exts <- extension thEnabled
900 return (L (mkSrcSpan loc end2) ITtyQuote)
903 Just ('\\', i2@(AI end2 _ _)) -> do -- We've seen 'backslash
906 mc <- getCharOrFail -- Trailing quote
907 if mc == '\'' then finish_char_tok loc lit_ch
910 Just (c, i2@(AI end2 _ _)) | not (is_any c) -> lit_error
913 -- We've seen 'x, where x is a valid character
914 -- (i.e. not newline etc) but not a quote or backslash
915 case alexGetChar i2 of -- Look ahead one more character
917 Just ('\'', i3) -> do -- We've seen 'x'
919 finish_char_tok loc c
920 _other -> do -- We've seen 'x not followed by quote
921 -- If TH is on, just parse the quote only
922 th_exts <- extension thEnabled
923 let (AI end _ _) = i1
924 if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
927 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
928 finish_char_tok loc ch -- We've already seen the closing quote
929 -- Just need to check for trailing #
930 = do glaexts <- extension glaExtsEnabled
931 i@(AI end _ _) <- getInput
933 case alexGetChar i of
934 Just ('#',i@(AI end _ _)) -> do
936 return (L (mkSrcSpan loc end) (ITprimchar ch))
938 return (L (mkSrcSpan loc end) (ITchar ch))
940 return (L (mkSrcSpan loc end) (ITchar ch))
947 c | is_any c -> return c
964 '^' -> do c <- getCharOrFail
965 if c >= '@' && c <= '_'
966 then return (chr (ord c - ord '@'))
969 'x' -> readNum is_hexdigit 16 hexDigit
970 'o' -> readNum is_octdigit 8 octDecDigit
971 x | is_digit x -> readNum2 is_digit 10 octDecDigit (octDecDigit x)
975 case alexGetChar i of
978 case alexGetChar i2 of
981 let str = [c1,c2,c3] in
982 case [ (c,rest) | (p,c) <- silly_escape_chars,
983 Just rest <- [maybePrefixMatch p str] ] of
984 (escape_char,[]):_ -> do
987 (escape_char,_:_):_ -> do
992 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
993 readNum is_digit base conv = do
996 then readNum2 is_digit base conv (conv c)
999 readNum2 is_digit base conv i = do
1002 where read i input = do
1003 case alexGetChar input of
1004 Just (c,input') | is_digit c -> do
1005 read (i*base + conv c) input'
1008 if i >= 0 && i <= 0x10FFFF
1012 silly_escape_chars = [
1049 lit_error = lexError "lexical error in string/character literal"
1051 getCharOrFail :: P Char
1054 case alexGetChar i of
1055 Nothing -> lexError "unexpected end-of-file in string/character literal"
1056 Just (c,i) -> do setInput i; return c
1058 -- -----------------------------------------------------------------------------
1068 SrcSpan -- The start and end of the text span related to
1069 -- the error. Might be used in environments which can
1070 -- show this span, e.g. by highlighting it.
1071 Message -- The error message
1073 data PState = PState {
1074 buffer :: StringBuffer,
1075 last_loc :: SrcSpan, -- pos of previous token
1076 last_offs :: !Int, -- offset of the previous token from the
1077 -- beginning of the current line.
1078 -- \t is equal to 8 spaces.
1079 last_len :: !Int, -- len of previous token
1080 loc :: SrcLoc, -- current loc (end of prev token + 1)
1081 extsBitmap :: !Int, -- bitmap that determines permitted extensions
1082 context :: [LayoutContext],
1085 -- last_loc and last_len are used when generating error messages,
1086 -- and in pushCurrentContext only. Sigh, if only Happy passed the
1087 -- current token to happyError, we could at least get rid of last_len.
1088 -- Getting rid of last_loc would require finding another way to
1089 -- implement pushCurrentContext (which is only called from one place).
1091 newtype P a = P { unP :: PState -> ParseResult a }
1093 instance Monad P where
1099 returnP a = P $ \s -> POk s a
1101 thenP :: P a -> (a -> P b) -> P b
1102 (P m) `thenP` k = P $ \ s ->
1104 POk s1 a -> (unP (k a)) s1
1105 PFailed span err -> PFailed span err
1107 failP :: String -> P a
1108 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1110 failMsgP :: String -> P a
1111 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1113 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1114 failLocMsgP loc1 loc2 str = P $ \s -> PFailed (mkSrcSpan loc1 loc2) (text str)
1116 failSpanMsgP :: SrcSpan -> String -> P a
1117 failSpanMsgP span msg = P $ \s -> PFailed span (text msg)
1119 extension :: (Int -> Bool) -> P Bool
1120 extension p = P $ \s -> POk s (p $! extsBitmap s)
1123 getExts = P $ \s -> POk s (extsBitmap s)
1125 setSrcLoc :: SrcLoc -> P ()
1126 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1128 getSrcLoc :: P SrcLoc
1129 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1131 setLastToken :: SrcSpan -> Int -> P ()
1132 setLastToken loc len = P $ \s -> POk s{ last_loc=loc, last_len=len } ()
1134 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1136 alexInputPrevChar :: AlexInput -> Char
1137 alexInputPrevChar (AI _ _ s) = prevChar s '\n'
1139 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1140 alexGetChar (AI loc ofs s)
1142 | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq` Just (c, (AI loc' ofs' s'))
1143 where c = currentChar s
1144 loc' = advanceSrcLoc loc c
1145 ofs' = advanceOffs c ofs
1148 advanceOffs :: Char -> Int -> Int
1149 advanceOffs '\n' offs = 0
1150 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1151 advanceOffs _ offs = offs + 1
1153 getInput :: P AlexInput
1154 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1156 setInput :: AlexInput -> P ()
1157 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1159 pushLexState :: Int -> P ()
1160 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1162 popLexState :: P Int
1163 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1165 getLexState :: P Int
1166 getLexState = P $ \s@PState{ lex_state=ls:l } -> POk s ls
1168 -- for reasons of efficiency, flags indicating language extensions (eg,
1169 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1172 glaExtsBit, ffiBit, parrBit :: Int
1179 tvBit = 7 -- Scoped type variables enables 'forall' keyword
1181 glaExtsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
1182 glaExtsEnabled flags = testBit flags glaExtsBit
1183 ffiEnabled flags = testBit flags ffiBit
1184 parrEnabled flags = testBit flags parrBit
1185 arrowsEnabled flags = testBit flags arrowsBit
1186 thEnabled flags = testBit flags thBit
1187 ipEnabled flags = testBit flags ipBit
1188 tvEnabled flags = testBit flags tvBit
1190 -- create a parse state
1192 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1193 mkPState buf loc flags =
1196 last_loc = mkSrcSpan loc loc,
1200 extsBitmap = fromIntegral bitmap,
1202 lex_state = [bol, if glaExtsEnabled bitmap then glaexts else 0]
1203 -- we begin in the layout state if toplev_layout is set
1206 bitmap = glaExtsBit `setBitIf` dopt Opt_GlasgowExts flags
1207 .|. ffiBit `setBitIf` dopt Opt_FFI flags
1208 .|. parrBit `setBitIf` dopt Opt_PArr flags
1209 .|. arrowsBit `setBitIf` dopt Opt_Arrows flags
1210 .|. thBit `setBitIf` dopt Opt_TH flags
1211 .|. ipBit `setBitIf` dopt Opt_ImplicitParams flags
1212 .|. tvBit `setBitIf` dopt Opt_ScopedTypeVariables flags
1214 setBitIf :: Int -> Bool -> Int
1215 b `setBitIf` cond | cond = bit b
1218 getContext :: P [LayoutContext]
1219 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1221 setContext :: [LayoutContext] -> P ()
1222 setContext ctx = P $ \s -> POk s{context=ctx} ()
1225 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1226 loc = loc, last_len = len, last_loc = last_loc }) ->
1228 (_:tl) -> POk s{ context = tl } ()
1229 [] -> PFailed last_loc (srcParseErr buf len)
1231 -- Push a new layout context at the indentation of the last token read.
1232 -- This is only used at the outer level of a module when the 'module'
1233 -- keyword is missing.
1234 pushCurrentContext :: P ()
1235 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_len=len, context=ctx } ->
1236 POk s{context = Layout (offs-len) : ctx} ()
1238 getOffside :: P Ordering
1239 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1240 let ord = case stk of
1241 (Layout n:_) -> compare offs n
1245 -- ---------------------------------------------------------------------------
1246 -- Construct a parse error
1249 :: StringBuffer -- current buffer (placed just after the last token)
1250 -> Int -- length of the previous token
1253 = hcat [ if null token
1254 then ptext SLIT("parse error (possibly incorrect indentation)")
1255 else hcat [ptext SLIT("parse error on input "),
1256 char '`', text token, char '\'']
1258 where token = lexemeToString (stepOnBy (-len) buf) len
1260 -- Report a parse failure, giving the span of the previous token as
1261 -- the location of the error. This is the entry point for errors
1262 -- detected during parsing.
1264 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1265 last_loc = last_loc } ->
1266 PFailed last_loc (srcParseErr buf len)
1268 -- A lexical error is reported at a particular position in the source file,
1269 -- not over a token range. TODO: this is slightly wrong, because we record
1270 -- the error at the character position following the one which caused the
1271 -- error. We should somehow back up by one character.
1272 lexError :: String -> P a
1275 i@(AI end _ _) <- getInput
1276 failLocMsgP loc end str
1278 -- -----------------------------------------------------------------------------
1279 -- This is the top-level function: called from the parser each time a
1280 -- new token is to be read from the input.
1282 lexer :: (Located Token -> P a) -> P a
1284 tok@(L _ tok__) <- lexToken
1285 -- trace ("token: " ++ show tok__) $ do
1288 lexToken :: P (Located Token)
1290 inp@(AI loc1 _ buf) <- getInput
1293 case alexScanUser exts inp sc of
1294 AlexEOF -> do let span = mkSrcSpan loc1 loc1
1296 return (L span ITeof)
1297 AlexError (AI loc2 _ _) -> do failLocMsgP loc1 loc2 "lexical error"
1298 AlexSkip inp2 _ -> do
1301 AlexToken inp2@(AI end _ buf2) len t -> do
1303 let span = mkSrcSpan loc1 end
1304 span `seq` setLastToken span len