From 05aa4a18f05dd7159ac656bba8f52157357394d9 Mon Sep 17 00:00:00 2001 From: simonmar Date: Mon, 8 Sep 2003 11:53:14 +0000 Subject: [PATCH] [project @ 2003-09-08 11:53:14 by simonmar] Remove old lexer; replaced by Lexer.x. --- ghc/compiler/parser/Lex.lhs | 1379 ------------------------------------------- 1 file changed, 1379 deletions(-) delete mode 100644 ghc/compiler/parser/Lex.lhs diff --git a/ghc/compiler/parser/Lex.lhs b/ghc/compiler/parser/Lex.lhs deleted file mode 100644 index 5be189c..0000000 --- a/ghc/compiler/parser/Lex.lhs +++ /dev/null @@ -1,1379 +0,0 @@ -% -% (c) The GRASP/AQUA Project, Glasgow University, 1992-1998 -% -\section[Lexical analysis]{Lexical analysis} - --------------------------------------------------------- -[Jan 98] -There's a known bug in here: - - If an interface file ends prematurely, Lex tries to - do headFS of an empty FastString. - -An example that provokes the error is - - f _:_ _forall_ [a] <<>> --------------------------------------------------------- - -\begin{code} -module Lex ( - - srcParseErr, - - -- Monad for parser - Token(..), lexer, ParseResult(..), PState(..), - ExtFlags(..), mkPState, - StringBuffer, - - P, thenP, thenP_, returnP, mapP, failP, failMsgP, - getSrcLocP, setSrcLocP, getSrcFile, - layoutOn, layoutOff, pushContext, popContext - ) where - -#include "HsVersions.h" - -import Char ( toUpper, isDigit, chr, ord ) -import Ratio ( (%) ) - -import PrelNames ( mkTupNameStr ) -import ForeignCall ( Safety(..) ) -import UniqFM ( listToUFM, lookupUFM ) -import BasicTypes ( Boxity(..) ) -import SrcLoc ( SrcLoc, incSrcLine, srcLocFile, srcLocLine, - replaceSrcLine, mkSrcLoc ) - -import ErrUtils ( Message ) -import Outputable - -import FastString -import StringBuffer -import Ctype - -import GLAEXTS -import DATA_BITS ( Bits(..) ) -import DATA_INT ( Int32 ) -\end{code} - -%************************************************************************ -%* * -\subsection{Data types} -%* * -%************************************************************************ - -The token data type, fairly un-interesting except from one -constructor, @ITidinfo@, which is used to lazily lex id info (arity, -strictness, unfolding etc). - -The Idea/Observation here is that the renamer needs to scan through -all of an interface file before it can continue. But only a fraction -of the information contained in the file turns out to be useful, so -delaying as much as possible of the scanning and parsing of an -interface file Makes Sense (Heap profiles of the compiler -show a reduction in heap usage by at least a factor of two, -post-renamer). - -Hence, the interface file lexer spots when value declarations are -being scanned and return the @ITidinfo@ and @ITtype@ constructors -for the type and any other id info for that binding (unfolding, strictness -etc). These constructors are applied to the result of lexing these sub-chunks. - -The lexing of the type and id info is all done lazily, of course, so -the scanning (and subsequent parsing) will be done *only* on the ids the -renamer finds out that it is interested in. The rest will just be junked. -Laziness, you know it makes sense :-) - -\begin{code} -data Token - = ITas -- Haskell keywords - | ITcase - | ITclass - | ITdata - | ITdefault - | ITderiving - | ITdo - | ITelse - | IThiding - | ITif - | ITimport - | ITin - | ITinfix - | ITinfixl - | ITinfixr - | ITinstance - | ITlet - | ITmodule - | ITnewtype - | ITof - | ITqualified - | ITthen - | ITtype - | ITwhere - | ITscc -- ToDo: remove (we use {-# SCC "..." #-} now) - - | ITforall -- GHC extension keywords - | ITforeign - | ITexport - | ITlabel - | ITdynamic - | ITsafe - | ITthreadsafe - | ITunsafe - | ITwith - | ITstdcallconv - | ITccallconv - | ITdotnet - | ITccall (Bool,Bool,Safety) -- (is_dyn, is_casm, may_gc) - | ITmdo - - | ITspecialise_prag -- Pragmas - | ITsource_prag - | ITinline_prag - | ITnoinline_prag - | ITrules_prag - | ITdeprecated_prag - | ITline_prag - | ITscc_prag - | ITcore_prag -- hdaume: core annotations - | ITclose_prag - - | ITdotdot -- reserved symbols - | ITcolon - | ITdcolon - | ITequal - | ITlam - | ITvbar - | ITlarrow - | ITrarrow - | ITat - | ITtilde - | ITdarrow - | ITminus - | ITbang - | ITstar - | ITdot - - | ITbiglam -- GHC-extension symbols - - | ITocurly -- special symbols - | ITccurly - | ITocurlybar -- {|, for type applications - | ITccurlybar -- |}, for type applications - | ITvccurly - | ITobrack - | ITopabrack -- [:, for parallel arrays with -fparr - | ITcpabrack -- :], for parallel arrays with -fparr - | ITcbrack - | IToparen - | ITcparen - | IToubxparen - | ITcubxparen - | ITsemi - | ITcomma - | ITunderscore - | ITbackquote - - | ITvarid FastString -- identifiers - | ITconid FastString - | ITvarsym FastString - | ITconsym FastString - | ITqvarid (FastString,FastString) - | ITqconid (FastString,FastString) - | ITqvarsym (FastString,FastString) - | ITqconsym (FastString,FastString) - - | ITdupipvarid FastString -- GHC extension: implicit param: ?x - | ITsplitipvarid FastString -- GHC extension: implicit param: %x - - | ITpragma StringBuffer - - | ITchar Int - | ITstring FastString - | ITinteger Integer - | ITrational Rational - - | ITprimchar Int - | ITprimstring FastString - | ITprimint Integer - | ITprimfloat Rational - | ITprimdouble Rational - | ITlitlit FastString - - -- MetaHaskell extension tokens - | ITopenExpQuote -- [| or [e| - | ITopenPatQuote -- [p| - | ITopenDecQuote -- [d| - | ITopenTypQuote -- [t| - | ITcloseQuote -- |] - | ITidEscape FastString -- $x - | ITparenEscape -- $( - | ITreifyType - | ITreifyDecl - | ITreifyFixity - - -- Arrow notation extension - | ITproc - | ITrec - | IToparenbar -- (| - | ITcparenbar -- |) - | ITlarrowtail -- -< - | ITrarrowtail -- >- - | ITLarrowtail -- -<< - | ITRarrowtail -- >>- - - | ITunknown String -- Used when the lexer can't make sense of it - | ITeof -- end of file token - deriving Show -- debugging -\end{code} - ------------------------------------------------------------------------------ -Keyword Lists - -\begin{code} -pragmaKeywordsFM = listToUFM $ - map (\ (x,y) -> (mkFastString x,y)) - [( "SPECIALISE", ITspecialise_prag ), - ( "SPECIALIZE", ITspecialise_prag ), - ( "SOURCE", ITsource_prag ), - ( "INLINE", ITinline_prag ), - ( "NOINLINE", ITnoinline_prag ), - ( "NOTINLINE", ITnoinline_prag ), - ( "LINE", ITline_prag ), - ( "RULES", ITrules_prag ), - ( "RULEZ", ITrules_prag ), -- american spelling :-) - ( "SCC", ITscc_prag ), - ( "CORE", ITcore_prag ), -- hdaume: core annotation - ( "DEPRECATED", ITdeprecated_prag ) - ] - -haskellKeywordsFM = listToUFM $ - map (\ (x,y) -> (mkFastString x,y)) - [( "_", ITunderscore ), - ( "as", ITas ), - ( "case", ITcase ), - ( "class", ITclass ), - ( "data", ITdata ), - ( "default", ITdefault ), - ( "deriving", ITderiving ), - ( "do", ITdo ), - ( "else", ITelse ), - ( "hiding", IThiding ), - ( "if", ITif ), - ( "import", ITimport ), - ( "in", ITin ), - ( "infix", ITinfix ), - ( "infixl", ITinfixl ), - ( "infixr", ITinfixr ), - ( "instance", ITinstance ), - ( "let", ITlet ), - ( "module", ITmodule ), - ( "newtype", ITnewtype ), - ( "of", ITof ), - ( "qualified", ITqualified ), - ( "then", ITthen ), - ( "type", ITtype ), - ( "where", ITwhere ), - ( "_scc_", ITscc ) -- ToDo: remove - ] - -isSpecial :: Token -> Bool --- If we see M.x, where x is a keyword, but --- is special, we treat is as just plain M.x, --- not as a keyword. -isSpecial ITas = True -isSpecial IThiding = True -isSpecial ITqualified = True -isSpecial ITforall = True -isSpecial ITexport = True -isSpecial ITlabel = True -isSpecial ITdynamic = True -isSpecial ITsafe = True -isSpecial ITthreadsafe = True -isSpecial ITunsafe = True -isSpecial ITwith = True -isSpecial ITccallconv = True -isSpecial ITstdcallconv = True -isSpecial ITmdo = True -isSpecial _ = False - --- the bitmap provided as the third component indicates whether the --- corresponding extension keyword is valid under the extension options --- provided to the compiler; if the extension corresponding to *any* of the --- bits set in the bitmap is enabled, the keyword is valid (this setup --- facilitates using a keyword in two different extensions that can be --- activated independently) --- -ghcExtensionKeywordsFM = listToUFM $ - map (\(x, y, z) -> (mkFastString x, (y, z))) - [ ( "forall", ITforall, bit glaExtsBit), - ( "mdo", ITmdo, bit glaExtsBit), - ( "reifyDecl", ITreifyDecl, bit glaExtsBit), - ( "reifyType", ITreifyType, bit glaExtsBit), - ( "reifyFixity",ITreifyFixity, bit glaExtsBit), - - ( "foreign", ITforeign, bit ffiBit), - ( "export", ITexport, bit ffiBit), - ( "label", ITlabel, bit ffiBit), - ( "dynamic", ITdynamic, bit ffiBit), - ( "safe", ITsafe, bit ffiBit), - ( "threadsafe", ITthreadsafe, bit ffiBit), - ( "unsafe", ITunsafe, bit ffiBit), - ( "stdcall", ITstdcallconv, bit ffiBit), - ( "ccall", ITccallconv, bit ffiBit), - ( "dotnet", ITdotnet, bit ffiBit), - - ( "with", ITwith, bit withBit), - - ( "rec", ITrec, bit arrowsBit), - ( "proc", ITproc, bit arrowsBit), - - -- On death row - ("_ccall_", ITccall (False, False, PlayRisky), - bit glaExtsBit), - ("_ccall_GC_", ITccall (False, False, PlaySafe False), - bit glaExtsBit), - ("_casm_", ITccall (False, True, PlayRisky), - bit glaExtsBit), - ("_casm_GC_", ITccall (False, True, PlaySafe False), - bit glaExtsBit) - ] - -haskellKeySymsFM = listToUFM $ - map (\ (x,y,z) -> (mkFastString x,(y,z))) - [ ("..", ITdotdot, Nothing) - ,(":", ITcolon, Nothing) -- (:) is a reserved op, - -- meaning only list cons - ,("::", ITdcolon, Nothing) - ,("=", ITequal, Nothing) - ,("\\", ITlam, Nothing) - ,("|", ITvbar, Nothing) - ,("<-", ITlarrow, Nothing) - ,("->", ITrarrow, Nothing) - ,("@", ITat, Nothing) - ,("~", ITtilde, Nothing) - ,("=>", ITdarrow, Nothing) - ,("-", ITminus, Nothing) - ,("!", ITbang, Nothing) - - ,("*", ITstar, Just (bit glaExtsBit)) -- For data T (a::*) = MkT - ,(".", ITdot, Just (bit glaExtsBit)) -- For 'forall a . t' - - ,("-<", ITlarrowtail, Just (bit arrowsBit)) - ,(">-", ITrarrowtail, Just (bit arrowsBit)) - ,("-<<", ITLarrowtail, Just (bit arrowsBit)) - ,(">>-", ITRarrowtail, Just (bit arrowsBit)) - ] - -\end{code} - ------------------------------------------------------------------------------ -The lexical analyser - -Lexer state: - - - (exts) lexing a source with extensions, eg, an interface file or - with -fglasgow-exts - - (bol) pointer to beginning of line (for column calculations) - - (buf) pointer to beginning of token - - (buf) pointer to current char - - (atbol) flag indicating whether we're at the beginning of a line - -\begin{code} -lexer :: (Token -> P a) -> P a -lexer cont buf s@(PState{ - loc = loc, - extsBitmap = exts, - bol = bol, - atbol = atbol, - context = ctx - }) - - -- first, start a new lexeme and lose all the whitespace - = _scc_ "Lexer" - tab line bol atbol (stepOverLexeme buf) - where - line = srcLocLine loc - - tab y bol atbol buf = --trace ("tab: " ++ show (I# y) ++ " : " ++ show (currentChar buf)) $ - case currentChar# buf of - - '\NUL'# -> - if bufferExhausted (stepOn buf) - then cont ITeof buf s' - else trace "lexer: misplaced NUL?" $ - tab y bol atbol (stepOn buf) - - '\n'# -> let buf' = stepOn buf - in tab (y +# 1#) (currentIndex# buf') 1# buf' - - -- find comments. This got harder in Haskell 98. - '-'# -> let trundle n = - let next = lookAhead# buf n in - if next `eqChar#` '-'# then trundle (n +# 1#) - else if is_symbol next || n <# 2# - then is_a_token - else tab y bol atbol - (stepOnUntilChar# (stepOnBy# buf n) '\n'#) - in trundle 1# - - -- comments and pragmas. We deal with LINE pragmas here, - -- and throw out any unrecognised pragmas as comments. Any - -- pragmas we know about are dealt with later (after any layout - -- processing if necessary). - '{'# | lookAhead# buf 1# `eqChar#` '-'# -> - if lookAhead# buf 2# `eqChar#` '#'# then - case expandWhile# is_space (addToCurrentPos buf 3#) of { buf1-> - case expandWhile# is_ident (stepOverLexeme buf1) of { buf2-> - let lexeme = mkFastString -- ToDo: too slow - (map toUpper (lexemeToString buf2)) in - case lookupUFM pragmaKeywordsFM lexeme of - -- ignore RULES pragmas when -fglasgow-exts is off - Just ITrules_prag | not (glaExtsEnabled exts) -> - skip_to_end (stepOnBy# buf 2#) s' - Just ITline_prag -> - line_prag skip_to_end buf2 s' - Just other -> is_a_token - Nothing -> skip_to_end (stepOnBy# buf 2#) s' - }} - - else skip_to_end (stepOnBy# buf 2#) s' - where - skip_to_end = skipNestedComment (lexer cont) - - -- special GHC extension: we grok cpp-style #line pragmas - '#'# | lexemeIndex buf ==# bol -> -- the '#' must be in column 0 - -- SPECIAL CASE: if we see "#!" at the beginning of the line, - -- we ignore the rest of the line. This is for script-files - -- on Unix which begin with the special syntax "#! /bin/sh", - -- for example. - if lookAhead# buf 1# `eqChar#` '!'# - then next_line (stepOnBy# buf 2#) s' - else - let buf1 | lookAhead# buf 1# `eqChar#` 'l'# && - lookAhead# buf 2# `eqChar#` 'i'# && - lookAhead# buf 3# `eqChar#` 'n'# && - lookAhead# buf 4# `eqChar#` 'e'# = stepOnBy# buf 5# - | otherwise = stepOn buf - in - case expandWhile# is_space buf1 of { buf2 -> - if is_digit (currentChar# buf2) - then line_prag next_line buf2 s' - else is_a_token - } - where - next_line buf = lexer cont (stepOnUntilChar# buf '\n'#) - - -- tabs have been expanded beforehand - c | is_space c -> tab y bol atbol (stepOn buf) - | otherwise -> is_a_token - - where s' = s{loc = replaceSrcLine loc y, - bol = bol, - atbol = atbol} - - is_a_token | atbol /=# 0# = lexBOL cont buf s' - | otherwise = lexToken cont exts buf s' - --- {-# LINE .. #-} pragmas. yeuch. -line_prag cont buf s@PState{loc=loc} = - case expandWhile# is_space buf of { buf1 -> - case scanNumLit 0 (stepOverLexeme buf1) of { (line,buf2) -> - -- subtract one: the line number refers to the *following* line. - let real_line = line - 1 in - case fromInteger real_line of { i@(I# l) -> - -- ToDo, if no filename then we skip the newline.... d'oh - case expandWhile# is_space buf2 of { buf3 -> - case currentChar# buf3 of - '\"'#{-"-} -> - case untilEndOfString# (stepOn (stepOverLexeme buf3)) of { buf4 -> - let - file = lexemeToFastString buf4 - new_buf = stepOn (stepOverLexeme buf4) - in - if nullFastString file - then cont new_buf s{loc = replaceSrcLine loc l} - else cont new_buf s{loc = mkSrcLoc file i} - } - _other -> cont (stepOverLexeme buf3) s{loc = replaceSrcLine loc l} - }}}} - -skipNestedComment :: P a -> P a -skipNestedComment cont buf state = skipNestedComment' (loc state) cont buf state - -skipNestedComment' :: SrcLoc -> P a -> P a -skipNestedComment' orig_loc cont buf = loop buf - where - loop buf = - case currentChar# buf of - '-'# | lookAhead# buf 1# `eqChar#` '}'# -> cont (stepOnBy# buf 2#) - - '{'# | lookAhead# buf 1# `eqChar#` '-'# -> - skipNestedComment - (skipNestedComment' orig_loc cont) - (stepOnBy# buf 2#) - - '\n'# -> \ s@PState{loc=loc} -> - let buf' = stepOn buf in - loop buf' s{loc = incSrcLine loc, - bol = currentIndex# buf', - atbol = 1#} - - -- pass the original SrcLoc to lexError so that the error is - -- reported at the line it was originally on, not the line at - -- the end of the file. - '\NUL'# | bufferExhausted (stepOn buf) -> - \s -> lexError "unterminated `{-'" buf s{loc=orig_loc} -- -} - - _ -> loop (stepOn buf) - --- When we are lexing the first token of a line, check whether we need to --- insert virtual semicolons or close braces due to layout. - -lexBOL :: (Token -> P a) -> P a -lexBOL cont buf s@(PState{ - loc = loc, - extsBitmap = exts, - bol = bol, - atbol = atbol, - context = ctx - }) = - if need_close_curly then - --trace ("col = " ++ show (I# col) ++ ", layout: inserting '}'") $ - cont ITvccurly buf s{atbol = 1#, context = tail ctx} - else if need_semi_colon then - --trace ("col = " ++ show (I# col) ++ ", layout: inserting ';'") $ - cont ITsemi buf s{atbol = 0#} - else - lexToken cont exts buf s{atbol = 0#} - where - col = currentIndex# buf -# bol - - need_close_curly = - case ctx of - [] -> False - (i:_) -> case i of - NoLayout -> False - Layout n -> col <# n - need_semi_colon = - case ctx of - [] -> False - (i:_) -> case i of - NoLayout -> False - Layout n -> col ==# n - - -lexToken :: (Token -> P a) -> Int# -> P a -lexToken cont exts buf = --- trace "lexToken" $ - case currentChar# buf of - - -- special symbols ---------------------------------------------------- - '('# | glaExtsEnabled exts && lookAhead# buf 1# `eqChar#` '#'# && - -- Unboxed tules: '(#' but not '(##' - not (lookAhead# buf 2# `eqChar#` '#'#) - -> cont IToubxparen (addToCurrentPos buf 2#) - -- Arrow notation extension: '(|' but not '(||' - | arrowsEnabled exts && lookAhead# buf 1# `eqChar#` '|'# && - not (lookAhead# buf 2# `eqChar#` '|'#) - -> cont IToparenbar (addToCurrentPos buf 2#) - | otherwise - -> cont IToparen (incCurrentPos buf) - - ')'# -> cont ITcparen (incCurrentPos buf) - '['# | parrEnabled exts && lookAhead# buf 1# `eqChar#` ':'# -> - cont ITopabrack (addToCurrentPos buf 2#) - ------- MetaHaskell Extensions, looking for [| [e| [t| [p| and [d| - | glaExtsEnabled exts && - ((lookAhead# buf 1# ) `eqChar#` '|'# ) -> - cont ITopenExpQuote (addToCurrentPos buf 2# ) - | glaExtsEnabled exts && - (let c = (lookAhead# buf 1# ) - in eqChar# c 'e'# || eqChar# c 't'# || eqChar# c 'd'# || eqChar# c 'p'#) && - ((lookAhead# buf 2#) `eqChar#` '|'#) -> - let quote 'e'# = ITopenExpQuote - quote 'p'# = ITopenPatQuote - quote 'd'# = ITopenDecQuote - quote 't'# = ITopenTypQuote - in cont (quote (lookAhead# buf 1#)) (addToCurrentPos buf 3# ) - | otherwise -> - cont ITobrack (incCurrentPos buf) - - ']'# -> cont ITcbrack (incCurrentPos buf) - ','# -> cont ITcomma (incCurrentPos buf) - ';'# -> cont ITsemi (incCurrentPos buf) - '}'# -> \ s@PState{context = ctx} -> - case ctx of - (_:ctx') -> cont ITccurly (incCurrentPos buf) s{context=ctx'} - _ -> lexError "too many '}'s" buf s - '|'# -> case lookAhead# buf 1# of - '}'# | glaExtsEnabled exts -> cont ITccurlybar - (addToCurrentPos buf 2#) - -- MetaHaskell extension - ']'# | glaExtsEnabled exts -> cont ITcloseQuote (addToCurrentPos buf 2#) - -- arrow notation extension - ')'# | arrowsEnabled exts -> cont ITcparenbar - (addToCurrentPos buf 2#) - other -> lex_sym cont exts (incCurrentPos buf) - ':'# -> case lookAhead# buf 1# of - ']'# | parrEnabled exts -> cont ITcpabrack - (addToCurrentPos buf 2#) - _ -> lex_sym cont exts (incCurrentPos buf) - - - '#'# -> case lookAhead# buf 1# of - ')'# | glaExtsEnabled exts - -> cont ITcubxparen (addToCurrentPos buf 2#) - '-'# -> case lookAhead# buf 2# of - '}'# -> cont ITclose_prag (addToCurrentPos buf 3#) - _ -> lex_sym cont exts (incCurrentPos buf) - _ -> lex_sym cont exts (incCurrentPos buf) - - '`'# | glaExtsEnabled exts && lookAhead# buf 1# `eqChar#` '`'# - -> lex_cstring cont (addToCurrentPos buf 2#) - | otherwise - -> cont ITbackquote (incCurrentPos buf) - - '{'# -> -- for Emacs: -} - case lookAhead# buf 1# of - '|'# | glaExtsEnabled exts - -> cont ITocurlybar (addToCurrentPos buf 2#) - '-'# -> case lookAhead# buf 2# of - '#'# -> lex_prag cont (addToCurrentPos buf 3#) - _ -> cont ITocurly (incCurrentPos buf) - _ -> (layoutOff `thenP_` cont ITocurly) (incCurrentPos buf) - - - - - -- strings/characters ------------------------------------------------- - '\"'#{-"-} -> lex_string cont exts [] (incCurrentPos buf) - '\''# -> lex_char (char_end cont) exts (incCurrentPos buf) - - -- Hexadecimal and octal constants - '0'# | (ch `eqChar#` 'x'# || ch `eqChar#` 'X'#) && is_hexdigit ch2 - -> readNum (after_lexnum cont exts) buf' is_hexdigit 16 hex - | (ch `eqChar#` 'o'# || ch `eqChar#` 'O'#) && is_octdigit ch2 - -> readNum (after_lexnum cont exts) buf' is_octdigit 8 oct_or_dec - where ch = lookAhead# buf 1# - ch2 = lookAhead# buf 2# - buf' = addToCurrentPos buf 2# - - '\NUL'# -> - if bufferExhausted (stepOn buf) then - cont ITeof buf - else - trace "lexIface: misplaced NUL?" $ - cont (ITunknown "\NUL") (stepOn buf) - - '?'# | glaExtsEnabled exts && is_lower (lookAhead# buf 1#) -> -- ?x implicit parameter - specialPrefixId ITdupipvarid cont exts (incCurrentPos buf) - '%'# | glaExtsEnabled exts && is_lower (lookAhead# buf 1#) -> - specialPrefixId ITsplitipvarid cont exts (incCurrentPos buf) - - ---------------- MetaHaskell Extensions for quotation escape - '$'# | glaExtsEnabled exts && is_lower (lookAhead# buf 1#) -> -- $x variable escape - specialPrefixId ITidEscape cont exts (addToCurrentPos buf 1#) - '$'# | glaExtsEnabled exts && -- $( f x ) expression escape - ((lookAhead# buf 1#) `eqChar#` '('#) -> cont ITparenEscape (addToCurrentPos buf 2#) - - c | is_digit c -> lex_num cont exts 0 buf - | is_symbol c -> lex_sym cont exts buf - | is_upper c -> lex_con cont exts buf - | is_lower c -> lex_id cont exts buf - | otherwise -> lexError "illegal character" buf - --- Int# is unlifted, and therefore faster than Bool for flags. -{-# INLINE flag #-} -flag :: Int# -> Bool -flag 0# = False -flag _ = True - -------------------------------------------------------------------------------- --- Pragmas - -lex_prag cont buf - = case expandWhile# is_space buf of { buf1 -> - case expandWhile# is_ident (stepOverLexeme buf1) of { buf2 -> - let lexeme = mkFastString (map toUpper (lexemeToString buf2)) in - case lookupUFM pragmaKeywordsFM lexeme of - Just kw -> cont kw (mergeLexemes buf buf2) - Nothing -> panic "lex_prag" - }} - -------------------------------------------------------------------------------- --- Strings & Chars - -lex_string cont exts s buf - = case currentChar# buf of - '"'#{-"-} -> - let buf' = incCurrentPos buf - in case currentChar# buf' of - '#'# | glaExtsEnabled exts -> - if any (> 0xFF) s - then lexError "primitive string literal must contain only characters <= \'\\xFF\'" buf' - else let s' = mkFastStringNarrow (map chr (reverse s)) in - -- always a narrow string/byte array - cont (ITprimstring s') (incCurrentPos buf') - - _other -> let s' = mkFastString (map chr (reverse s)) - in cont (ITstring s') buf' - - -- ignore \& in a string, deal with string gaps - '\\'# | next_ch `eqChar#` '&'# - -> lex_string cont exts s buf' - | is_space next_ch - -> lex_stringgap cont exts s (incCurrentPos buf) - - where next_ch = lookAhead# buf 1# - buf' = addToCurrentPos buf 2# - - _ -> lex_char (lex_next_string cont s) exts buf - -lex_stringgap cont exts s buf - = let buf' = incCurrentPos buf in - case currentChar# buf of - '\n'# -> \st@PState{loc = loc} -> lex_stringgap cont exts s buf' - st{loc = incSrcLine loc} - '\\'# -> lex_string cont exts s buf' - c | is_space c -> lex_stringgap cont exts s buf' - other -> charError buf' - -lex_next_string cont s exts c buf = lex_string cont exts (c:s) buf - -lex_char :: (Int# -> Int -> P a) -> Int# -> P a -lex_char cont exts buf - = case currentChar# buf of - '\\'# -> lex_escape (cont exts) (incCurrentPos buf) - c | is_any c -> cont exts (I# (ord# c)) (incCurrentPos buf) - other -> charError buf - -char_end cont exts c buf - = case currentChar# buf of - '\''# -> let buf' = incCurrentPos buf in - case currentChar# buf' of - '#'# | glaExtsEnabled exts - -> cont (ITprimchar c) (incCurrentPos buf') - _ -> cont (ITchar c) buf' - _ -> charError buf - -lex_escape cont buf - = let buf' = incCurrentPos buf in - case currentChar# buf of - 'a'# -> cont (ord '\a') buf' - 'b'# -> cont (ord '\b') buf' - 'f'# -> cont (ord '\f') buf' - 'n'# -> cont (ord '\n') buf' - 'r'# -> cont (ord '\r') buf' - 't'# -> cont (ord '\t') buf' - 'v'# -> cont (ord '\v') buf' - '\\'# -> cont (ord '\\') buf' - '"'# -> cont (ord '\"') buf' - '\''# -> cont (ord '\'') buf' - '^'# -> let c = currentChar# buf' in - if c `geChar#` '@'# && c `leChar#` '_'# - then cont (I# (ord# c -# ord# '@'#)) (incCurrentPos buf') - else charError buf' - - 'x'# -> readNum (after_charnum cont) buf' is_hexdigit 16 hex - 'o'# -> readNum (after_charnum cont) buf' is_octdigit 8 oct_or_dec - x | is_digit x - -> readNum (after_charnum cont) buf is_digit 10 oct_or_dec - - _ -> case [ (c,buf2) | (p,c) <- silly_escape_chars, - Just buf2 <- [prefixMatch buf p] ] of - (c,buf2):_ -> cont (ord c) buf2 - [] -> charError buf' - -after_charnum cont i buf - = if i >= 0 && i <= 0x10FFFF - then cont (fromInteger i) buf - else charError buf - -readNum cont buf is_digit base conv = read buf 0 - where read buf i - = case currentChar# buf of { c -> - if is_digit c - then read (incCurrentPos buf) (i*base + (toInteger (I# (conv c)))) - else cont i buf - } - -is_hexdigit c - = is_digit c - || (c `geChar#` 'a'# && c `leChar#` 'f'#) - || (c `geChar#` 'A'# && c `leChar#` 'F'#) - -hex c | is_digit c = ord# c -# ord# '0'# - | otherwise = ord# (to_lower c) -# ord# 'a'# +# 10# -oct_or_dec c = ord# c -# ord# '0'# - -is_octdigit c = c `geChar#` '0'# && c `leChar#` '7'# - -to_lower c - | c `geChar#` 'A'# && c `leChar#` 'Z'# - = chr# (ord# c -# (ord# 'A'# -# ord# 'a'#)) - | otherwise = c - -charError buf = lexError "error in character literal" buf - -silly_escape_chars = [ - ("NUL", '\NUL'), - ("SOH", '\SOH'), - ("STX", '\STX'), - ("ETX", '\ETX'), - ("EOT", '\EOT'), - ("ENQ", '\ENQ'), - ("ACK", '\ACK'), - ("BEL", '\BEL'), - ("BS", '\BS'), - ("HT", '\HT'), - ("LF", '\LF'), - ("VT", '\VT'), - ("FF", '\FF'), - ("CR", '\CR'), - ("SO", '\SO'), - ("SI", '\SI'), - ("DLE", '\DLE'), - ("DC1", '\DC1'), - ("DC2", '\DC2'), - ("DC3", '\DC3'), - ("DC4", '\DC4'), - ("NAK", '\NAK'), - ("SYN", '\SYN'), - ("ETB", '\ETB'), - ("CAN", '\CAN'), - ("EM", '\EM'), - ("SUB", '\SUB'), - ("ESC", '\ESC'), - ("FS", '\FS'), - ("GS", '\GS'), - ("RS", '\RS'), - ("US", '\US'), - ("SP", '\SP'), - ("DEL", '\DEL') - ] - ------------------------------------------------------------------------------ --- Numbers - -lex_num :: (Token -> P a) -> Int# -> Integer -> P a -lex_num cont exts acc buf = - case scanNumLit acc buf of - (acc',buf') -> - case currentChar# buf' of - '.'# | is_digit (lookAhead# buf' 1#) -> - -- this case is not optimised at all, as the - -- presence of floating point numbers in interface - -- files is not that common. (ToDo) - case expandWhile# is_digit (incCurrentPos buf') of - buf2 -> -- points to first non digit char - case currentChar# buf2 of - 'E'# -> float_exponent cont exts buf2 - 'e'# -> float_exponent cont exts buf2 - _ -> float_done cont exts buf2 - - -- numbers like '9e4' are floats - 'E'# -> float_exponent cont exts buf' - 'e'# -> float_exponent cont exts buf' - _ -> after_lexnum cont exts acc' buf' -- it's an integer - -float_exponent cont exts buf2 = - let buf3 = incCurrentPos buf2 - buf4 = case currentChar# buf3 of - '-'# | is_digit (lookAhead# buf3 1#) - -> expandWhile# is_digit (incCurrentPos buf3) - '+'# | is_digit (lookAhead# buf3 1#) - -> expandWhile# is_digit (incCurrentPos buf3) - x | is_digit x -> expandWhile# is_digit buf3 - _ -> buf2 - in - float_done cont exts buf4 - -float_done cont exts buf = - case currentChar# buf of -- glasgow exts only - '#'# | glaExtsEnabled exts -> - let buf' = incCurrentPos buf in - case currentChar# buf' of - '#'# -> cont (ITprimdouble v) (incCurrentPos buf') - _ -> cont (ITprimfloat v) buf' - _ -> cont (ITrational v) buf - where - v = readRational__ (lexemeToString buf) - -after_lexnum cont exts i buf - = case currentChar# buf of - '#'# | glaExtsEnabled exts -> cont (ITprimint i) (incCurrentPos buf) - _ -> cont (ITinteger i) buf - -readRational :: ReadS Rational -- NB: doesn't handle leading "-" -readRational r = do - (n,d,s) <- readFix r - (k,t) <- readExp s - return ((n%1)*10^^(k-d), t) - where - readFix r = do - (ds,s) <- lexDecDigits r - (ds',t) <- lexDotDigits s - return (read (ds++ds'), length ds', t) - - readExp (e:s) | e `elem` "eE" = readExp' s - readExp s = return (0,s) - - readExp' ('+':s) = readDec s - readExp' ('-':s) = do - (k,t) <- readDec s - return (-k,t) - readExp' s = readDec s - - readDec s = do - (ds,r) <- nonnull isDigit s - return (foldl1 (\n d -> n * 10 + d) [ ord d - ord '0' | d <- ds ], - r) - - lexDecDigits = nonnull isDigit - - lexDotDigits ('.':s) = return (span isDigit s) - lexDotDigits s = return ("",s) - - nonnull p s = do (cs@(_:_),t) <- return (span p s) - return (cs,t) - -readRational__ :: String -> Rational -- NB: *does* handle a leading "-" -readRational__ top_s - = case top_s of - '-' : xs -> - (read_me xs) - xs -> read_me xs - where - read_me s - = case (do { (x,"") <- readRational s ; return x }) of - [x] -> x - [] -> error ("readRational__: no parse:" ++ top_s) - _ -> error ("readRational__: ambiguous parse:" ++ top_s) - ------------------------------------------------------------------------------ --- C "literal literal"s (i.e. things like ``NULL'', ``stdout'' etc.) - --- we lexemeToFastString on the bit between the ``''s, but include the --- quotes in the full lexeme. - -lex_cstring cont buf = - case expandUntilMatch (stepOverLexeme buf) "\'\'" of - Just buf' -> cont (ITlitlit (lexemeToFastString - (addToCurrentPos buf' (negateInt# 2#)))) - (mergeLexemes buf buf') - Nothing -> lexError "unterminated ``" buf - ------------------------------------------------------------------------------ --- identifiers, symbols etc. - --- used for identifiers with special prefixes like --- ?x (implicit parameters), $x (MetaHaskell escapes) and #x --- we've already seen the prefix char, so look for an id, and wrap --- the new "ip_constr" around the lexeme returned - -specialPrefixId ip_constr cont exts buf = lex_id newcont exts buf - where newcont (ITvarid lexeme) buf2 = cont (ip_constr (tailFS lexeme)) buf2 - newcont token buf2 = cont token buf2 -{- - case expandWhile# is_ident buf of - buf' -> cont (ip_constr (tailFS lexeme)) buf' - where lexeme = lexemeToFastString buf' --} - -lex_id cont exts buf = - let buf1 = expandWhile# is_ident buf in - seq buf1 $ - - case (if glaExtsEnabled exts - then expandWhile# (eqChar# '#'#) buf1 -- slurp trailing hashes - else buf1) of { buf' -> - seq buf' $ - - let lexeme = lexemeToFastString buf' in - - case _scc_ "haskellKeyword" lookupUFM haskellKeywordsFM lexeme of { - Just kwd_token -> --trace ("hkeywd: "++unpackFS(lexeme)) $ - cont kwd_token buf'; - Nothing -> - - let var_token = cont (ITvarid lexeme) buf' in - - case lookupUFM ghcExtensionKeywordsFM lexeme of { - Just (kwd_token, validExts) - | validExts .&. (toInt32 exts) /= 0 -> cont kwd_token buf'; - _ -> var_token - - }}} - -lex_sym cont exts buf = - -- trace "lex_sym" $ - case expandWhile# is_symbol buf of - buf' -> case lookupUFM haskellKeySymsFM lexeme of { - Just (kwd_token, Nothing) - -> cont kwd_token buf' ; - Just (kwd_token, Just validExts) - | validExts .&. toInt32 exts /= 0 - -> cont kwd_token buf' ; - other -> cont (mk_var_token lexeme) buf' - } - where lexeme = lexemeToFastString buf' - - --- lex_con recursively collects components of a qualified identifer. --- The argument buf is the StringBuffer representing the lexeme --- identified so far, where the next character is upper-case. - -lex_con cont exts buf = - -- trace ("con: "{-++unpackFS lexeme-}) $ - let empty_buf = stepOverLexeme buf in - case expandWhile# is_ident empty_buf of { buf1 -> - case slurp_trailing_hashes buf1 exts of { con_buf -> - - let all_buf = mergeLexemes buf con_buf - - con_lexeme = lexemeToFastString con_buf - mod_lexeme = lexemeToFastString (decCurrentPos buf) - all_lexeme = lexemeToFastString all_buf - - just_a_conid - | emptyLexeme buf = cont (ITconid con_lexeme) all_buf - | otherwise = cont (ITqconid (mod_lexeme,con_lexeme)) all_buf - in - - case currentChar# all_buf of - '.'# -> maybe_qualified cont exts all_lexeme - (incCurrentPos all_buf) just_a_conid - _ -> just_a_conid - }} - - -maybe_qualified cont exts mod buf just_a_conid = - -- trace ("qid: "{-++unpackFS lexeme-}) $ - case currentChar# buf of - '['# -> -- Special case for [] - case lookAhead# buf 1# of - ']'# -> cont (ITqconid (mod,FSLIT("[]"))) (addToCurrentPos buf 2#) - _ -> just_a_conid - - '('# -> -- Special case for (,,,) - -- This *is* necessary to deal with e.g. "instance C PrelBase.(,,)" - case lookAhead# buf 1# of - '#'# | glaExtsEnabled exts -> case lookAhead# buf 2# of - ','# -> lex_ubx_tuple cont mod (addToCurrentPos buf 3#) - just_a_conid - _ -> just_a_conid - ')'# -> cont (ITqconid (mod,FSLIT("()"))) (addToCurrentPos buf 2#) - ','# -> lex_tuple cont mod (addToCurrentPos buf 2#) just_a_conid - _ -> just_a_conid - - '-'# -> case lookAhead# buf 1# of - '>'# -> cont (ITqconid (mod,FSLIT("(->)"))) (addToCurrentPos buf 2#) - _ -> lex_id3 cont exts mod buf just_a_conid - - _ -> lex_id3 cont exts mod buf just_a_conid - - -lex_id3 cont exts mod buf just_a_conid - | is_upper (currentChar# buf) = - lex_con cont exts buf - - | is_symbol (currentChar# buf) = - let - start_new_lexeme = stepOverLexeme buf - in - -- trace ("lex_id31 "{-++unpackFS lexeme-}) $ - case expandWhile# is_symbol start_new_lexeme of { buf' -> - let - lexeme = lexemeToFastString buf' - -- real lexeme is M. - new_buf = mergeLexemes buf buf' - in - cont (mk_qvar_token mod lexeme) new_buf - -- wrong, but arguably morally right: M... is now a qvarsym - } - - | otherwise = - let - start_new_lexeme = stepOverLexeme buf - in - -- trace ("lex_id32 "{-++unpackFS lexeme-}) $ - case expandWhile# is_ident start_new_lexeme of { buf1 -> - if emptyLexeme buf1 - then just_a_conid - else - - case slurp_trailing_hashes buf1 exts of { buf' -> - - let - lexeme = lexemeToFastString buf' - new_buf = mergeLexemes buf buf' - is_a_qvarid = cont (mk_qvar_token mod lexeme) new_buf - in - case _scc_ "haskellKeyword" lookupUFM haskellKeywordsFM lexeme of { - Nothing -> is_a_qvarid ; - - Just kwd_token | isSpecial kwd_token -- special ids (as, qualified, hiding) shouldn't be - -> is_a_qvarid -- recognised as keywords here. - | otherwise - -> just_a_conid -- avoid M.where etc. - }}} - -slurp_trailing_hashes buf exts - | glaExtsEnabled exts = expandWhile# (`eqChar#` '#'#) buf - | otherwise = buf - - -mk_var_token pk_str - | is_upper f = ITconid pk_str - | is_ident f = ITvarid pk_str - | f `eqChar#` ':'# = ITconsym pk_str - | otherwise = ITvarsym pk_str - where - (C# f) = headFS pk_str - -- tl = _TAIL_ pk_str - -mk_qvar_token m token = --- trace ("mk_qvar ") $ - case mk_var_token token of - ITconid n -> ITqconid (m,n) - ITvarid n -> ITqvarid (m,n) - ITconsym n -> ITqconsym (m,n) - ITvarsym n -> ITqvarsym (m,n) - _ -> ITunknown (show token) -\end{code} - ----------------------------------------------------------------------------- -Horrible stuff for dealing with M.(,,,) - -\begin{code} -lex_tuple cont mod buf back_off = - go 2 buf - where - go n buf = - case currentChar# buf of - ','# -> go (n+1) (stepOn buf) - ')'# -> cont (ITqconid (mod, snd (mkTupNameStr Boxed n))) (stepOn buf) - _ -> back_off - -lex_ubx_tuple cont mod buf back_off = - go 2 buf - where - go n buf = - case currentChar# buf of - ','# -> go (n+1) (stepOn buf) - '#'# -> case lookAhead# buf 1# of - ')'# -> cont (ITqconid (mod, snd (mkTupNameStr Unboxed n))) - (stepOnBy# buf 2#) - _ -> back_off - _ -> back_off -\end{code} - ------------------------------------------------------------------------------ - -\begin{code} -data LayoutContext - = NoLayout - | Layout Int# - -data ParseResult a - = POk PState a - | PFailed Message - -data PState = PState { - loc :: SrcLoc, - extsBitmap :: Int#, -- bitmap that determines permitted extensions - bol :: Int#, - atbol :: Int#, - context :: [LayoutContext] - } - -type P a = StringBuffer -- Input string - -> PState - -> ParseResult a - -returnP :: a -> P a -returnP a buf s = POk s a - -thenP :: P a -> (a -> P b) -> P b -m `thenP` k = \ buf s -> - case m buf s of - POk s1 a -> k a buf s1 - PFailed err -> PFailed err - -thenP_ :: P a -> P b -> P b -m `thenP_` k = m `thenP` \_ -> k - -mapP :: (a -> P b) -> [a] -> P [b] -mapP f [] = returnP [] -mapP f (a:as) = - f a `thenP` \b -> - mapP f as `thenP` \bs -> - returnP (b:bs) - -failP :: String -> P a -failP msg buf s = PFailed (text msg) - -failMsgP :: Message -> P a -failMsgP msg buf s = PFailed msg - -lexError :: String -> P a -lexError str buf s@PState{ loc = loc } - = failMsgP (hcat [ppr loc, text ": ", text str]) buf s - -getSrcLocP :: P SrcLoc -getSrcLocP buf s@(PState{ loc = loc }) = POk s loc - --- use a temporary SrcLoc for the duration of the argument -setSrcLocP :: SrcLoc -> P a -> P a -setSrcLocP new_loc p buf s = - case p buf s{ loc=new_loc } of - POk _ a -> POk s a - PFailed e -> PFailed e - -getSrcFile :: P FastString -getSrcFile buf s@(PState{ loc = loc }) = POk s (srcLocFile loc) - -pushContext :: LayoutContext -> P () -pushContext ctxt buf s@(PState{ context = ctx }) = POk s{context = ctxt:ctx} () - -{- - -This special case in layoutOn is to handle layout contexts with are -indented the same or less than the current context. This is illegal -according to the Haskell spec, so we have to arrange to close the -current context. eg. - -class Foo a where -class Bar a - -after the first 'where', the sequence of events is: - - - layout system inserts a ';' (column 0) - - parser begins a new context at column 0 - - parser shifts ';' (legal empty declaration) - - parser sees 'class': parse error (we're still in the inner context) - -trouble is, by the time we know we need a new context, the lexer has -already generated the ';'. Hacky solution is as follows: since we -know the column of the next token (it's the column number of the new -context), we set the ACTUAL column number of the new context to this -numer plus one. Hence the next time the lexer is called, a '}' will -be generated to close the new context straight away. Furthermore, we -have to set the atbol flag so that the ';' that the parser shifted as -part of the new context is re-generated. - -when the new context is *less* indented than the current one: - -f = f where g = g where -h = h - - - current context: column 12. - - on seeing 'h' (column 0), the layout system inserts '}' - - parser starts a new context, column 0 - - parser sees '}', uses it to close new context - - we still need to insert another '}' followed by a ';', - hence the atbol trick. - -There's also a special hack in here to deal with - - do - .... - e $ do - blah - -i.e. the inner context is at the same indentation level as the outer -context. This is strictly illegal according to Haskell 98, but -there's a lot of existing code using this style and it doesn't make -any sense to disallow it, since empty 'do' lists don't make sense. --} - -layoutOn :: Bool -> P () -layoutOn strict buf s@(PState{ bol = bol, context = ctx }) = - let offset = lexemeIndex buf -# bol in - case ctx of - Layout prev_off : _ - | if strict then prev_off >=# offset else prev_off ># offset -> - --trace ("layout on, column: " ++ show (I# offset)) $ - POk s{ context = Layout (offset +# 1#) : ctx, atbol = 1# } () - other -> - --trace ("layout on, column: " ++ show (I# offset)) $ - POk s{ context = Layout offset : ctx } () - -layoutOff :: P () -layoutOff buf s@(PState{ context = ctx }) = - POk s{ context = NoLayout:ctx } () - -popContext :: P () -popContext = \ buf s@(PState{ context = ctx, loc = loc }) -> - case ctx of - (_:tl) -> POk s{ context = tl } () - [] -> PFailed (srcParseErr buf loc) - --- for reasons of efficiency, flags indicating language extensions (eg, --- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed --- integer - -glaExtsBit, ffiBit, parrBit :: Int -glaExtsBit = 0 -ffiBit = 1 -parrBit = 2 -withBit = 3 -arrowsBit = 4 - -glaExtsEnabled, ffiEnabled, parrEnabled :: Int# -> Bool -glaExtsEnabled flags = testBit (toInt32 flags) glaExtsBit -ffiEnabled flags = testBit (toInt32 flags) ffiBit -withEnabled flags = testBit (toInt32 flags) withBit -parrEnabled flags = testBit (toInt32 flags) parrBit -arrowsEnabled flags = testBit (toInt32 flags) arrowsBit - -toInt32 :: Int# -> Int32 -toInt32 x# = fromIntegral (I# x#) - --- convenient record-based bitmap for the interface to the rest of the world --- --- NB: `glasgowExtsEF' implies `ffiEF' (see `mkPState' below) --- -data ExtFlags = ExtFlags { - glasgowExtsEF :: Bool, - ffiEF :: Bool, - withEF :: Bool, - parrEF :: Bool, - arrowsEF :: Bool - } - --- create a parse state --- -mkPState :: SrcLoc -> ExtFlags -> PState -mkPState loc exts = - PState { - loc = loc, - extsBitmap = case (fromIntegral bitmap) of {I# bits -> bits}, - bol = 0#, - atbol = 1#, - context = [] - } - where - bitmap = glaExtsBit `setBitIf` glasgowExtsEF exts - .|. ffiBit `setBitIf` (ffiEF exts - || glasgowExtsEF exts) - .|. withBit `setBitIf` withEF exts - .|. parrBit `setBitIf` parrEF exts - .|. arrowsBit `setBitIf` arrowsEF exts - -- - setBitIf :: Int -> Bool -> Int32 - b `setBitIf` cond | cond = bit b - | otherwise = 0 - ------------------------------------------------------------------------------ - -srcParseErr :: StringBuffer -> SrcLoc -> Message -srcParseErr s l - = hcat [ppr l, - if null token - then ptext SLIT(": parse error (possibly incorrect indentation)") - else hcat [ptext SLIT(": parse error on input "), - char '`', text token, char '\''] - ] - where - token = lexemeToString s - -\end{code} -- 1.7.10.4