compiler/parser/Lexer.x

   1 -----------------------------------------------------------------------------
   2 -- (c) The University of Glasgow, 2006
   3 --
   4 -- GHC's lexer.
   5 --
   6 -- This is a combination of an Alex-generated lexer from a regex
   7 -- definition, with some hand-coded bits.
   8 --
   9 -- Completely accurate information about token-spans within the source
  10 -- file is maintained.  Every token has a start and end SrcLoc attached to it.
  11 --
  12 -----------------------------------------------------------------------------
  13
  14 --   ToDo / known bugs:
  15 --    - Unicode
  16 --    - parsing integers is a bit slow
  17 --    - readRational is a bit slow
  18 --
  19 --   Known bugs, that were also in the previous version:
  20 --    - M... should be 3 tokens, not 1.
  21 --    - pragma-end should be only valid in a pragma
  22
  23 {
  24 module Lexer (
  25    Token(..), lexer, pragState, mkPState, PState(..),
  26    P(..), ParseResult(..), getSrcLoc,
  27    failLocMsgP, failSpanMsgP, srcParseFail,
  28    popContext, pushCurrentContext, setLastToken, setSrcLoc,
  29    getLexState, popLexState, pushLexState,
  30    extension, bangPatEnabled
  31   ) where
  32
  33 #include "HsVersions.h"
  34
  35 import ErrUtils         ( Message )
  36 import Outputable
  37 import StringBuffer
  38 import FastString
  39 import FastTypes
  40 import SrcLoc
  41 import UniqFM
  42 import DynFlags
  43 import Ctype
  44 import Util             ( maybePrefixMatch, readRational )
  45
  46 import DATA_BITS
  47 import Data.Char        ( chr )
  48 import Ratio
  49 --import TRACE
  50
  51 #if __GLASGOW_HASKELL__ >= 605
  52 import Data.Char        ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  53 #else
  54 import Compat.Unicode   ( GeneralCategory(..), generalCategory, isPrint, isUpper )
  55 #endif
  56 }
  57
  58 $unispace    = \x05
  59 $whitechar   = [\ \t\n\r\f\v\xa0 $unispace]
  60 $white_no_nl = $whitechar # \n
  61
  62 $ascdigit  = 0-9
  63 $unidigit  = \x03
  64 $decdigit  = $ascdigit -- for now, should really be $digit (ToDo)
  65 $digit     = [$ascdigit $unidigit]
  66
  67 $special   = [\(\)\,\;\[\]\`\{\}]
  68 $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~ \xa1-\xbf \xd7 \xf7]
  69 $unisymbol = \x04
  70 $symbol    = [$ascsymbol $unisymbol] # [$special \_\:\"\']
  71
  72 $unilarge  = \x01
  73 $asclarge  = [A-Z \xc0-\xd6 \xd8-\xde]
  74 $large     = [$asclarge $unilarge]
  75
  76 $unismall  = \x02
  77 $ascsmall  = [a-z \xdf-\xf6 \xf8-\xff]
  78 $small     = [$ascsmall $unismall \_]
  79
  80 $unigraphic = \x06
  81 $graphic   = [$small $large $symbol $digit $special $unigraphic \:\"\']
  82
  83 $octit     = 0-7
  84 $hexit     = [$decdigit A-F a-f]
  85 $symchar   = [$symbol \:]
  86 $nl        = [\n\r]
  87 $idchar    = [$small $large $digit \']
  88
  89 @varid     = $small $idchar*
  90 @conid     = $large $idchar*
  91
  92 @varsym    = $symbol $symchar*
  93 @consym    = \: $symchar*
  94
  95 @decimal     = $decdigit+
  96 @octal       = $octit+
  97 @hexadecimal = $hexit+
  98 @exponent    = [eE] [\-\+]? @decimal
  99
 100 -- we support the hierarchical module name extension:
 101 @qual = (@conid \.)+
 102
 103 @floating_point = @decimal \. @decimal @exponent? | @decimal @exponent
 104
 105 haskell :-
 106
 107 -- everywhere: skip whitespace and comments
 108 $white_no_nl+                           ;
 109
 110 -- Everywhere: deal with nested comments.  We explicitly rule out
 111 -- pragmas, "{-#", so that we don't accidentally treat them as comments.
 112 -- (this can happen even though pragmas will normally take precedence due to
 113 -- longest-match, because pragmas aren't valid in every state, but comments
 114 -- are).
 115 "{-" / { notFollowedBy '#' }            { nested_comment }
 116
 117 -- Single-line comments are a bit tricky.  Haskell 98 says that two or
 118 -- more dashes followed by a symbol should be parsed as a varsym, so we
 119 -- have to exclude those.
 120 -- The regex says: "munch all the characters after the dashes, as long as
 121 -- the first one is not a symbol".
 122 "--"\-* [^$symbol :] .*                 ;
 123 "--"\-* / { atEOL }                     ;
 124
 125 -- 'bol' state: beginning of a line.  Slurp up all the whitespace (including
 126 -- blank lines) until we find a non-whitespace character, then do layout
 127 -- processing.
 128 --
 129 -- One slight wibble here: what if the line begins with {-#? In
 130 -- theory, we have to lex the pragma to see if it's one we recognise,
 131 -- and if it is, then we backtrack and do_bol, otherwise we treat it
 132 -- as a nested comment.  We don't bother with this: if the line begins
 133 -- with {-#, then we'll assume it's a pragma we know about and go for do_bol.
 134 <bol> {
 135   \n                                    ;
 136   ^\# (line)?                           { begin line_prag1 }
 137   ^\# pragma .* \n                      ; -- GCC 3.3 CPP generated, apparently
 138   ^\# \! .* \n                          ; -- #!, for scripts
 139   ()                                    { do_bol }
 140 }
 141
 142 -- after a layout keyword (let, where, do, of), we begin a new layout
 143 -- context if the curly brace is missing.
 144 -- Careful! This stuff is quite delicate.
 145 <layout, layout_do> {
 146   \{ / { notFollowedBy '-' }            { pop_and open_brace }
 147         -- we might encounter {-# here, but {- has been handled already
 148   \n                                    ;
 149   ^\# (line)?                           { begin line_prag1 }
 150 }
 151
 152 -- do is treated in a subtly different way, see new_layout_context
 153 <layout>    ()                          { new_layout_context True }
 154 <layout_do> ()                          { new_layout_context False }
 155
 156 -- after a new layout context which was found to be to the left of the
 157 -- previous context, we have generated a '{' token, and we now need to
 158 -- generate a matching '}' token.
 159 <layout_left>  ()                       { do_layout_left }
 160
 161 <0,option_prags,glaexts> \n                             { begin bol }
 162
 163 "{-#" $whitechar* (line|LINE)           { begin line_prag2 }
 164
 165 -- single-line line pragmas, of the form
 166 --    # <line> "<file>" <extra-stuff> \n
 167 <line_prag1> $decdigit+                 { setLine line_prag1a }
 168 <line_prag1a> \" [$graphic \ ]* \"      { setFile line_prag1b }
 169 <line_prag1b> .*                        { pop }
 170
 171 -- Haskell-style line pragmas, of the form
 172 --    {-# LINE <line> "<file>" #-}
 173 <line_prag2> $decdigit+                 { setLine line_prag2a }
 174 <line_prag2a> \" [$graphic \ ]* \"      { setFile line_prag2b }
 175 <line_prag2b> "#-}"|"-}"                { pop }
 176    -- NOTE: accept -} at the end of a LINE pragma, for compatibility
 177    -- with older versions of GHC which generated these.
 178
 179 -- We only want RULES pragmas to be picked up when -fglasgow-exts
 180 -- is on, because the contents of the pragma is always written using
 181 -- glasgow-exts syntax (using forall etc.), so if glasgow exts are not
 182 -- enabled, we're sure to get a parse error.
 183 -- (ToDo: we should really emit a warning when ignoring pragmas)
 184 <glaexts>
 185   "{-#" $whitechar* (RULES|rules)       { token ITrules_prag }
 186
 187 <0,option_prags,glaexts> {
 188   "{-#" $whitechar* (INLINE|inline)     { token (ITinline_prag True) }
 189   "{-#" $whitechar* (NO(T?)INLINE|no(t?)inline)
 190                                         { token (ITinline_prag False) }
 191   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 192                                         { token ITspec_prag }
 193   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 194         $whitechar* (INLINE|inline)     { token (ITspec_inline_prag True) }
 195   "{-#" $whitechar* (SPECIALI[SZ]E|speciali[sz]e)
 196         $whitechar* (NO(T?)INLINE|no(t?)inline)
 197                                         { token (ITspec_inline_prag False) }
 198   "{-#" $whitechar* (SOURCE|source)     { token ITsource_prag }
 199   "{-#" $whitechar* (DEPRECATED|deprecated)
 200                                         { token ITdeprecated_prag }
 201   "{-#" $whitechar* (SCC|scc)           { token ITscc_prag }
 202   "{-#" $whitechar* (CORE|core)         { token ITcore_prag }
 203   "{-#" $whitechar* (UNPACK|unpack)     { token ITunpack_prag }
 204
 205   "{-#"                                 { nested_comment }
 206
 207   -- ToDo: should only be valid inside a pragma:
 208   "#-}"                                 { token ITclose_prag}
 209 }
 210
 211 <option_prags> {
 212   "{-#" $whitechar* (OPTIONS|options)   { lex_string_prag IToptions_prag }
 213   "{-#" $whitechar* (OPTIONS_GHC|options_ghc)
 214                                         { lex_string_prag IToptions_prag }
 215   "{-#" $whitechar* (LANGUAGE|language) { token ITlanguage_prag }
 216   "{-#" $whitechar* (INCLUDE|include)   { lex_string_prag ITinclude_prag }
 217 }
 218
 219 <0,option_prags,glaexts> {
 220         -- This is to catch things like {-# OPTIONS OPTIONS_HUGS ...
 221   "{-#" $whitechar* $idchar+            { nested_comment }
 222 }
 223
 224 -- '0' state: ordinary lexemes
 225 -- 'glaexts' state: glasgow extensions (postfix '#', etc.)
 226
 227 -- "special" symbols
 228
 229 <0,glaexts> {
 230   "[:" / { ifExtension parrEnabled }    { token ITopabrack }
 231   ":]" / { ifExtension parrEnabled }    { token ITcpabrack }
 232 }
 233
 234 <0,glaexts> {
 235   "[|"      / { ifExtension thEnabled } { token ITopenExpQuote }
 236   "[e|"     / { ifExtension thEnabled } { token ITopenExpQuote }
 237   "[p|"     / { ifExtension thEnabled } { token ITopenPatQuote }
 238   "[d|"     / { ifExtension thEnabled } { layout_token ITopenDecQuote }
 239   "[t|"     / { ifExtension thEnabled } { token ITopenTypQuote }
 240   "|]"      / { ifExtension thEnabled } { token ITcloseQuote }
 241   \$ @varid / { ifExtension thEnabled } { skip_one_varid ITidEscape }
 242   "$("      / { ifExtension thEnabled } { token ITparenEscape }
 243 }
 244
 245 <0,glaexts> {
 246   "(|" / { ifExtension arrowsEnabled `alexAndPred` notFollowedBySymbol }
 247                                         { special IToparenbar }
 248   "|)" / { ifExtension arrowsEnabled }  { special ITcparenbar }
 249 }
 250
 251 <0,glaexts> {
 252   \? @varid / { ifExtension ipEnabled } { skip_one_varid ITdupipvarid }
 253 }
 254
 255 <glaexts> {
 256   "(#" / { notFollowedBySymbol }        { token IToubxparen }
 257   "#)"                                  { token ITcubxparen }
 258   "{|"                                  { token ITocurlybar }
 259   "|}"                                  { token ITccurlybar }
 260 }
 261
 262 <0,option_prags,glaexts> {
 263   \(                                    { special IToparen }
 264   \)                                    { special ITcparen }
 265   \[                                    { special ITobrack }
 266   \]                                    { special ITcbrack }
 267   \,                                    { special ITcomma }
 268   \;                                    { special ITsemi }
 269   \`                                    { special ITbackquote }
 270
 271   \{                                    { open_brace }
 272   \}                                    { close_brace }
 273 }
 274
 275 <0,option_prags,glaexts> {
 276   @qual @varid                  { check_qvarid }
 277   @qual @conid                  { idtoken qconid }
 278   @varid                        { varid }
 279   @conid                        { idtoken conid }
 280 }
 281
 282 -- after an illegal qvarid, such as 'M.let',
 283 -- we back up and try again in the bad_qvarid state:
 284 <bad_qvarid> {
 285   @conid                        { pop_and (idtoken conid) }
 286   @qual @conid                  { pop_and (idtoken qconid) }
 287 }
 288
 289 <glaexts> {
 290   @qual @varid "#"+             { idtoken qvarid }
 291   @qual @conid "#"+             { idtoken qconid }
 292   @varid "#"+                   { varid }
 293   @conid "#"+                   { idtoken conid }
 294 }
 295
 296 -- ToDo: M.(,,,)
 297
 298 <0,glaexts> {
 299   @qual @varsym                 { idtoken qvarsym }
 300   @qual @consym                 { idtoken qconsym }
 301   @varsym                       { varsym }
 302   @consym                       { consym }
 303 }
 304
 305 <0,glaexts> {
 306   @decimal                      { tok_decimal }
 307   0[oO] @octal                  { tok_octal }
 308   0[xX] @hexadecimal            { tok_hexadecimal }
 309 }
 310
 311 <glaexts> {
 312   @decimal \#                   { prim_decimal }
 313   0[oO] @octal \#               { prim_octal }
 314   0[xX] @hexadecimal \#         { prim_hexadecimal }
 315 }
 316
 317 <0,glaexts> @floating_point             { strtoken tok_float }
 318 <glaexts>   @floating_point \#          { init_strtoken 1 prim_float }
 319 <glaexts>   @floating_point \# \#       { init_strtoken 2 prim_double }
 320
 321 -- Strings and chars are lexed by hand-written code.  The reason is
 322 -- that even if we recognise the string or char here in the regex
 323 -- lexer, we would still have to parse the string afterward in order
 324 -- to convert it to a String.
 325 <0,glaexts> {
 326   \'                            { lex_char_tok }
 327   \"                            { lex_string_tok }
 328 }
 329
 330 {
 331 -- work around bug in Alex 2.0
 332 #if __GLASGOW_HASKELL__ < 503
 333 unsafeAt arr i = arr ! i
 334 #endif
 335
 336 -- -----------------------------------------------------------------------------
 337 -- The token type
 338
 339 data Token
 340   = ITas                        -- Haskell keywords
 341   | ITcase
 342   | ITclass
 343   | ITdata
 344   | ITdefault
 345   | ITderiving
 346   | ITdo
 347   | ITelse
 348   | IThiding
 349   | ITif
 350   | ITimport
 351   | ITin
 352   | ITinfix
 353   | ITinfixl
 354   | ITinfixr
 355   | ITinstance
 356   | ITlet
 357   | ITmodule
 358   | ITnewtype
 359   | ITof
 360   | ITqualified
 361   | ITthen
 362   | ITtype
 363   | ITwhere
 364   | ITscc                       -- ToDo: remove (we use {-# SCC "..." #-} now)
 365
 366   | ITforall                    -- GHC extension keywords
 367   | ITforeign
 368   | ITexport
 369   | ITlabel
 370   | ITdynamic
 371   | ITsafe
 372   | ITthreadsafe
 373   | ITunsafe
 374   | ITstdcallconv
 375   | ITccallconv
 376   | ITdotnet
 377   | ITmdo
 378   | ITiso
 379   | ITfamily
 380
 381         -- Pragmas
 382   | ITinline_prag Bool          -- True <=> INLINE, False <=> NOINLINE
 383   | ITspec_prag                 -- SPECIALISE
 384   | ITspec_inline_prag Bool     -- SPECIALISE INLINE (or NOINLINE)
 385   | ITsource_prag
 386   | ITrules_prag
 387   | ITdeprecated_prag
 388   | ITline_prag
 389   | ITscc_prag
 390   | ITcore_prag                 -- hdaume: core annotations
 391   | ITunpack_prag
 392   | ITclose_prag
 393   | IToptions_prag String
 394   | ITinclude_prag String
 395   | ITlanguage_prag
 396
 397   | ITdotdot                    -- reserved symbols
 398   | ITcolon
 399   | ITdcolon
 400   | ITequal
 401   | ITlam
 402   | ITvbar
 403   | ITlarrow
 404   | ITrarrow
 405   | ITat
 406   | ITtilde
 407   | ITdarrow
 408   | ITminus
 409   | ITbang
 410   | ITstar
 411   | ITdot
 412
 413   | ITbiglam                    -- GHC-extension symbols
 414
 415   | ITocurly                    -- special symbols
 416   | ITccurly
 417   | ITocurlybar                 -- {|, for type applications
 418   | ITccurlybar                 -- |}, for type applications
 419   | ITvocurly
 420   | ITvccurly
 421   | ITobrack
 422   | ITopabrack                  -- [:, for parallel arrays with -fparr
 423   | ITcpabrack                  -- :], for parallel arrays with -fparr
 424   | ITcbrack
 425   | IToparen
 426   | ITcparen
 427   | IToubxparen
 428   | ITcubxparen
 429   | ITsemi
 430   | ITcomma
 431   | ITunderscore
 432   | ITbackquote
 433
 434   | ITvarid   FastString        -- identifiers
 435   | ITconid   FastString
 436   | ITvarsym  FastString
 437   | ITconsym  FastString
 438   | ITqvarid  (FastString,FastString)
 439   | ITqconid  (FastString,FastString)
 440   | ITqvarsym (FastString,FastString)
 441   | ITqconsym (FastString,FastString)
 442
 443   | ITdupipvarid   FastString   -- GHC extension: implicit param: ?x
 444
 445   | ITpragma StringBuffer
 446
 447   | ITchar       Char
 448   | ITstring     FastString
 449   | ITinteger    Integer
 450   | ITrational   Rational
 451
 452   | ITprimchar   Char
 453   | ITprimstring FastString
 454   | ITprimint    Integer
 455   | ITprimfloat  Rational
 456   | ITprimdouble Rational
 457
 458   -- MetaHaskell extension tokens
 459   | ITopenExpQuote              --  [| or [e|
 460   | ITopenPatQuote              --  [p|
 461   | ITopenDecQuote              --  [d|
 462   | ITopenTypQuote              --  [t|
 463   | ITcloseQuote                --  |]
 464   | ITidEscape   FastString     --  $x
 465   | ITparenEscape               --  $(
 466   | ITvarQuote                  --  '
 467   | ITtyQuote                   --  ''
 468
 469   -- Arrow notation extension
 470   | ITproc
 471   | ITrec
 472   | IToparenbar                 --  (|
 473   | ITcparenbar                 --  |)
 474   | ITlarrowtail                --  -<
 475   | ITrarrowtail                --  >-
 476   | ITLarrowtail                --  -<<
 477   | ITRarrowtail                --  >>-
 478
 479   | ITunknown String            -- Used when the lexer can't make sense of it
 480   | ITeof                       -- end of file token
 481 #ifdef DEBUG
 482   deriving Show -- debugging
 483 #endif
 484
 485 isSpecial :: Token -> Bool
 486 -- If we see M.x, where x is a keyword, but
 487 -- is special, we treat is as just plain M.x,
 488 -- not as a keyword.
 489 isSpecial ITas          = True
 490 isSpecial IThiding      = True
 491 isSpecial ITqualified   = True
 492 isSpecial ITforall      = True
 493 isSpecial ITexport      = True
 494 isSpecial ITlabel       = True
 495 isSpecial ITdynamic     = True
 496 isSpecial ITsafe        = True
 497 isSpecial ITthreadsafe  = True
 498 isSpecial ITunsafe      = True
 499 isSpecial ITccallconv   = True
 500 isSpecial ITstdcallconv = True
 501 isSpecial ITmdo         = True
 502 isSpecial ITiso         = True
 503 isSpecial ITfamily      = True
 504 isSpecial _             = False
 505
 506 -- the bitmap provided as the third component indicates whether the
 507 -- corresponding extension keyword is valid under the extension options
 508 -- provided to the compiler; if the extension corresponding to *any* of the
 509 -- bits set in the bitmap is enabled, the keyword is valid (this setup
 510 -- facilitates using a keyword in two different extensions that can be
 511 -- activated independently)
 512 --
 513 reservedWordsFM = listToUFM $
 514         map (\(x, y, z) -> (mkFastString x, (y, z)))
 515        [( "_",          ITunderscore,   0 ),
 516         ( "as",         ITas,           0 ),
 517         ( "case",       ITcase,         0 ),
 518         ( "class",      ITclass,        0 ),
 519         ( "data",       ITdata,         0 ),
 520         ( "default",    ITdefault,      0 ),
 521         ( "deriving",   ITderiving,     0 ),
 522         ( "do",         ITdo,           0 ),
 523         ( "else",       ITelse,         0 ),
 524         ( "hiding",     IThiding,       0 ),
 525         ( "if",         ITif,           0 ),
 526         ( "import",     ITimport,       0 ),
 527         ( "in",         ITin,           0 ),
 528         ( "infix",      ITinfix,        0 ),
 529         ( "infixl",     ITinfixl,       0 ),
 530         ( "infixr",     ITinfixr,       0 ),
 531         ( "instance",   ITinstance,     0 ),
 532         ( "let",        ITlet,          0 ),
 533         ( "module",     ITmodule,       0 ),
 534         ( "newtype",    ITnewtype,      0 ),
 535         ( "of",         ITof,           0 ),
 536         ( "qualified",  ITqualified,    0 ),
 537         ( "then",       ITthen,         0 ),
 538         ( "type",       ITtype,         0 ),
 539         ( "where",      ITwhere,        0 ),
 540         ( "_scc_",      ITscc,          0 ),            -- ToDo: remove
 541
 542         ( "forall",     ITforall,        bit tvBit),
 543         ( "mdo",        ITmdo,           bit glaExtsBit),
 544         ( "family",     ITfamily,        bit idxTysBit),
 545
 546         ( "foreign",    ITforeign,       bit ffiBit),
 547         ( "export",     ITexport,        bit ffiBit),
 548         ( "label",      ITlabel,         bit ffiBit),
 549         ( "dynamic",    ITdynamic,       bit ffiBit),
 550         ( "safe",       ITsafe,          bit ffiBit),
 551         ( "threadsafe", ITthreadsafe,    bit ffiBit),
 552         ( "unsafe",     ITunsafe,        bit ffiBit),
 553         ( "stdcall",    ITstdcallconv,   bit ffiBit),
 554         ( "ccall",      ITccallconv,     bit ffiBit),
 555         ( "dotnet",     ITdotnet,        bit ffiBit),
 556
 557         ( "rec",        ITrec,           bit arrowsBit),
 558         ( "proc",       ITproc,          bit arrowsBit)
 559      ]
 560
 561 reservedSymsFM = listToUFM $
 562         map (\ (x,y,z) -> (mkFastString x,(y,z)))
 563       [ ("..",  ITdotdot,       0)
 564        ,(":",   ITcolon,        0)      -- (:) is a reserved op,
 565                                                 -- meaning only list cons
 566        ,("::",  ITdcolon,       0)
 567        ,("=",   ITequal,        0)
 568        ,("\\",  ITlam,          0)
 569        ,("|",   ITvbar,         0)
 570        ,("<-",  ITlarrow,       0)
 571        ,("->",  ITrarrow,       0)
 572        ,("@",   ITat,           0)
 573        ,("~",   ITtilde,        0)
 574        ,("=>",  ITdarrow,       0)
 575        ,("-",   ITminus,        0)
 576        ,("!",   ITbang,         0)
 577
 578        ,("*",   ITstar,         bit glaExtsBit .|.
 579                                 bit idxTysBit)      -- For data T (a::*) = MkT
 580        ,(".",   ITdot,          bit tvBit)          -- For 'forall a . t'
 581
 582        ,("-<",  ITlarrowtail,   bit arrowsBit)
 583        ,(">-",  ITrarrowtail,   bit arrowsBit)
 584        ,("-<<", ITLarrowtail,   bit arrowsBit)
 585        ,(">>-", ITRarrowtail,   bit arrowsBit)
 586
 587 #if __GLASGOW_HASKELL__ >= 605
 588        ,("λ",  ITlam,          bit glaExtsBit)
 589        ,("∷",   ITdcolon,       bit glaExtsBit)
 590        ,("⇒",   ITdarrow,     bit glaExtsBit)
 591        ,("∀", ITforall,       bit glaExtsBit)
 592        ,("→",   ITrarrow,     bit glaExtsBit)
 593        ,("←",   ITlarrow,     bit glaExtsBit)
 594        ,("⋯",         ITdotdot,       bit glaExtsBit)
 595         -- ToDo: ideally, → and ∷ should be "specials", so that they cannot
 596         -- form part of a large operator.  This would let us have a better
 597         -- syntax for kinds: ɑ∷*→* would be a legal kind signature. (maybe).
 598 #endif
 599        ]
 600
 601 -- -----------------------------------------------------------------------------
 602 -- Lexer actions
 603
 604 type Action = SrcSpan -> StringBuffer -> Int -> P (Located Token)
 605
 606 special :: Token -> Action
 607 special tok span _buf len = return (L span tok)
 608
 609 token, layout_token :: Token -> Action
 610 token t span buf len = return (L span t)
 611 layout_token t span buf len = pushLexState layout >> return (L span t)
 612
 613 idtoken :: (StringBuffer -> Int -> Token) -> Action
 614 idtoken f span buf len = return (L span $! (f buf len))
 615
 616 skip_one_varid :: (FastString -> Token) -> Action
 617 skip_one_varid f span buf len
 618   = return (L span $! f (lexemeToFastString (stepOn buf) (len-1)))
 619
 620 strtoken :: (String -> Token) -> Action
 621 strtoken f span buf len =
 622   return (L span $! (f $! lexemeToString buf len))
 623
 624 init_strtoken :: Int -> (String -> Token) -> Action
 625 -- like strtoken, but drops the last N character(s)
 626 init_strtoken drop f span buf len =
 627   return (L span $! (f $! lexemeToString buf (len-drop)))
 628
 629 begin :: Int -> Action
 630 begin code _span _str _len = do pushLexState code; lexToken
 631
 632 pop :: Action
 633 pop _span _buf _len = do popLexState; lexToken
 634
 635 pop_and :: Action -> Action
 636 pop_and act span buf len = do popLexState; act span buf len
 637
 638 notFollowedBy char _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf /= char
 639
 640 notFollowedBySymbol _ _ _ (AI _ _ buf)
 641   = atEnd buf || currentChar buf `notElem` "!#$%&*+./<=>?@\\^|-~"
 642
 643 atEOL _ _ _ (AI _ _ buf) = atEnd buf || currentChar buf == '\n'
 644
 645 ifExtension pred bits _ _ _ = pred bits
 646
 647 {-
 648   nested comments require traversing by hand, they can't be parsed
 649   using regular expressions.
 650 -}
 651 nested_comment :: Action
 652 nested_comment span _str _len = do
 653   input <- getInput
 654   go 1 input
 655   where go 0 input = do setInput input; lexToken
 656         go n input = do
 657           case alexGetChar input of
 658             Nothing  -> err input
 659             Just (c,input) -> do
 660               case c of
 661                 '-' -> do
 662                   case alexGetChar input of
 663                     Nothing  -> err input
 664                     Just ('\125',input) -> go (n-1) input
 665                     Just (c,_)          -> go n input
 666                 '\123' -> do
 667                   case alexGetChar input of
 668                     Nothing  -> err input
 669                     Just ('-',input') -> go (n+1) input'
 670                     Just (c,input)    -> go n input
 671                 c -> go n input
 672
 673         err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated `{-'"
 674
 675 open_brace, close_brace :: Action
 676 open_brace span _str _len = do
 677   ctx <- getContext
 678   setContext (NoLayout:ctx)
 679   return (L span ITocurly)
 680 close_brace span _str _len = do
 681   popContext
 682   return (L span ITccurly)
 683
 684 -- We have to be careful not to count M.<varid> as a qualified name
 685 -- when <varid> is a keyword.  We hack around this by catching
 686 -- the offending tokens afterward, and re-lexing in a different state.
 687 check_qvarid span buf len = do
 688   case lookupUFM reservedWordsFM var of
 689         Just (keyword,exts)
 690           | not (isSpecial keyword) ->
 691           if exts == 0
 692              then try_again
 693              else do
 694                 b <- extension (\i -> exts .&. i /= 0)
 695                 if b then try_again
 696                      else return token
 697         _other -> return token
 698   where
 699         (mod,var) = splitQualName buf len
 700         token     = L span (ITqvarid (mod,var))
 701
 702         try_again = do
 703                 (AI _ offs _) <- getInput
 704                 setInput (AI (srcSpanStart span) (offs-len) buf)
 705                 pushLexState bad_qvarid
 706                 lexToken
 707
 708 qvarid buf len = ITqvarid $! splitQualName buf len
 709 qconid buf len = ITqconid $! splitQualName buf len
 710
 711 splitQualName :: StringBuffer -> Int -> (FastString,FastString)
 712 -- takes a StringBuffer and a length, and returns the module name
 713 -- and identifier parts of a qualified name.  Splits at the *last* dot,
 714 -- because of hierarchical module names.
 715 splitQualName orig_buf len = split orig_buf orig_buf
 716   where
 717     split buf dot_buf
 718         | orig_buf `byteDiff` buf >= len  = done dot_buf
 719         | c == '.'                        = found_dot buf'
 720         | otherwise                       = split buf' dot_buf
 721       where
 722        (c,buf') = nextChar buf
 723
 724     -- careful, we might get names like M....
 725     -- so, if the character after the dot is not upper-case, this is
 726     -- the end of the qualifier part.
 727     found_dot buf -- buf points after the '.'
 728         | isUpper c    = split buf' buf
 729         | otherwise    = done buf
 730       where
 731        (c,buf') = nextChar buf
 732
 733     done dot_buf =
 734         (lexemeToFastString orig_buf (qual_size - 1),
 735          lexemeToFastString dot_buf (len - qual_size))
 736       where
 737         qual_size = orig_buf `byteDiff` dot_buf
 738
 739 varid span buf len =
 740   case lookupUFM reservedWordsFM fs of
 741         Just (keyword,0)    -> do
 742                 maybe_layout keyword
 743                 return (L span keyword)
 744         Just (keyword,exts) -> do
 745                 b <- extension (\i -> exts .&. i /= 0)
 746                 if b then do maybe_layout keyword
 747                              return (L span keyword)
 748                      else return (L span (ITvarid fs))
 749         _other -> return (L span (ITvarid fs))
 750   where
 751         fs = lexemeToFastString buf len
 752
 753 conid buf len = ITconid fs
 754   where fs = lexemeToFastString buf len
 755
 756 qvarsym buf len = ITqvarsym $! splitQualName buf len
 757 qconsym buf len = ITqconsym $! splitQualName buf len
 758
 759 varsym = sym ITvarsym
 760 consym = sym ITconsym
 761
 762 sym con span buf len =
 763   case lookupUFM reservedSymsFM fs of
 764         Just (keyword,0)    -> return (L span keyword)
 765         Just (keyword,exts) -> do
 766                 b <- extension (\i -> exts .&. i /= 0)
 767                 if b then return (L span keyword)
 768                      else return (L span $! con fs)
 769         _other -> return (L span $! con fs)
 770   where
 771         fs = lexemeToFastString buf len
 772
 773 tok_decimal span buf len
 774   = return (L span (ITinteger  $! parseInteger buf len 10 octDecDigit))
 775
 776 tok_octal span buf len
 777   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 8 octDecDigit))
 778
 779 tok_hexadecimal span buf len
 780   = return (L span (ITinteger  $! parseInteger (offsetBytes 2 buf) (len-2) 16 hexDigit))
 781
 782 prim_decimal span buf len
 783   = return (L span (ITprimint  $! parseInteger buf (len-1) 10 octDecDigit))
 784
 785 prim_octal span buf len
 786   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 8 octDecDigit))
 787
 788 prim_hexadecimal span buf len
 789   = return (L span (ITprimint  $! parseInteger (offsetBytes 2 buf) (len-3) 16 hexDigit))
 790
 791 tok_float        str = ITrational   $! readRational str
 792 prim_float       str = ITprimfloat  $! readRational str
 793 prim_double      str = ITprimdouble $! readRational str
 794
 795 -- -----------------------------------------------------------------------------
 796 -- Layout processing
 797
 798 -- we're at the first token on a line, insert layout tokens if necessary
 799 do_bol :: Action
 800 do_bol span _str _len = do
 801         pos <- getOffside
 802         case pos of
 803             LT -> do
 804                 --trace "layout: inserting '}'" $ do
 805                 popContext
 806                 -- do NOT pop the lex state, we might have a ';' to insert
 807                 return (L span ITvccurly)
 808             EQ -> do
 809                 --trace "layout: inserting ';'" $ do
 810                 popLexState
 811                 return (L span ITsemi)
 812             GT -> do
 813                 popLexState
 814                 lexToken
 815
 816 -- certain keywords put us in the "layout" state, where we might
 817 -- add an opening curly brace.
 818 maybe_layout ITdo       = pushLexState layout_do
 819 maybe_layout ITmdo      = pushLexState layout_do
 820 maybe_layout ITof       = pushLexState layout
 821 maybe_layout ITlet      = pushLexState layout
 822 maybe_layout ITwhere    = pushLexState layout
 823 maybe_layout ITrec      = pushLexState layout
 824 maybe_layout _          = return ()
 825
 826 -- Pushing a new implicit layout context.  If the indentation of the
 827 -- next token is not greater than the previous layout context, then
 828 -- Haskell 98 says that the new layout context should be empty; that is
 829 -- the lexer must generate {}.
 830 --
 831 -- We are slightly more lenient than this: when the new context is started
 832 -- by a 'do', then we allow the new context to be at the same indentation as
 833 -- the previous context.  This is what the 'strict' argument is for.
 834 --
 835 new_layout_context strict span _buf _len = do
 836     popLexState
 837     (AI _ offset _) <- getInput
 838     ctx <- getContext
 839     case ctx of
 840         Layout prev_off : _  |
 841            (strict     && prev_off >= offset  ||
 842             not strict && prev_off > offset) -> do
 843                 -- token is indented to the left of the previous context.
 844                 -- we must generate a {} sequence now.
 845                 pushLexState layout_left
 846                 return (L span ITvocurly)
 847         other -> do
 848                 setContext (Layout offset : ctx)
 849                 return (L span ITvocurly)
 850
 851 do_layout_left span _buf _len = do
 852     popLexState
 853     pushLexState bol  -- we must be at the start of a line
 854     return (L span ITvccurly)
 855
 856 -- -----------------------------------------------------------------------------
 857 -- LINE pragmas
 858
 859 setLine :: Int -> Action
 860 setLine code span buf len = do
 861   let line = parseInteger buf len 10 octDecDigit
 862   setSrcLoc (mkSrcLoc (srcSpanFile span) (fromIntegral line - 1) 0)
 863         -- subtract one: the line number refers to the *following* line
 864   popLexState
 865   pushLexState code
 866   lexToken
 867
 868 setFile :: Int -> Action
 869 setFile code span buf len = do
 870   let file = lexemeToFastString (stepOn buf) (len-2)
 871   setSrcLoc (mkSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span))
 872   popLexState
 873   pushLexState code
 874   lexToken
 875
 876
 877 -- -----------------------------------------------------------------------------
 878 -- Options, includes and language pragmas.
 879
 880 lex_string_prag :: (String -> Token) -> Action
 881 lex_string_prag mkTok span buf len
 882     = do input <- getInput
 883          start <- getSrcLoc
 884          tok <- go [] input
 885          end <- getSrcLoc
 886          return (L (mkSrcSpan start end) tok)
 887     where go acc input
 888               = if isString input "#-}"
 889                    then do setInput input
 890                            return (mkTok (reverse acc))
 891                    else case alexGetChar input of
 892                           Just (c,i) -> go (c:acc) i
 893                           Nothing -> err input
 894           isString i [] = True
 895           isString i (x:xs)
 896               = case alexGetChar i of
 897                   Just (c,i') | c == x    -> isString i' xs
 898                   _other -> False
 899           err (AI end _ _) = failLocMsgP (srcSpanStart span) end "unterminated options pragma"
 900
 901
 902 -- -----------------------------------------------------------------------------
 903 -- Strings & Chars
 904
 905 -- This stuff is horrible.  I hates it.
 906
 907 lex_string_tok :: Action
 908 lex_string_tok span buf len = do
 909   tok <- lex_string ""
 910   end <- getSrcLoc
 911   return (L (mkSrcSpan (srcSpanStart span) end) tok)
 912
 913 lex_string :: String -> P Token
 914 lex_string s = do
 915   i <- getInput
 916   case alexGetChar' i of
 917     Nothing -> lit_error
 918
 919     Just ('"',i)  -> do
 920         setInput i
 921         glaexts <- extension glaExtsEnabled
 922         if glaexts
 923           then do
 924             i <- getInput
 925             case alexGetChar' i of
 926               Just ('#',i) -> do
 927                    setInput i
 928                    if any (> '\xFF') s
 929                     then failMsgP "primitive string literal must contain only characters <= \'\\xFF\'"
 930                     else let s' = mkZFastString (reverse s) in
 931                          return (ITprimstring s')
 932                         -- mkZFastString is a hack to avoid encoding the
 933                         -- string in UTF-8.  We just want the exact bytes.
 934               _other ->
 935                 return (ITstring (mkFastString (reverse s)))
 936           else
 937                 return (ITstring (mkFastString (reverse s)))
 938
 939     Just ('\\',i)
 940         | Just ('&',i) <- next -> do
 941                 setInput i; lex_string s
 942         | Just (c,i) <- next, is_space c -> do
 943                 setInput i; lex_stringgap s
 944         where next = alexGetChar' i
 945
 946     Just (c, i) -> do
 947         c' <- lex_char c i
 948         lex_string (c':s)
 949
 950 lex_stringgap s = do
 951   c <- getCharOrFail
 952   case c of
 953     '\\' -> lex_string s
 954     c | is_space c -> lex_stringgap s
 955     _other -> lit_error
 956
 957
 958 lex_char_tok :: Action
 959 -- Here we are basically parsing character literals, such as 'x' or '\n'
 960 -- but, when Template Haskell is on, we additionally spot
 961 -- 'x and ''T, returning ITvarQuote and ITtyQuote respectively,
 962 -- but WIHTOUT CONSUMING the x or T part  (the parser does that).
 963 -- So we have to do two characters of lookahead: when we see 'x we need to
 964 -- see if there's a trailing quote
 965 lex_char_tok span buf len = do  -- We've seen '
 966    i1 <- getInput       -- Look ahead to first character
 967    let loc = srcSpanStart span
 968    case alexGetChar' i1 of
 969         Nothing -> lit_error
 970
 971         Just ('\'', i2@(AI end2 _ _)) -> do     -- We've seen ''
 972                   th_exts <- extension thEnabled
 973                   if th_exts then do
 974                         setInput i2
 975                         return (L (mkSrcSpan loc end2)  ITtyQuote)
 976                    else lit_error
 977
 978         Just ('\\', i2@(AI end2 _ _)) -> do     -- We've seen 'backslash
 979                   setInput i2
 980                   lit_ch <- lex_escape
 981                   mc <- getCharOrFail   -- Trailing quote
 982                   if mc == '\'' then finish_char_tok loc lit_ch
 983                                 else do setInput i2; lit_error
 984
 985         Just (c, i2@(AI end2 _ _))
 986                 | not (isAny c) -> lit_error
 987                 | otherwise ->
 988
 989                 -- We've seen 'x, where x is a valid character
 990                 --  (i.e. not newline etc) but not a quote or backslash
 991            case alexGetChar' i2 of      -- Look ahead one more character
 992                 Nothing -> lit_error
 993                 Just ('\'', i3) -> do   -- We've seen 'x'
 994                         setInput i3
 995                         finish_char_tok loc c
 996                 _other -> do            -- We've seen 'x not followed by quote
 997                                         -- If TH is on, just parse the quote only
 998                         th_exts <- extension thEnabled
 999                         let (AI end _ _) = i1
1000                         if th_exts then return (L (mkSrcSpan loc end) ITvarQuote)
1001                                    else do setInput i2; lit_error
1002
1003 finish_char_tok :: SrcLoc -> Char -> P (Located Token)
1004 finish_char_tok loc ch  -- We've already seen the closing quote
1005                         -- Just need to check for trailing #
1006   = do  glaexts <- extension glaExtsEnabled
1007         i@(AI end _ _) <- getInput
1008         if glaexts then do
1009                 case alexGetChar' i of
1010                         Just ('#',i@(AI end _ _)) -> do
1011                                 setInput i
1012                                 return (L (mkSrcSpan loc end) (ITprimchar ch))
1013                         _other ->
1014                                 return (L (mkSrcSpan loc end) (ITchar ch))
1015                 else do
1016                    return (L (mkSrcSpan loc end) (ITchar ch))
1017
1018 lex_char :: Char -> AlexInput -> P Char
1019 lex_char c inp = do
1020   case c of
1021       '\\' -> do setInput inp; lex_escape
1022       c | isAny c -> do setInput inp; return c
1023       _other -> lit_error
1024
1025 isAny c | c > '\xff' = isPrint c
1026         | otherwise  = is_any c
1027
1028 lex_escape :: P Char
1029 lex_escape = do
1030   c <- getCharOrFail
1031   case c of
1032         'a'   -> return '\a'
1033         'b'   -> return '\b'
1034         'f'   -> return '\f'
1035         'n'   -> return '\n'
1036         'r'   -> return '\r'
1037         't'   -> return '\t'
1038         'v'   -> return '\v'
1039         '\\'  -> return '\\'
1040         '"'   -> return '\"'
1041         '\''  -> return '\''
1042         '^'   -> do c <- getCharOrFail
1043                     if c >= '@' && c <= '_'
1044                         then return (chr (ord c - ord '@'))
1045                         else lit_error
1046
1047         'x'   -> readNum is_hexdigit 16 hexDigit
1048         'o'   -> readNum is_octdigit  8 octDecDigit
1049         x | is_digit x -> readNum2 is_digit 10 octDecDigit (octDecDigit x)
1050
1051         c1 ->  do
1052            i <- getInput
1053            case alexGetChar' i of
1054             Nothing -> lit_error
1055             Just (c2,i2) ->
1056               case alexGetChar' i2 of
1057                 Nothing -> do setInput i2; lit_error
1058                 Just (c3,i3) ->
1059                    let str = [c1,c2,c3] in
1060                    case [ (c,rest) | (p,c) <- silly_escape_chars,
1061                                      Just rest <- [maybePrefixMatch p str] ] of
1062                           (escape_char,[]):_ -> do
1063                                 setInput i3
1064                                 return escape_char
1065                           (escape_char,_:_):_ -> do
1066                                 setInput i2
1067                                 return escape_char
1068                           [] -> lit_error
1069
1070 readNum :: (Char -> Bool) -> Int -> (Char -> Int) -> P Char
1071 readNum is_digit base conv = do
1072   i <- getInput
1073   c <- getCharOrFail
1074   if is_digit c
1075         then readNum2 is_digit base conv (conv c)
1076         else do setInput i; lit_error
1077
1078 readNum2 is_digit base conv i = do
1079   input <- getInput
1080   read i input
1081   where read i input = do
1082           case alexGetChar' input of
1083             Just (c,input') | is_digit c -> do
1084                 read (i*base + conv c) input'
1085             _other -> do
1086                 if i >= 0 && i <= 0x10FFFF
1087                    then do setInput input; return (chr i)
1088                    else lit_error
1089
1090 silly_escape_chars = [
1091         ("NUL", '\NUL'),
1092         ("SOH", '\SOH'),
1093         ("STX", '\STX'),
1094         ("ETX", '\ETX'),
1095         ("EOT", '\EOT'),
1096         ("ENQ", '\ENQ'),
1097         ("ACK", '\ACK'),
1098         ("BEL", '\BEL'),
1099         ("BS", '\BS'),
1100         ("HT", '\HT'),
1101         ("LF", '\LF'),
1102         ("VT", '\VT'),
1103         ("FF", '\FF'),
1104         ("CR", '\CR'),
1105         ("SO", '\SO'),
1106         ("SI", '\SI'),
1107         ("DLE", '\DLE'),
1108         ("DC1", '\DC1'),
1109         ("DC2", '\DC2'),
1110         ("DC3", '\DC3'),
1111         ("DC4", '\DC4'),
1112         ("NAK", '\NAK'),
1113         ("SYN", '\SYN'),
1114         ("ETB", '\ETB'),
1115         ("CAN", '\CAN'),
1116         ("EM", '\EM'),
1117         ("SUB", '\SUB'),
1118         ("ESC", '\ESC'),
1119         ("FS", '\FS'),
1120         ("GS", '\GS'),
1121         ("RS", '\RS'),
1122         ("US", '\US'),
1123         ("SP", '\SP'),
1124         ("DEL", '\DEL')
1125         ]
1126
1127 -- before calling lit_error, ensure that the current input is pointing to
1128 -- the position of the error in the buffer.  This is so that we can report
1129 -- a correct location to the user, but also so we can detect UTF-8 decoding
1130 -- errors if they occur.
1131 lit_error = lexError "lexical error in string/character literal"
1132
1133 getCharOrFail :: P Char
1134 getCharOrFail =  do
1135   i <- getInput
1136   case alexGetChar' i of
1137         Nothing -> lexError "unexpected end-of-file in string/character literal"
1138         Just (c,i)  -> do setInput i; return c
1139
1140 -- -----------------------------------------------------------------------------
1141 -- The Parse Monad
1142
1143 data LayoutContext
1144   = NoLayout
1145   | Layout !Int
1146
1147 data ParseResult a
1148   = POk PState a
1149   | PFailed
1150         SrcSpan         -- The start and end of the text span related to
1151                         -- the error.  Might be used in environments which can
1152                         -- show this span, e.g. by highlighting it.
1153         Message         -- The error message
1154
1155 data PState = PState {
1156         buffer     :: StringBuffer,
1157         last_loc   :: SrcSpan,  -- pos of previous token
1158         last_offs  :: !Int,     -- offset of the previous token from the
1159                                 -- beginning of  the current line.
1160                                 -- \t is equal to 8 spaces.
1161         last_len   :: !Int,     -- len of previous token
1162         loc        :: SrcLoc,   -- current loc (end of prev token + 1)
1163         extsBitmap :: !Int,     -- bitmap that determines permitted extensions
1164         context    :: [LayoutContext],
1165         lex_state  :: [Int]
1166      }
1167         -- last_loc and last_len are used when generating error messages,
1168         -- and in pushCurrentContext only.  Sigh, if only Happy passed the
1169         -- current token to happyError, we could at least get rid of last_len.
1170         -- Getting rid of last_loc would require finding another way to
1171         -- implement pushCurrentContext (which is only called from one place).
1172
1173 newtype P a = P { unP :: PState -> ParseResult a }
1174
1175 instance Monad P where
1176   return = returnP
1177   (>>=) = thenP
1178   fail = failP
1179
1180 returnP :: a -> P a
1181 returnP a = P $ \s -> POk s a
1182
1183 thenP :: P a -> (a -> P b) -> P b
1184 (P m) `thenP` k = P $ \ s ->
1185         case m s of
1186                 POk s1 a         -> (unP (k a)) s1
1187                 PFailed span err -> PFailed span err
1188
1189 failP :: String -> P a
1190 failP msg = P $ \s -> PFailed (last_loc s) (text msg)
1191
1192 failMsgP :: String -> P a
1193 failMsgP msg = P $ \s -> PFailed (last_loc s) (text msg)
1194
1195 failLocMsgP :: SrcLoc -> SrcLoc -> String -> P a
1196 failLocMsgP loc1 loc2 str = P $ \s -> PFailed (mkSrcSpan loc1 loc2) (text str)
1197
1198 failSpanMsgP :: SrcSpan -> String -> P a
1199 failSpanMsgP span msg = P $ \s -> PFailed span (text msg)
1200
1201 extension :: (Int -> Bool) -> P Bool
1202 extension p = P $ \s -> POk s (p $! extsBitmap s)
1203
1204 getExts :: P Int
1205 getExts = P $ \s -> POk s (extsBitmap s)
1206
1207 setSrcLoc :: SrcLoc -> P ()
1208 setSrcLoc new_loc = P $ \s -> POk s{loc=new_loc} ()
1209
1210 getSrcLoc :: P SrcLoc
1211 getSrcLoc = P $ \s@(PState{ loc=loc }) -> POk s loc
1212
1213 setLastToken :: SrcSpan -> Int -> P ()
1214 setLastToken loc len = P $ \s -> POk s{ last_loc=loc, last_len=len } ()
1215
1216 data AlexInput = AI SrcLoc {-#UNPACK#-}!Int StringBuffer
1217
1218 alexInputPrevChar :: AlexInput -> Char
1219 alexInputPrevChar (AI _ _ buf) = prevChar buf '\n'
1220
1221 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
1222 alexGetChar (AI loc ofs s)
1223   | atEnd s   = Nothing
1224   | otherwise = adj_c `seq` loc' `seq` ofs' `seq` s' `seq`
1225                 --trace (show (ord c)) $
1226                 Just (adj_c, (AI loc' ofs' s'))
1227   where (c,s') = nextChar s
1228         loc'   = advanceSrcLoc loc c
1229         ofs'   = advanceOffs c ofs
1230
1231         non_graphic     = '\x0'
1232         upper           = '\x1'
1233         lower           = '\x2'
1234         digit           = '\x3'
1235         symbol          = '\x4'
1236         space           = '\x5'
1237         other_graphic   = '\x6'
1238
1239         adj_c
1240           | c <= '\x06' = non_graphic
1241           | c <= '\xff' = c
1242           | otherwise =
1243                 case generalCategory c of
1244                   UppercaseLetter       -> upper
1245                   LowercaseLetter       -> lower
1246                   TitlecaseLetter       -> upper
1247                   ModifierLetter        -> other_graphic
1248                   OtherLetter           -> other_graphic
1249                   NonSpacingMark        -> other_graphic
1250                   SpacingCombiningMark  -> other_graphic
1251                   EnclosingMark         -> other_graphic
1252                   DecimalNumber         -> digit
1253                   LetterNumber          -> other_graphic
1254                   OtherNumber           -> other_graphic
1255                   ConnectorPunctuation  -> other_graphic
1256                   DashPunctuation       -> other_graphic
1257                   OpenPunctuation       -> other_graphic
1258                   ClosePunctuation      -> other_graphic
1259                   InitialQuote          -> other_graphic
1260                   FinalQuote            -> other_graphic
1261                   OtherPunctuation      -> other_graphic
1262                   MathSymbol            -> symbol
1263                   CurrencySymbol        -> symbol
1264                   ModifierSymbol        -> symbol
1265                   OtherSymbol           -> symbol
1266                   Space                 -> space
1267                   _other                -> non_graphic
1268
1269 -- This version does not squash unicode characters, it is used when
1270 -- lexing strings.
1271 alexGetChar' :: AlexInput -> Maybe (Char,AlexInput)
1272 alexGetChar' (AI loc ofs s)
1273   | atEnd s   = Nothing
1274   | otherwise = c `seq` loc' `seq` ofs' `seq` s' `seq`
1275                 --trace (show (ord c)) $
1276                 Just (c, (AI loc' ofs' s'))
1277   where (c,s') = nextChar s
1278         loc'   = advanceSrcLoc loc c
1279         ofs'   = advanceOffs c ofs
1280
1281 advanceOffs :: Char -> Int -> Int
1282 advanceOffs '\n' offs = 0
1283 advanceOffs '\t' offs = (offs `quot` 8 + 1) * 8
1284 advanceOffs _    offs = offs + 1
1285
1286 getInput :: P AlexInput
1287 getInput = P $ \s@PState{ loc=l, last_offs=o, buffer=b } -> POk s (AI l o b)
1288
1289 setInput :: AlexInput -> P ()
1290 setInput (AI l o b) = P $ \s -> POk s{ loc=l, last_offs=o, buffer=b } ()
1291
1292 pushLexState :: Int -> P ()
1293 pushLexState ls = P $ \s@PState{ lex_state=l } -> POk s{lex_state=ls:l} ()
1294
1295 popLexState :: P Int
1296 popLexState = P $ \s@PState{ lex_state=ls:l } -> POk s{ lex_state=l } ls
1297
1298 getLexState :: P Int
1299 getLexState = P $ \s@PState{ lex_state=ls:l } -> POk s ls
1300
1301 -- for reasons of efficiency, flags indicating language extensions (eg,
1302 -- -fglasgow-exts or -fparr) are represented by a bitmap stored in an unboxed
1303 -- integer
1304
1305 glaExtsBit, ffiBit, parrBit :: Int
1306 glaExtsBit = 0
1307 ffiBit     = 1
1308 parrBit    = 2
1309 arrowsBit  = 4
1310 thBit      = 5
1311 ipBit      = 6
1312 tvBit      = 7  -- Scoped type variables enables 'forall' keyword
1313 bangPatBit = 8  -- Tells the parser to understand bang-patterns
1314                 -- (doesn't affect the lexer)
1315 idxTysBit  = 9  -- indexed type families: 'family' keyword and kind sigs
1316
1317 glaExtsEnabled, ffiEnabled, parrEnabled :: Int -> Bool
1318 glaExtsEnabled flags = testBit flags glaExtsBit
1319 ffiEnabled     flags = testBit flags ffiBit
1320 parrEnabled    flags = testBit flags parrBit
1321 arrowsEnabled  flags = testBit flags arrowsBit
1322 thEnabled      flags = testBit flags thBit
1323 ipEnabled      flags = testBit flags ipBit
1324 tvEnabled      flags = testBit flags tvBit
1325 bangPatEnabled flags = testBit flags bangPatBit
1326 idxTysEnabled  flags = testBit flags idxTysBit
1327
1328 -- PState for parsing options pragmas
1329 --
1330 pragState :: StringBuffer -> SrcLoc -> PState
1331 pragState buf loc  =
1332   PState {
1333       buffer     = buf,
1334       last_loc   = mkSrcSpan loc loc,
1335       last_offs  = 0,
1336       last_len   = 0,
1337       loc        = loc,
1338       extsBitmap = 0,
1339       context    = [],
1340       lex_state  = [bol, option_prags, 0]
1341     }
1342
1343
1344 -- create a parse state
1345 --
1346 mkPState :: StringBuffer -> SrcLoc -> DynFlags -> PState
1347 mkPState buf loc flags  =
1348   PState {
1349       buffer     = buf,
1350       last_loc   = mkSrcSpan loc loc,
1351       last_offs  = 0,
1352       last_len   = 0,
1353       loc        = loc,
1354       extsBitmap = fromIntegral bitmap,
1355       context    = [],
1356       lex_state  = [bol, if glaExtsEnabled bitmap then glaexts else 0]
1357         -- we begin in the layout state if toplev_layout is set
1358     }
1359     where
1360       bitmap =     glaExtsBit `setBitIf` dopt Opt_GlasgowExts flags
1361                .|. ffiBit     `setBitIf` dopt Opt_FFI         flags
1362                .|. parrBit    `setBitIf` dopt Opt_PArr        flags
1363                .|. arrowsBit  `setBitIf` dopt Opt_Arrows      flags
1364                .|. thBit      `setBitIf` dopt Opt_TH          flags
1365                .|. ipBit      `setBitIf` dopt Opt_ImplicitParams flags
1366                .|. tvBit      `setBitIf` dopt Opt_ScopedTypeVariables flags
1367                .|. bangPatBit `setBitIf` dopt Opt_BangPatterns flags
1368                .|. idxTysBit  `setBitIf` dopt Opt_IndexedTypes flags
1369       --
1370       setBitIf :: Int -> Bool -> Int
1371       b `setBitIf` cond | cond      = bit b
1372                         | otherwise = 0
1373
1374 getContext :: P [LayoutContext]
1375 getContext = P $ \s@PState{context=ctx} -> POk s ctx
1376
1377 setContext :: [LayoutContext] -> P ()
1378 setContext ctx = P $ \s -> POk s{context=ctx} ()
1379
1380 popContext :: P ()
1381 popContext = P $ \ s@(PState{ buffer = buf, context = ctx,
1382                            loc = loc, last_len = len, last_loc = last_loc }) ->
1383   case ctx of
1384         (_:tl) -> POk s{ context = tl } ()
1385         []     -> PFailed last_loc (srcParseErr buf len)
1386
1387 -- Push a new layout context at the indentation of the last token read.
1388 -- This is only used at the outer level of a module when the 'module'
1389 -- keyword is missing.
1390 pushCurrentContext :: P ()
1391 pushCurrentContext = P $ \ s@PState{ last_offs=offs, last_len=len, context=ctx } ->
1392   POk s{context = Layout (offs-len) : ctx} ()
1393
1394 getOffside :: P Ordering
1395 getOffside = P $ \s@PState{last_offs=offs, context=stk} ->
1396                 let ord = case stk of
1397                         (Layout n:_) -> compare offs n
1398                         _            -> GT
1399                 in POk s ord
1400
1401 -- ---------------------------------------------------------------------------
1402 -- Construct a parse error
1403
1404 srcParseErr
1405   :: StringBuffer       -- current buffer (placed just after the last token)
1406   -> Int                -- length of the previous token
1407   -> Message
1408 srcParseErr buf len
1409   = hcat [ if null token
1410              then ptext SLIT("parse error (possibly incorrect indentation)")
1411              else hcat [ptext SLIT("parse error on input "),
1412                         char '`', text token, char '\'']
1413     ]
1414   where token = lexemeToString (offsetBytes (-len) buf) len
1415
1416 -- Report a parse failure, giving the span of the previous token as
1417 -- the location of the error.  This is the entry point for errors
1418 -- detected during parsing.
1419 srcParseFail :: P a
1420 srcParseFail = P $ \PState{ buffer = buf, last_len = len,
1421                             last_loc = last_loc } ->
1422     PFailed last_loc (srcParseErr buf len)
1423
1424 -- A lexical error is reported at a particular position in the source file,
1425 -- not over a token range.
1426 lexError :: String -> P a
1427 lexError str = do
1428   loc <- getSrcLoc
1429   i@(AI end _ buf) <- getInput
1430   reportLexError loc end buf str
1431
1432 -- -----------------------------------------------------------------------------
1433 -- This is the top-level function: called from the parser each time a
1434 -- new token is to be read from the input.
1435
1436 lexer :: (Located Token -> P a) -> P a
1437 lexer cont = do
1438   tok@(L _ tok__) <- lexToken
1439   --trace ("token: " ++ show tok__) $ do
1440   cont tok
1441
1442 lexToken :: P (Located Token)
1443 lexToken = do
1444   inp@(AI loc1 _ buf) <- getInput
1445   sc <- getLexState
1446   exts <- getExts
1447   case alexScanUser exts inp sc of
1448     AlexEOF -> do let span = mkSrcSpan loc1 loc1
1449                   setLastToken span 0
1450                   return (L span ITeof)
1451     AlexError (AI loc2 _ buf) -> do
1452         reportLexError loc1 loc2 buf "lexical error"
1453     AlexSkip inp2 _ -> do
1454         setInput inp2
1455         lexToken
1456     AlexToken inp2@(AI end _ buf2) len t -> do
1457         setInput inp2
1458         let span = mkSrcSpan loc1 end
1459         let bytes = byteDiff buf buf2
1460         span `seq` setLastToken span bytes
1461         t span buf bytes
1462
1463 reportLexError loc1 loc2 buf str
1464   | atEnd buf = failLocMsgP loc1 loc2 (str ++ " at end of input")
1465   | otherwise =
1466   let
1467         c = fst (nextChar buf)
1468   in
1469   if c == '\0' -- decoding errors are mapped to '\0', see utf8DecodeChar#
1470     then failLocMsgP loc2 loc2 (str ++ " (UTF-8 decoding error)")
1471     else failLocMsgP loc1 loc2 (str ++ " at character " ++ show c)
1472 }