utils/unlit/unlit.c

   1 /* unlit.c                                   Wed Dec  5 17:16:24 GMT 1990
   2  *
   3  * Literate script filter.  In contrast with the format used by most
   4  * programming languages, a literate script is a program in which
   5  * comments are given the leading role, whilst program text must be
   6  * explicitly flagged as such by placing a `>' character in the first
   7  * column on each line.  It is hoped that this style of programming will
   8  * encourage the writing of accurate and clearly documented programs
   9  * in which the writer may include motivating arguments, examples
  10  * and explanations.
  11  *
  12  * Unlit is a filter that can be used to strip all of the comment lines
  13  * out of a literate script file.  The command format for unlit is:
  14  *              unlit [-n] [-q] ifile ofile
  15  * where ifile and ofile are the names of the input (literate script) and
  16  * output (raw program) files respectively.  Either of these names may
  17  * be `-' representing the standard input or the standard output resp.
  18  * A number of rules are used in an attempt to guard against the most
  19  * common errors that are made when writing literate scripts:
  20  * 1) Empty script files are not permitted.  A file in which no lines
  21  *    begin with `>' usually indicates a file in which the programmer
  22  *    has forgotten about the literate script convention.
  23  * 2) A line containing part of program definition (i.e. preceeded by `>')
  24  *    cannot be used immediately before or after a comment line unless
  25  *    the comment line is blank.  This error usually indicates that
  26  *    the `>' character has been omitted from a line in a section of
  27  *    program spread over a number of lines.
  28  * Using the -q (quiet) flag suppresses the signalling of these error
  29  * conditions.  The default behaviour can be selected explicitly using
  30  * the -n (noisy) option so that any potential errors in the script file
  31  * are reported.
  32  *
  33  * The original idea for the use of literate scripts is due to Richard
  34  * Bird of the programming Research Group, Oxford and was initially
  35  * adopted for use in the implementation of the functional programming
  36  * language Orwell used for teaching in Oxford.  This idea has subsequently
  37  * been borrowed in a number of other language implementations.
  38  *
  39  * Modified to understand \begin{code} ... \end{code} used in Glasgow.  -- LA
  40  * And \begin{pseudocode} ... \end{pseudocode}.  -- LA
  41  */
  42
  43 #include <string.h>
  44 #include <stdio.h>
  45 #include <stdlib.h>
  46 #include <ctype.h>
  47
  48 #define NULLSTR        ((char *)0)
  49 #define DEFNCHAR       '>'
  50 #define MISSINGBLANK   "unlit: Program line next to comment"
  51 #define EMPTYSCRIPT    "unlit: No definitions in file (perhaps you forgot the '>'s?)"
  52 #define USAGE          "usage: unlit [-q] [-n] [-c] [-#] [-P] [-h label] file1 file2\n"
  53 #define CANNOTOPEN     "unlit: cannot open \"%s\"\n"
  54 #define CANNOTWRITE    "unlit: error writing \"%s\"\n"
  55 #define CANNOTWRITESTDOUT "unlit: error writing standard output\n"
  56 #define DISTINCTNAMES  "unlit: input and output filenames must differ\n"
  57 #define MISSINGENDCODE "unlit: missing \\end{code}\n"
  58
  59 #define BEGINCODE "\\begin{code}"
  60 #define LENBEGINCODE 12
  61 #define ENDCODE "\\end{code}"
  62 #define LENENDCODE 10
  63 #ifdef PSEUDOCODE
  64 /* According to Will Partain, the inventor of pseudocode, this gone now. */
  65 #define MISSINGENDPSEUDOCODE "unlit: missing \\end{pseudocode}\n"
  66 #define BEGINPSEUDOCODE "\\begin{pseudocode}"
  67 #define LENBEGINPSEUDOCODE 18
  68 #define ENDPSEUDOCODE "\\end{pseudocode}"
  69 #define LENENDPSEUDOCODE 16
  70 #endif
  71
  72 typedef enum { START, BLANK, TEXT, DEFN, BEGIN, /*PSEUDO,*/ END, HASH, SHEBANG } line;
  73 #define isWhitespace(c)  (c==' '  || c=='\t' || c=='\r')
  74 #define isLineTerm(c)    (c=='\n' || c==EOF)
  75
  76 static int noisy  = 1;   /* 0 => keep quiet about errors, 1 => report errors */
  77 static int errors = 0;   /* count the number of errors reported              */
  78 static int crunchnl = 0; /* don't print \n for removed lines                 */
  79 static int leavecpp = 1; /* leave preprocessor lines */
  80 static int ignore_shebang = 1; /* Leave out shebang (#!) lines */
  81 static int no_line_pragma = 0; /* Leave out initial line pragma */
  82
  83 static char* prefix_str = NULL; /* Prefix output with a string */
  84
  85 static char *ofilename = NULL;
  86
  87 /* complain(file,line,what)
  88  *
  89  * print error message `what' for `file' at `line'.  The error is suppressed
  90  * if noisy is not set.
  91  */
  92
  93 complain(file, lin, what)
  94 char *file;
  95 char *what;
  96 int lin; {
  97     if (noisy) {
  98         if (file)
  99             fprintf(stderr, "%s ", file);
 100         fprintf(stderr,"line %d: %s\n",lin,what);
 101         errors++;
 102     }
 103 }
 104
 105 writeerror()
 106 {
 107     if (!strcmp(ofilename,"-")) {
 108         fprintf(stderr, CANNOTWRITESTDOUT);
 109     } else {
 110         fprintf(stderr, CANNOTWRITE, ofilename);
 111     }
 112     exit(1);
 113 }
 114
 115 myputc(c, ostream)
 116 char c;
 117 FILE *ostream; {
 118     if (putc(c,ostream) == EOF) {
 119         writeerror();
 120     }
 121 }
 122
 123 #define TABPOS 8
 124
 125 /* As getc, but does TAB expansion */
 126 int
 127 egetc(istream)
 128 FILE *istream;
 129 {
 130     static int spleft = 0;
 131     static int linepos = 0;
 132     int c;
 133
 134     if (spleft > 0) {
 135         spleft--;
 136         linepos++;
 137         return ' ';
 138     }
 139     c = getc(istream);
 140     if (c == EOF)
 141         return c;
 142     else if (c == '\n' || c == '\f') {
 143         linepos = 0;
 144         return c;
 145     } else if (c == '\t') {
 146         spleft = TABPOS - linepos % TABPOS;
 147         spleft--;
 148         linepos++;
 149         return ' ';
 150     } else {
 151         linepos++;
 152         return c;
 153     }
 154
 155 }
 156
 157 /* readline(istream, ostream)
 158  *
 159  * Read a line from the input stream `istream', and return a value
 160  * indicating whether that line was:
 161  *     BLANK (whitespace only),
 162  *     DEFN  (first character is DEFNCHAR),
 163  *     TEXT  (a line of text)
 164  *     BEGIN (a \begin{code} line)
 165  *     PSEUDO (a \begin{pseodocode} line)
 166  *     HASH  (a preprocessor line)
 167  * or  END   (indicating an EOF).
 168  * Lines of type DEFN are copied to the output stream `ostream'
 169  * (without the leading DEFNCHAR).  BLANK and TEXT lines are
 170  * replaced by empty (i.e. blank lines) in the output stream, so
 171  * that error messages refering to line numbers in the output file
 172  * can also be used to locate the corresponding line in the input
 173  * stream.
 174  */
 175
 176 line readline(istream,ostream)
 177 FILE *istream, *ostream; {
 178     int c, c1;
 179     char buf[100];
 180     int i;
 181
 182     c = egetc(istream);
 183
 184     if (c==EOF)
 185         return END;
 186
 187     if ( c == '#' ) {
 188       if ( ignore_shebang ) {
 189          c1 = egetc(istream);
 190          if ( c1 == '!' ) {
 191            while (c=egetc(istream), !isLineTerm(c)) ;
 192            return SHEBANG;
 193          }
 194          myputc(c, ostream);
 195          c=c1;
 196       }
 197       if ( leavecpp ) {
 198         myputc(c, ostream);
 199         while (c=egetc(istream), !isLineTerm(c))
 200             myputc(c,ostream);
 201         myputc('\n',ostream);
 202         return HASH;
 203       }
 204     }
 205
 206     if (c==DEFNCHAR) {
 207         myputc(' ',ostream);
 208         while (c=egetc(istream), !isLineTerm(c))
 209             myputc(c,ostream);
 210         myputc('\n',ostream);
 211         return DEFN;
 212     }
 213
 214     if (!crunchnl)
 215         myputc('\n',ostream);
 216
 217     while (isWhitespace(c))
 218         c=egetc(istream);
 219     if (isLineTerm(c))
 220         return BLANK;
 221
 222     i = 0;
 223     buf[i++] = c;
 224     while (c=egetc(istream), !isLineTerm(c))
 225         if (i < sizeof buf - 1)
 226             buf[i++] = c;
 227     while(i > 0 && isspace(buf[i-1]))
 228         i--;
 229     buf[i] = 0;
 230     if (strcmp(buf, BEGINCODE) == 0)
 231         return BEGIN;
 232 #ifdef PSEUDOCODE
 233     else if (strcmp(buf, BEGINPSEUDOCODE) == 0)
 234         return PSEUDO;
 235 #endif
 236     else
 237         return TEXT;
 238 }
 239
 240
 241 /* unlit(file,istream,ostream)
 242  *
 243  * Copy the file named `file', accessed using the input stream `istream'
 244  * to the output stream `ostream', removing any comments and checking
 245  * for bad use of literate script features:
 246  *  - there should be at least one BLANK line between a DEFN and TEXT
 247  *  - there should be at least one DEFN line in a script.
 248  */
 249
 250 unlit(file, istream, ostream)
 251 char *file;
 252 FILE *istream;
 253 FILE *ostream; {
 254     line last, this=START;
 255     int  linesread=0;
 256     int  defnsread=0;
 257
 258     do {
 259         last = this;
 260         this = readline(istream, ostream);
 261         linesread++;
 262         if (this==DEFN)
 263             defnsread++;
 264         if (last==DEFN && this==TEXT)
 265             complain(file, linesread-1, MISSINGBLANK);
 266         if (last==TEXT && this==DEFN)
 267             complain(file, linesread, MISSINGBLANK);
 268         if (this == BEGIN) {
 269             /* start of code, copy to end */
 270             char lineb[1000];
 271             for(;;) {
 272                 if (fgets(lineb, sizeof lineb, istream) == NULL) {
 273                     complain(file, linesread, MISSINGENDCODE);
 274                     exit(1);
 275                 }
 276                 linesread++;
 277                 if (strncmp(lineb,ENDCODE,LENENDCODE) == 0) {
 278                     myputc('\n', ostream);
 279                     break;
 280                 }
 281                 fputs(lineb, ostream);
 282             }
 283             defnsread++;
 284         }
 285 #ifdef PSEUDOCODE
 286         if (this == PSEUDO) {
 287             char lineb[1000];
 288             for(;;) {
 289                 if (fgets(lineb, sizeof lineb, istream) == NULL) {
 290                     complain(file, linesread, MISSINGENDPSEUDOCODE);
 291                     exit(1);
 292                 }
 293                 linesread++;
 294                 myputc('\n', ostream);
 295                 if (strncmp(lineb,ENDPSEUDOCODE,LENENDPSEUDOCODE) == 0) {
 296                     break;
 297                 }
 298             }
 299         }
 300 #endif
 301     } while(this!=END);
 302
 303     if (defnsread==0)
 304         complain(file,linesread,EMPTYSCRIPT);
 305 }
 306
 307 /* main(argc, argv)
 308  *
 309  * Main program.  Processes command line arguments, looking for leading:
 310  *  -q  quiet mode - do not complain about bad literate script files
 311  *  -n  noisy mode - complain about bad literate script files.
 312  *  -r  remove cpp droppings in output.
 313  *  -P  don't output any CPP line pragmas.
 314  * Expects two additional arguments, a file name for the input and a file
 315  * name for the output file.  These two names must normally be distinct.
 316  * An exception is made for the special name "-" which can be used in either
 317  * position to specify the standard input or the standard output respectively.
 318  */
 319
 320 main(argc,argv)
 321 int argc;
 322 char **argv; {
 323     FILE *istream, *ostream;
 324     char *file;
 325
 326     for (argc--, argv++; argc > 0; argc--, argv++)
 327         if (strcmp(*argv,"-n")==0)
 328             noisy = 1;
 329         else if (strcmp(*argv,"-q")==0)
 330             noisy = 0;
 331         else if (strcmp(*argv,"-c")==0)
 332             crunchnl = 1;
 333         else if (strcmp(*argv,"-P")==0)
 334             no_line_pragma = 1;
 335         else if (strcmp(*argv,"-h")==0) {
 336           if (argc > 1) {
 337             argc--; argv++;
 338             if (prefix_str)
 339               free(prefix_str);
 340             prefix_str = (char*)malloc(sizeof(char)*(1+strlen(*argv)));
 341             if (prefix_str)
 342               strcpy(prefix_str, *argv);
 343           }
 344         } else if (strcmp(*argv,"-#")==0)
 345             ignore_shebang = 0;
 346         else
 347             break;
 348
 349     if (argc!=2) {
 350         fprintf(stderr, USAGE);
 351         exit(1);
 352     }
 353
 354     if (strcmp(argv[0],argv[1])==0 && strcmp(argv[0],"-")!=0) {
 355         fprintf(stderr, DISTINCTNAMES);
 356         exit(1);
 357     }
 358
 359     file = argv[0];
 360     if (strcmp(argv[0], "-")==0) {
 361         istream = stdin;
 362         file    = "stdin";
 363     }
 364     else
 365         if ((istream=fopen(argv[0], "r")) == NULL) {
 366             fprintf(stderr, CANNOTOPEN, argv[0]);
 367             exit(1);
 368         }
 369
 370     ofilename=argv[1];
 371     if (strcmp(argv[1], "-")==0)
 372         ostream = stdout;
 373     else
 374         if ((ostream=fopen(argv[1], "w")) == NULL)  {
 375             fprintf(stderr, CANNOTOPEN, argv[1]);
 376             exit(1);
 377         }
 378
 379     /* Prefix the output with line pragmas */
 380     if (!no_line_pragma && prefix_str) {
 381       /* Both GHC and CPP understand the #line pragma.
 382        * We used to throw in both a #line and a {-# LINE #-} pragma
 383        * here, but CPP doesn't understand {-# LINE #-} so it thought
 384        * the line numbers were off by one.  We could put the {-# LINE
 385        * #-} before the #line, but there's no point since GHC
 386        * understands #line anyhow.  --SDM 8/2003
 387        */
 388       fprintf(ostream, "#line 1 \"%s\"\n", prefix_str);
 389     }
 390
 391     unlit(file, istream, ostream);
 392
 393     if (istream != stdin) fclose(istream);
 394     if (ostream != stdout) {
 395         if (fclose(ostream) == EOF) {
 396             writeerror();
 397         }
 398     }
 399
 400     exit(errors==0 ? 0 : 1);
 401 }