more changes

[nestedvm.git] / doc / nestedvm.ivme04.tex
diff --git a/doc/nestedvm.ivme04.tex b/doc/nestedvm.ivme04.tex

index 5ae5cfe..c540268 100644 (file)
--- a/doc/nestedvm.ivme04.tex
+++ b/doc/nestedvm.ivme04.tex
@@ -33,7 +33,7 @@ NestedVM: Total Translation of Native Code into Safe Bytecode
  
  \begin{abstract}
  
-We present a new approach to utilizing unsafe legacy unsafe code
+We present a new approach to utilizing unsafe legacy code
  within safe virtual machines by compiling to MIPS machine code as an
  intermediate language.  This approach carries N key benefits over
  existing techniques:
@@ -189,7 +189,7 @@ running C/C++ code within a Java VM is shown in Figure~\ref{lattice}.
  
  A number of commercial products and research projects attempt to
  translate C++ code to Java code, preserving the mapping of C++ classes
-to Java classes.  Unfortunately this is problematic since there is no
+to Java classes.  Unfortunately, this is problematic since there is no
  way to do pointer arithmetic except within arrays, and even in that
  case, arithmetic cannot be done in terms of fractional objects.
  
@@ -240,7 +240,7 @@ Translating a legacy library for use within a JVM proceeds as follows:
  \item (Optional) compile the resulting bytecode into a {\it safe}
        native binary using {\tt gcj}.
  
-\item From java code, invoke the {\tt execute()} on the generated
+\item From Java code, invoke the {\tt run()} method on the generated
        class.  This is equivalent to the {\tt main()} entry point.
  
  \end{enumerate}
@@ -262,9 +262,13 @@ Machine:
  
  \begin{itemize}
  
-\item The original MIPS ISA supports only 32-bit aligned memory loads
-      and stores.  This allows NestedVM to represent memory as a Java
-      {\tt int[]} without introducing additional overhead.
+%\item The original MIPS ISA supports only 32-bit aligned memory loads
+%      and stores.  This allows NestedVM to represent memory as a Java
+%      {\tt int[]} without introducing additional overhead.
+\item Most of the instructions in the original MIPS ISA operate only on
+      32-bit aligned memory locations. This allows NestedVM to represent
+      memory as a Java {\tt int[]} array without introducing additional 
+      overhead.
  
  \item Unlike its predecessor, the R2000 supports 32-bit by 32-bit
        multiply and divide instructions as well as a single and double
@@ -287,10 +291,6 @@ optimizations, since most Java compilers already do this.  A recurring
  example is the treatment of the {\tt r0} register, which is fixed as
  {\tt 0} in the MIPS ISA.
  
-Now that the binary-to-binary compiler is available, the
-binary-to-source compiler is only useful for generating input to {\tt
-gcj}, as discussed in section FOOBAR.
-
  Lacking the ability to generate specially optimized bytecode
  sequences, a straightforward mapping of the general purpose hardware
  registers to 32 {\tt int} fields was optimal.
@@ -308,7 +308,7 @@ public void run() {
      for(;;) {
          switch(pc) {
              case 0x10000:
-                r29 = r29 ? 32;
+                r29 = r29 - 32;
              case 0x10004:
                  r1 = r4 + r5;
              case 0x10008:
@@ -383,15 +383,15 @@ public void trampoline() {
  \caption{\label{code1} Trampoline transformation necessitated by Java's 64kb method size limit}
  \end{figure*}
  
-Unfortunately Java imposes a 64kb limit on the size of the bytecode
+Unfortunately, Java imposes a 64kb limit on the size of the bytecode
  for a single method.  This presents a problem for NestedVM, and
  necessitates a {\it trampoline transformation}, as shown in
  Figure~\ref{code1}.  With this trampoline in place somewhat large
-binaries can be handled without much difficulty -- fortunately there
+binaries can be handled without much difficulty -- fortunately, there
  is no corresponding limit on the size of a classfile as a whole.
  
  Another interesting problem that was discovered while creating the
-trampoline method was javac and Jikes? inability to properly optimize
+trampoline method was javac and Jikes' inability to properly optimize
  switch statements.  The code in Figure~\ref{lookupswitch} is compiled
  into a comparatively inefficient {\tt LOOKUPSWITCH}, while the code in
  Figure~\ref{tableswitch} is optimized into a {\tt TABLESWITCH}.
@@ -409,33 +409,37 @@ switch(pc&0xffffff00) {
  
  \begin{figure}
  {\footnotesize\begin{verbatim}
-Brian, we're missing the code here... can you put it in?
+switch(pc>>>8) {
+    case 0x1: run_100(); break;
+    case 0x2: run_200(); break;
+    case 0x3: run_300(); break;
+}
  \end{verbatim}}
  \caption{\label{tableswitch} Code which {\it is} optimized into a tableswitch}
  \end{figure}
  
-Javac isn?t smart enough to see the patter in the case values and
+Javac is not smart enough to see the pattern in the case values and
  generates very suboptimal bytecode. Manually doing the shifts
  convinces javac to emit a tableswitch statement, which is
-significantly faster. This change alone nearly doubled the speed of
-the compiled binary.
+significantly faster. This change alone increased the speed of
+the compiled binary by approximately 35\%.
  
  Finding the optimal method size led to the next big performance
-increase.  It was determined with experimentation that the optimal
-number of MIPS instructions per method is 128 (considering only power
-of two options). Going above or below that lead to performance
+increase.  It was determined through experimentation that the optimal
+number of MIPS instructions per method is 64 or 128 (considering only 
+powers of two). Going above or below that led to performance
  decreases. This is most likely due to a combination of two factors.
  
  \begin{itemize}
  
  \item The two levels of switch statements jumps have to pass through -
        The first switch statement jumps go through is the trampoline
-      switch statement. This is implemented as a TABLESWITCH in JVM
+      switch statement. This is implemented as a {\tt TABLESWITCH} in JVM
        bytecode so it is very fast. The second level switch statement
        in the individual run\_ methods is implemented as a
-      LOOKUPSWITCH, which is much slower. Using smaller methods puts
-      more burden on the faster TABLESWITCH and less on the slower
-      LOOKUPSWITCH.
+      {\tt LOOKUPSWITCH}, which is much slower. Using smaller methods puts
+      more burden on the faster {\tt TABLESWITCH} and less on the slower
+      {\tt LOOKUPSWITCH}.
  
  \item JIT compilers probably favor smaller methods - smaller methods are
        easier to compile and are probably better candidates for JIT
@@ -445,6 +449,13 @@ decreases. This is most likely due to a combination of two factors.
  
  Put a chart in here
  
+Putting more than 256 instructions in each method led to a severe
+performance penalty. Apparently Hotspot does not handle very large methods
+well. In some tests, simply moving from 256 to 512 instructions per
+method decreased performance by a factor of 10.
+
+Put chart here
+
  The next big optimization was eliminating unnecessary case
  statements. Having case statements before each instruction prevents
  JIT compilers from being able to optimize across instruction
@@ -454,27 +465,27 @@ identified. The sources for possible jump targets come from 3 places.
  
  \begin{itemize}
  
-\item The .text segment ? Every instruction in the text segment in
+\item The .text segment - Every instruction in the text segment is
        scanned for jump targets. Every branch instruction (BEQ, JAL,
        etc) has its destination added to the list of possible branch
        targets. In addition, functions that set the link register have
-      theirpc+8 added to the list (the address that would?ve been put
+      their pc+8 added to the list (the address that would have been put
        to the link register). Finally, combinations of LUI (Load Upper
        Immediate) and ADDIU (Add Immediate Unsigned) are scanned for
        possible addresses in the text segment. This combination of
        instructions is often used to load a 32-bit word into a
        register.
  
-\item The .data segment ? When GCC generates switch() statements it
+\item The .data segment - When GCC generates switch() statements it
        often uses a jump table stored in the .data
-      segment. Unfortunately gcc doesn?t identify these jump tables in
+      segment. Unfortunately gcc does not identify these jump tables in
        any way. Therefore, the entire .data segment is conservatively
        scanned for possible addresses in the .text segment.
        
-\item The symbol table ? This is mainly used as a backup. Scanning the
+\item The symbol table - This is mainly used as a backup. Scanning the
        .text and .data segments should identify any possible jump
        targets but adding every function in the symbol table in the ELF
-      binary doesn?t hurt. This will also catch functions that are
+      binary does not hurt. This will also catch functions that are
        never called directly from the MIPS binary (for example,
        functions called with the call() method in the runtime).
  
@@ -483,12 +494,12 @@ identified. The sources for possible jump targets come from 3 places.
  Eliminating unnecessary case statements provided a 10-25\% speed
  increase.
  
-Despite all the above optimizations and workaround an impossible to
+Despite all the above optimizations and workarounds, a hard classfile
  limit that cannot be worked around was eventually hit: the constant
  pool. The constant pool in classfiles is limited to 65536
-entries. Every Integer with a magnitude greater than 32767 requires an
+entries. Every integer with a magnitude greater than 32767 requires an
  entry in the constant pool. Every time the compiler emits a
-jump/branch instruction the PC field is set to the branch target. This
+jump or branch instruction the PC field is set to the branch target. This
  means nearly every branch instruction requires an entry in the
  constant pool. Large binaries hit this limit fairly quickly. One
  workaround that was employed in the Java source compiler was to
@@ -510,9 +521,9 @@ advantages:
  \begin{itemize}
        
  \item There are little tricks that can be done in JVM bytecode that
-      can?t be done in Java source code.
+      cannot be done in Java source code.
  
-\item Eliminates the time-consuming javac step ? Javac takes a long
+\item Eliminates the time-consuming javac step - Javac takes a long
        time to parse and compile the output from the java source
        compiler.
  
@@ -528,12 +539,12 @@ improvements where made where in the handling of branch instructions
  and in taking advantage of the JVM stack to eliminate unnecessary
  LOADs and STOREs to local variables.
  
-The first obvious optimization that generating bytecode allows for is
-the use of GOTO. Despite the fact that java doesn?t have a GOTO
-keyword a GOTO bytecode does exist and is used heavily in the code
-generates by javac. Unfortunately the java language doesn?t provide
-any way to take advantage of this. As result of this jumps within a
-method were implemented by setting the PC field to the new address and
+The first obvious optimization that generating bytecode allows for is the
+use of GOTO. Despite the fact that Java does not have a GOTO keyword, a GOTO
+bytecode does exist and is used heavily in the code generated by javac.
+Unfortunately, the Java language does not provide any way to take advantage of
+this. As a result of this, jumps within a method were implemented in the
+binary-to-source compiler by setting the PC field to the new address and
  making a trip back to the initial switch statement.  In the classfile
  compiler these jumps are implemented as GOTOs directly to the target
  instruction. This saves a costly trip back through the LOOKUPSWITCH
@@ -551,22 +562,22 @@ if(condition) { pc = TARGET; continue; }
  This requires a branch in the JVM regardless of whether the MIPS
  branch is actually taken. If condition is false the JVM has to jump
  over the code to set the PC and go back to the switch block. If
-condition is true the JVM as to jump to the switch block. By
+condition is true the JVM has to jump to the switch block. By
  generating bytecode directly we can make the target of the JVM branch
  statement the actual bytecode of the final destination. In the case
-where the branch isn?t taken the JVM doesn?t need to branch at all.
+where the branch is not taken the JVM does not need to branch at all.
  
  A side effect of the above two optimizations is a solution to the
  excess constant pool entries problem. When jumps are implemented as
-GOTOs and direct branches to the target the PC field doesn?t need to
+GOTOs and direct branches to the target the PC field does not need to
  be set. This eliminates many of the constant pool entries the java
  source compiler requires. The limit is still there however, and given
  a large enough binary it will still be reached.
  
  Delay slots are another area where things are done somewhat
  inefficiently in the Java source compiler. In order to take advantage
-of instructions already in the pipeline MIPS cpu have a ?delay
-slot?. That is, an instruction after a branch or jump instruction that
+of instructions already in the pipeline, MIPS CPUs have a ``delay
+slot''. That is, an instruction after a branch or jump instruction that
  is executed regardless of whether the branch is taken. This is done
  because by the time the branch or jump instruction is finished being
  processed, the next instruction is already ready to be executed and it
@@ -589,18 +600,18 @@ This piece of code is executed as follows
  
  \begin{enumerate}
  
-\item r2 is set to ?1
+\item r2 is set to -1
  
  \item r2 is loaded from the register file by the BLTEZ instruction
        
  \item 10 is added to r2 by the ADDIU instruction
  
  \item The branch is taken because at the time the BLTZ instruction was
-      executed r2 was ?1, but r2 is now 9 (-1 + 10)
+      executed r2 was -1, but r2 is now 9 (-1 + 10)
  
  \end{enumerate}
  
-There is a very element solution to this problem when using JVM
+There is a very elegant solution to this problem when using JVM
  bytecode. When a branch instruction is encountered the registers
  needed for the comparison are pushed onto the stack to prepare for the
  JVM branch instruction. Then, AFTER the values are on the stack the
@@ -611,7 +622,7 @@ registers are not visible to the branch bytecode. This allows delay
  slots to be used with no performance penalty or size penalty.
  
  One final advantage that generating bytecode directly allows is
-smaller more compact bytecode. All the optimization above lead to
+smaller more compact bytecode. All the optimizations above lead to
  smaller bytecode as a side effect. There are also a few other areas
  where the generated bytecode can be optimized for size with more
  knowledge of the program as a whole.
@@ -633,39 +644,152 @@ prepare for a invoke special call. By simple moving this outside the switch
  statement each case arm was reduced in size by one instruction. Similar
  optimizations were also done in other parts of the compiler.
  
-
  \section{Interfacing with Java Code}
  
-Java source code can create a copy of the translated binary by
-instantiating the corresponding class, which extends {\tt Runtime}.
-Invoking the {\tt main()} method on this class is equivalent to
-calling the {\tt main()} function within the binary; the {\tt String}
-arguments to this function are copied into the binary's memory space
-and made available as {\tt argv**} and {\tt argc}.
+NestedVM has two primary ways of executing code, the interpreter, and the
+binary translators. Both the interpreter and the output from the binary
+translators sit on top of a Runtime class. This class provides the public
+interface to both the interpreter and the translated binaries.
  
-The translated binary communicates with the rest of the VM by
-executing MIPS {\tt SYSCALL} instructions, which are translated into
-invocations of the {\tt syscall()} method.  This calls back to the
-native Java world, which can manipulate the binary's environment by
-reading and writing to its memory space, checking its exit status,
-pausing the VM, and restarting the VM.
+\subsection{The Runtime Class}
  
+The Runtime class does the work that the operating system usually does.
+Conceptually the Runtime class can be thought of as the operating system and
+its subclasses (translated binaries and the interpreter) the CPU. The
+Runtime fulfills 5 primary goals:
  
-\subsection{Virtualization}
+\begin{itemize}
  
-The {\tt Runtime} class implements the majority of the standard {\tt
-libc} syscalls, providing a complete interface to the filesystem,
-network socket library, time of day, (Brian: what else goes here?).
+\item Provide a consistent external interface - The method of actually
+executing the code (currently only translated binaries and the interpreter)
+can be changed without any code changes to the caller because only Runtime
+exposes a public interface.
  
-\begin{itemize}
+\item Provide an easy to use interface - The interpreter and the output from
+the binary translators only know how to execute code. The Runtime class
+provides an easy to use interface to the code. It contains methods to pass
+arguments to the main() function, read and write from memory, and call
+individual functions in the binary.
  
-\item ability to provide the same interface to CNI code and
-      NestedVMified code
-      
-\item security advantages (chroot the {\tt fork()}ed process)
+\item Manage the process's memory - The Runtime class contains large int[]
+arrays that represent the process's entire memory space.  Subclasses read
+and write to these arrays as required by the instructions they are
+executing.  Subclasses can expand their memory space using the sbrk
+syscall.
+
+\item Provide access to the file system and streams - Subclasses access the
+file system through standard UNIX syscalls (read, write, open, etc). The
+Runtime manages the file descriptor table that maps UNIX file descriptors
+to Java RandomAccessFiles, InputStreams, OutputStreams, and sockets.
+
+\item Miscellaneous other syscalls - In addition to those mentioned above
+the Runtime class implements a variety of other syscalls (sleep,
+gettimeofday, getpagesize, sysconf, fcntl, etc).
  
  \end{itemize}
  
+\subsection{Interacting with the Binary}
+
+Java source code can create a copy of the translated binary by instantiating
+the class generated by the binary translator or instantiating the
+interpreter. It can then interact with the process through the many
+facilities provided by the Runtime interface.  Invoking the run() method of
+the Runtime interface will load the given arguments into the process's
+memory and invoke the binary's entry point (typically \_start() in crt0.o).
+This will pass control on to the main() function which will have the
+arguments passed to run() loaded into argv and argc.
+
+As the binary executes it often passes control back to the Runtime class
+through the MIPS {\tt SYSCALL} instruction. The interpreter and translated
+binaries invoke the {\tt syscall()} method of the Runtime class when the
+{\tt SYSCALL} instruction is executed. The Runtime class then can manipulate
+the process's environment (read and write to memory, modify the file
+descriptor table, etc) and interact with the rest of the JVM on behalf of
+the process (read and write to a file or stream, etc). There is even a
+syscall to pause the VM and temporarily return control to the caller.
+
+In addition to the interfaces provided by NestedVM, users can create their
+own interfaces between the MIPS and Java world. The Runtime provides a
+method called call() that will call a function by name in the MIPS binary.
+The call() method looks up the function name in the binary's ELF symbol
+table and manipulates the stack and registers accordingly to execute the
+given function. This allows Java code to seamlessly invoke functions in the
+binary.
+
+{\footnotesize\begin{verbatim}
+// Java
+private Runtime rt = new MyBinary();
+public void foo(int n) {
+    for(int i=0;i<10;i++) {
+        int result = rt.call("do_work",i);
+        System.err.println("do_work(i) = " + result);
+    }
+}
+// C
+int do_work(int n) {
+    int i;
+    int ret=0;
+    for(i=0;i<n;i++) ret += i;
+    return ret;
+}
+\end{verbatim}}
+
+The MIPS binaries can also invoke a special method of Runtime called
+callJava(). When the MIPS binary invokes the {\tt CALL\_JAVA} syscall
+(usually done through the {\tt \_call\_java()} function provided by the
+NestedVM support library) the callJava() method in Runtime is invoked with
+the arguments passed to the syscall.
+
+{\footnotesize\begin{verbatim}
+// Java
+private Runtime rt = new MyBinary() {
+    public int callJava(int a, int b, int c, int d) { System.err.println("Got " + a + " " + b); return 0; }
+};
+public void foo() { rt.run(); }
+// C
+void main(int argc, char **argv) {
+    _call_java(1,2);
+}
+\end{verbatim}}
+
+These two methods can even be combined. MIPS can call Java through the
+CALL\_JAVA syscall, which can in turn invoke a MIPS function in the binary
+with the call() method.
+
+Users preferring a simpler communication mechanism can also use Java
+streams and file descriptors. Runtime provides a simple interface for
+mapping a Java Input or OutputStream to a File Descriptor.
+
+%Java source code can create a copy of the translated binary by
+%instantiating the corresponding class, which extends {\tt Runtime}.
+%Invoking the {\tt main()} method on this class is equivalent to
+%calling the {\tt main()} function within the binary; the {\tt String}
+%arguments to this function are copied into the binary's memory space
+%and made available as {\tt **argv} and {\tt argc}.
+
+%The translated binary communicates with the rest of the VM by
+%executing MIPS {\tt SYSCALL} instructions, which are translated into
+%invocations of the {\tt syscall()} method.  This calls back to the
+%native Java world, which can manipulate the binary's environment by
+%reading and writing to its memory space, checking its exit status,
+%pausing the VM, and restarting the VM.
+
+
+%\subsection{Virtualization}
+
+%The {\tt Runtime} class implements the majority of the standard {\tt
+%libc} syscalls, providing a complete interface to the filesystem,
+%network socket library, time of day, (Brian: what else goes here?).
+
+%\begin{itemize}
+
+%\item ability to provide the same interface to CNI code and
+%      NestedVMified code
+      
+%\item security advantages (chroot the {\tt fork()}ed process)
+%
+%\end{itemize}
+
  
  \section{Quantitative Performance}
  
@@ -682,45 +806,119 @@ network socket library, time of day, (Brian: what else goes here?).
  
  \subsection{Optimizations}
  
-Brian, can you write something to go here?  Just mention which
-optimizations helped and which ones hurt.
+Although NestedVM perfectly emulates a MIPS R2000 CPU, its performance
+characteristics are not anything like an actual MIPS R2000 CPU. GCC makes
+several optimizations that increase performance on an actual MIPS CPU but
+actually decrease performance when run through the NestedVM binary
+translator. Fortunately, GCC provides many options to customize its code
+generation and eliminate these optimizations. GCC also has optimization
+options that are not helpful on a real MIPS CPU but are very helpful under
+NestedVM.
+
+Adam, we should cite "Using the GNU Compiler Collection" somewhere in here.
  
  \begin{itemize}
-\item {\tt trampoline}
-\item {\tt optimal method size}
-\item {\tt -msingle-float}
-\item {\tt -mmemcpy}
-\item {\tt fastmem}
-\item {\tt local vars for registers (useless)}
+
+\item {\tt -falign-functions}
+Normally a function's location in memory has no effect on its execution
+speed. However, in the NestedVM binary translator, the .text segment is
+split up on power of two boundaries. If a function is unlucky enough to
+start near the end of one of these boundaries a performance critical part of
+the function could end up spanning two methods. There is a significant
+amount of overhead in switching between two methods so this must be avoided
+at all costs. By telling GCC to align all functions to the boundary that the
+.text segment is split on the chances of a critical part of a function
+spanning two methods is significantly reduced.
+
  \item {\tt -fno-rename-registers}
-\item {\tt -ffast-math}
-\item {\tt -fno-trapping-math}
-\item {\tt -fsingle-precision-constant}
-\item {\tt -mfused-madd}
-\item {\tt -freg-struct-return}
-\item {\tt -freduce-all-givs}
-\item {\tt -fno-peephole}
-\item {\tt -fno-peephole2}
-\item {\tt -fmove-all-movables}
-\item {\tt -fno-sched-spec-load}
-\item {\tt -fno-sched-spec}
-\item {\tt -fno-schedule-insns}
-\item {\tt -fno-schedule-insns2}
+Some processors can better schedule code when registers are not reused for
+two different purposes. By default GCC will try to use as many registers as
+possible when it can. This excess use of registers just confuses JITs
+trying to compile the output from the binary translator. All the JIT
+compilers we tested do much better with a few frequently used registers.
+
  \item {\tt -fno-delayed-branch}
-\item {\tt -fno-function-cse}
-\item {\tt -ffunction-sections}
-\item {\tt -fdata-sections}
-\item {\tt array bounds checking}
-\item {\tt -falign-functions=n}
-\item {\tt -falign-labels=n}
-\item {\tt -falign-loops=n}
-\item {\tt -falign-jumps=n}
-\item {\tt -fno-function-cse}
+The MIPS CPU has a delay slot (see above). Earlier versions of NestedVM did
+not efficiently emulate delay slots. This option causes GCC to avoid using
+delay slots for anything (a NOP is simply placed in the delay slot). This
+had a small performance benefit. However, recent versions of NestedVM
+emulate delay slots with no performance overhead so this option has little
+effect. Nonetheless, these delay slots provide no benefit under NestedVM
+either so they are avoided with this option.
+
+\item {\tt -fno-schedule-insns}
+Load operations in the MIPS ISA also have a delay slot. The results of a
+load operation are not available for use until one instruction later.
+Several other instructions also have similar delay slots. GCC tries to do
+useful work while waiting for the results of one of these operations by
+default. However, this, like register renaming, tends to confuse JIT
+compilers. This option prevents GCC from going out of its way to take
+advantage of these delay slots and makes the code generated by NestedVM
+easier for JIT compilers to handle.
+
+\item {\tt -mmemcpy}
+GCC sometimes has to copy somewhat large areas of memory. The most common
+example of this is assigning one struct to another. Memory copying can be
+done far more efficiently in Java than under NestedVM. Calls to the memcpy
+libc function are treated specially by the binary translator. They are
+turned into calls to a memcpy method in Runtime. The {\tt -mmemcpy} option
+causes GCC to invoke libc's memcpy() function when it needs to copy a region
+of memory rather than generating its own memcpy code. This call is then
+turned into a call to this Java memcpy function which is significantly
+faster than the MIPS implementation.
+
+\item {\tt -ffunction-sections -fdata-sections}
+These two options are used in conjunction with the {\tt --gc-sections} linker
+option. These three options cause the linker to aggressively discard unused
+functions and data sections. In some cases this leads to significantly
+smaller binaries.
+
+%\item {\tt trampoline}
+%\item {\tt optimal method size}
+%\item {\tt -msingle-float}
+%\item {\tt -mmemcpy}
+%\item {\tt fastmem}
+%\item {\tt local vars for registers (useless)}
+%\item {\tt -fno-rename-registers}
+%\item {\tt -ffast-math}
+%\item {\tt -fno-trapping-math}
+%\item {\tt -fsingle-precision-constant}
+%\item {\tt -mfused-madd}
+%\item {\tt -freg-struct-return}
+%\item {\tt -freduce-all-givs}
+%\item {\tt -fno-peephole}
+%\item {\tt -fno-peephole2}
+%\item {\tt -fmove-all-movables}
+%\item {\tt -fno-sched-spec-load}
+%\item {\tt -fno-sched-spec}
+%\item {\tt -fno-schedule-insns}
+%\item {\tt -fno-schedule-insns2}
+%\item {\tt -fno-delayed-branch}
+%\item {\tt -fno-function-cse}
+%\item {\tt -ffunction-sections}
+%\item {\tt -fdata-sections}
+%\item {\tt array bounds checking}
+%\item {\tt -falign-functions=n}
+%\item {\tt -falign-labels=n}
+%\item {\tt -falign-loops=n}
+%\item {\tt -falign-jumps=n}
+%\item {\tt -fno-function-cse}
  \end{itemize}
  
  \section{Future Directions}
  
-World domination.
+\begin{itemize}
+
+\item Better use of local variables in binary-to-binary compiler -- need to
+do data flow analysis to find out how and when registers are used and avoid
+the costly load/restore when it isn't necessary.
+
+\item More advanced Runtime support -- support more syscalls. This will
+allow running large applications such as GCC under NestedVM.
+
+\item World domination
+
+\end{itemize}
  
  \section{Conclusion}