From 35b4af7ef98c04944208a52dd230dd1c8803eed7 Mon Sep 17 00:00:00 2001 From: adam Date: Mon, 15 Mar 2004 17:55:33 -0800 Subject: [PATCH] integrated Brians comments on ivme paper darcs-hash:20040316015533-5007d-68b9a12b0b3fb809a67dd7ee18013bb5eed345e7.gz --- doc/IVME.xls | Bin 0 -> 38912 bytes doc/mips2java.tex | 274 ----------------- doc/nestedvm.ivme04.tex | 708 +++++++++++++++++++++++++++++++++++++++++++ doc/performance.results.txt | 36 --- 4 files changed, 708 insertions(+), 310 deletions(-) create mode 100644 doc/IVME.xls delete mode 100644 doc/mips2java.tex create mode 100644 doc/nestedvm.ivme04.tex delete mode 100644 doc/performance.results.txt diff --git a/doc/IVME.xls b/doc/IVME.xls new file mode 100644 index 0000000000000000000000000000000000000000..e3bf8c7d31c5de9683fad409cc06bc8438c8afe0 GIT binary patch literal 38912 zcmeHQd2|%VnXjH1Nh5*KfG}XrMm9zql12xIu|NWZ790dJk{#Q@fk87!1CnMkGho59 zV>#fLIP1KXc`}-y@eufD4Ky874C=f5@k!qcB!_(S1vTZCJ@o-7pkmUs=<^wRl+ z_$yp6{CYB(RFOj9a@t)(5_k=~-igk0!+YTK;MwN|@Lu>r_#*fj@H63y;b*~@z(WPZ zZ1_3wrSNm%Z-So(KOcSp{LSzS;TOR#hF=1|6n+`J4!<0}4E`4QTj5v0-v+-Dz8roP z{A&0$@N3~K;49&);QjE`@HOzY@OAL(;Mc=%fZqszJNzd2JK*c#H^Xm%Z-8%v-wMAC zemi^<{0{hLcgA;P%s%|Wyr^>Ys7_#qEDvf7fqWQ6jdbrI1ac?hAV#2ES6i2BX8kG6jDdM}xV zEA(O%=jVyrD!*Rf!)%@p;rvjz#97ZvI12G2`28Zv@xL_~=+Rq3oq>*%`c-$^Bo6_N zJZ#oxf-@In?18rBix1;^7G@Tvq5p9#mB)XQxxb{!$XUrVqp$VnYd8;ZrFyrkQRaJ( zJeJ|8few#flx4mrzm&#F> z2a_(@z`wID59Z4(NjV5kaMA)Oe^zN^O&VXRoSl-jUDI{hwKuDva)@{AGF<9ejG+o6 zpzPMNTh`uki~qo?JNB2Gf&HtNdX{4#!yquDkp7l^q2L4dOl7b`U@@4PD*bjk2$R%G zSSn@CTn=hR)(tQwU@jaWE30*rGb6aaTt@6)b%zmAR#67N4wN?@KrNN9MPQ8_D7Thd zty>GN)@>ykV6njVuvVs8V6RWIW~s8FAzkTu7(Os@Qk9zJQ}vpKW!34D7o;&J#2=En zoT$`cP-i@6H5vbVGbwjGr@Bn}v&EBYru^4)ye4!u&u~hfdFVSShd(>xS7nJ`nJG*q!(GQiai-^q)<9;=U%*6NQLlMZzL|RG1PzN_3}7AO)eGcr?&)v`k); zs$QW7LmiP|NKbTy4#mR3(1GfjjsD6b(N0+Gmg2}53-ty=F}fZnu_Y0 z4SM9T4ram~dRw43uD7)5frK87gnJWuJbW?~*AK@cJ$hY5U7cPY?(OJ47Duh0)K-CEh+-kiRwRBEuQX_OtNdcShVy*uK1c}rU>)KAe~T^Z^NtxAeH z9aZ41G@PG+PUO2LubSn{4Q-k|(@zZ3j4Ho3~m2Ff?g#C0qDLGs5u17f2e;ePT% z=ehU@hVwK1qOTF1XK{U7H;TTVev$a$E69HaDco4$=0~G(y#Qxyx@eHcGlQb?q4C3=~eTP7O zIB!< zRloRxs9z1ule-GybJllIwr}m*geia79z*`**Py47XRb&rkFuc6^3VC$C$c?#r|$*5LB(&^ zS)`+Xo<0B0$DVo$*Met+hj{+Vn_l`mp1C5@82cAK+y2E27Zp=E?sYtb{JSw$iqR+yGl&a=zC>QnW;3AP;0{h8NXV_uaU=h$e4bqM2hAL7)z*MdlV@zc~}<*=P8-{v^+k|`w>m^(7w>5 zdZ>~@T%f+dU1&(2yBm3n#KY2#K$MR>2@k^_{JmJjFlusfzg6upsr^{5iygviR3KVE zAhg+naqJEicjki(Fplk0aiF9Xh}Yz9uG+s9;>&U`R+GEA{f*qyjmqOed2DU;82R&M z4(6xbQXoSEMozEH!@5f(2X64oR+orpcD(qz*Iv2YpkiEzd13hr4f)z@MvNOV$DUf? zxo4Ic<3Y@#s{g(<|0S)FoMJx;F{G_%u;z=W-}=a$Q`;L=jMqYgCD>jmM_GdXT_XOv z`)7aoZE|wr4+P#M)MM@l@`?frK_@|c|6h-ntx~$P9Ti<-* zr$0ISsoNS=dGf&WucH3Br?G&S(XbV1;=5U$@KwAVKmxyi5H-3BTPz!|#F5DkSu z+U^l)lQ@N!i}uaKXg7nlCbs%G@qHe3uErRigKm8Q#;BFrH<12uF#Knx!7q^|?A)so zI=EMTuZcrOE;hz!kob*Q>RXDoo1Jf`aSNn`M_p(KYcR{C^N^=yzSZf75w4bLBiFWE z%D_1|$b+>%gP5V*SgSq(337#RdCwkxk~s=y|3&?{Qc9^5yiY z+HS4)PUNqJ$#M$kH=*uFU64@p{*3jq&RY0P^{TPjh8x}9V4T${^o#p4&d_Rd4S||V@Tke7oLjKg#)hqpTxTa{ZOj*6)u2?mR~HZFB^@e0sbF7Djb zt~W(O@!pjQ-Pps|cVkO91Ox^jt4ISS^J1yY79KM%mRnqN#EY~&L?aJ|Tf&=Cunvd2 zQT!>~ms-WL@2`+f%~m1ay=^l0S6aj?OG-t3Ltjs~ejLvj!jaz1W!07bG94Szksuxy zY%aUIy}4pzS;Lm1`sGb~x3}NdxW}-%J&$a2UJxx zwKwUlyLYv<>mXKDwPR12Ue=XJL^oAcJ^0{*l>uh2?1=O*W4x+07Kw&pi4(g)wE_hy zgNa}n>QmL&_(E(j+>zK)R9JsBbYcrg7{sD6JOQcXLxtu@tS5lWmYtDU_#~cU1iJTy z;`LR`k77pg%;gi|j-#POB_5Ro@aV!UQ&=BQfPAO3Iz;sV&C(e{P9sNAVNSx~M5t$L zEEYJCCLLf9jG>^(S{523H;zQWdiR!EzrVgpUXgH4mO8q_q25G_A7ma&V{_B)i3FPh ziIA0e5aBH~et%tszqX>fy4}Bi)4G~X)z$acSIMLi`Hd$)0ndgytyIzXA#}qQwA#Fs z60I^tT3=<#8TE`-f%e8I8a|flH7iT?QAYK+asbI$w#JJidw`;LK^WTU0bWI<+JD+zbibLd?ES5 z%nufR%H5e)?!GKxc{{~Urn`?{s|M6ui|v@cJ^B) zDqqF*1HG@xl7@n;7i zb*cpMRV8BIPyX#c>+kxLfww1(duCO9?ju!-2018^Z&Q#WX_ZK{!K-+gvk^?T;G+IbOS+8SOWm;DB~V_MwT5Vf*xg5U3LULX-y1DkUdg}-!buu-?|se-GE z`05gy%uN_8(M@z!lCLK)bHJ6HU=7GE|j(G^vPdnlf6<}Xt z5M@GRIvGFpf@NJ%^|{_Kh*ML6u2U?Kqgkn0=NiHQ z0rxyLRmQ|wU5Bvw3{6X^&h&ycjX@j{6l5yU^!&qdW4xI`aLV^ zU#8(|*yy>yHTIW}9E)&Hs+lbRtiu2nf2u#z#=j*tBmXuVISn&)KCYiw09WQ$*>Z~F z%E4QfclENjlii#FPn!j7m?ip#!5~gKT{a%d-UtwdOfXAo?=%Kcu5IaQazfwaO!sOo5$^&7#3sz=m0(uTKo(@ZF-E z*4d;Os^;RJ4V*WzHtHL$yV;s)S>&fNh~}2Sge>yLz1(ncUdpO+qVsYNiVX0)!ZbcF zXHhFaFID+)gq|@XoB+M_JzvIKzO2nOZAWJ-b5Y6A$aEWX6|^?dtax|}&6_r76U}?w zYZ!y5ezk|VI${gya$^vOwqFSbk!AUeUez%G{NhnrFM)I_?tM%6)L$VGB~lM5Dl|5% zA|WLv(4a)b@A%(;STpFdU=@qs681jaaLMv(!u!r1)m>{!Em%eVo~sE&m^iE=TSFO? z5IQK^#@3hcyA9%1;OI`H{e3AM<17c3(Qm~vg28N9#z7tAP}n@DehnC9GGZCoSTJk% z#<8Foi)}JDhs{+*GdioBLTJXXV^$}c@hnDMCN!g&ZcM^9#NpLMGj7EqK{R7I%#V@K zjQlBU6U{i&(Yq@^W4Yy5ub_Z-!$`la$W?e_0^hDU zHkr6a`?o9d$u${=OTPsZ3ftd;QP9%i7HH91(O_vE;$&EdxG>_lgeZ%Vad{Stkqgpz zEJl3j!(uU7-%L?&w^-jy;fiX1Gll%{EeNZ{_|~5d-QTwQkCtyo{Ldfv%=3QwqUG(D zzr6a1&EG$v$?rd0M(J{C`I+B;;M*&WR*MhoQ#OkaS#2(5@u5sci;w31q+5LWM1@nK3W&F8C6WrVC$)IbX9CUw_)q4+cbWp z%)wAbD=!yAlWgVrZ0p9vT6wdF(avP${SB;wY*yZTuu9*UR$c@*!sB^orPL<#NwM;r zRZbx*?~B+vpq2ME%xsygJTu*#d9Cl2Tum$QSm@&cGQMzZpLjeMq+H^bilL!lI<e#X76z)kyeee0h{-dkMUT2{GgH9u_dDG$zMOzuY-^AMkfWD7h71XY-7ZGVbOf z#LuxF#<`r&P+?%TBA07uCt?m729~T3AJXP_oFDO{{v0dH(7?*^7Q5M4{=SJcx^SF| zjv}DQUdOM9aN%aNXx($m7sFS^Q!SZ3?XEuwypCVEau-nx?2a)P9{cETFYWE}edkij105%29Pg@H!X2?lJaRap@966Yb;~+L)wg%7DEJzF2P^Gz@$dTfH@^Mh z%0<4X-Ut5IUiu9AF9+jl%H=OjamWjCFFe$$P*_Ib^u6=F57u2Dxoa-)WZ3>Q+(xCSbp`0dnTzL8fXN71|xXGcHksjB3RvaxyBHZBgbk1N~ALM63o+D(=OG_hSo@IWVuFV-FsN5)(& dzq)Om6-PsWPpQw6GI@faj{|_I61ML6+ literal 0 HcmV?d00001 diff --git a/doc/mips2java.tex b/doc/mips2java.tex deleted file mode 100644 index 2c271df..0000000 --- a/doc/mips2java.tex +++ /dev/null @@ -1,274 +0,0 @@ -% -% FIXME: - Add something about size limits on the constant pool -% how we worked around that and the performance impact -% of -o lessconstants -% - Add something about encoding data sections as string constants -% and the UTF8 non-7-bit-ascii penalty -% - -\documentclass{acmconf} -\usepackage{graphicx} -\usepackage{amssymb,amsmath,epsfig,alltt} -\sloppy % better line breaks -\usepackage{palatino} -\usepackage{parskip} -\usepackage{tabularx} -\usepackage{alltt} -\bibliographystyle{alpha} - -\title{\textbf{\textsf{ -Running Legacy C/C++ Libraries in a Pure Java Environment -}}} -\date{} -\author{\begin{tabular}{@{}c@{}} - {\em {Brian Alliet}} \\ \\ - {{\it Affiliation Goes Here}}\relax - \end{tabular}\hskip 1in\begin{tabular}{@{}c@{}} - {\em {Adam Megacz}} \\ \\ - {UC Berkeley}\relax -\end{tabular}} -\begin{document} - -\maketitle - -\begin{abstract} -Abstract -\end{abstract} - -\section{Introduction} - -\subsection{Why would you want to do this?} - -The C programming language has been around for over 30 years now. -There is a huge library of software written in this language. By -contrast, Java has been around for less than ten years. Although it -offers substantial advantages over C, the set of libraries written in -this language still lags behind C/C++. - -The typical solution to this dilemma is to use JNI or CNI to invoke C -code from within a Java VM. Unfortunately, there are a number of -situations in which this is not an acceptable solution: - -\begin{itemize} -\item Java Applets are not permitted to invoke {\tt Runtime.loadLibrary()} -\item Java Servlet containers with a {\tt SecurityManager} will not - permit loading new JNI libraries. This configuration is popular - with {\it shared hosting} providers and corporate intranets - where a number of different parties contribute individual web - applications which are run together in a single container. -\item JNI requires the native library to be compiled ahead of time, - separately, for every architecture on which it will be deployed. - This is unworkable for situations in which the full set of - target architectures is not known at deployment time. -\item The increasingly popular J2ME platform does not support JNI or CNI. -\item Unlike Java Bytecode, JNI code is susceptible to buffer overflow - and heap corruption attacks. This can be a major security - vulnerability. -\item JNI often introduces undesirable added complexity to an - application. -\end{itemize} - -The technique we present here is based on using a typical ANSI C -compiler to compile C/C++ code into a MIPS binary, and then using a -tool to translate that binary on an instruction-by-instruction basis -into Java bytecode. - -The technique presented here is general; we anticipate that it can be -applied to other secure virtual machines such as Microsoft's .NET. - - -\section{Existing Work: Source-to-Source Translation} - -\begin{itemize} -\item c2java -\item commercial products -\end{itemize} - -\section{Mips2Java: Binary-to-Binary Translation} - -We present Mips2Java, a binary-to-binary translation tool to convert -MIPS binaries into Java bytecode files. - -The process of utilizing Mips2Java begins by using any compiler for -any language to compile the source library into a statically linked -MIPS binary. We used {\tt gcc 3.3.3}, but any compiler which can -target the MIPS platform should be acceptable. The binary is -statically linked with a system library (in the case of C code this is -{\tt libc}) which translates system requests (such as {\tt open()}, -{\tt read()}, or {\tt write()}) into appropriate invocations of the -MIPS {\tt SYSCALL} instruction. - -The statically linked MIPS binary is then fed to the Mips2Java tool -(which is itself written in Java), which emits a sequence of Java -Bytecodes in the form of a {\tt .class} file equivalent to the -provided binary. This {\tt .class} file contains a single class which -implements the {\tt Runtime} interface. This class may then be -instantiated; invoking the {\tt execute()} method is equivalent to -invoking the {\tt main()} function in the original binary. - - -\subsection{Why MIPS?} - -We chose MIPS as a source format for two primary reasons: the -availability of tools to translate legacy code into MIPS binaries, and -the close similarity between the MIPS ISA and the Java Virtual Machine. - -The MIPS architecture has been around for quite some time, and is well -supported by the GNU Compiler Collection, which is capable of -compiling C, C++, Java, Fortran, Pascal (with p2c), and Objective C -into MIPS binaries. - -The MIPS R1000 ISA bears a striking similarity to the Java Virtual -Machine. This early revision of the MIPS supports only 32-bit aligned -loads and stores from memory, which is precisely the access model -supported by Java for {\tt int[]}s. - -Cover dynamic page allocation. - -Cover stack setup. - -Brian, are there any other fortunate similarities we should mention -here? I seem to remember there being a bunch, but I can't recall them -right now; it's been a while since I dealt with this stuff in detail. - - -\subsection{Interpreter} - -\begin{itemize} -\item slow, but easy to write -\end{itemize} - - -\subsection{Compiling to Java Source} -\begin{itemize} -\item performance boost -\item pushes the limits of {\tt javac} and {\tt jikes} -\end{itemize} - -\subsection{Compiling directly to Java Bytecode} -\begin{itemize} -\item further performance boost (quantify) -\item brian, can you add any comments here? -\end{itemize} - -\subsection{Interfacing with Java Code} - -Java source code can create a copy of the translated binary by -instantiating the corresponding class, which extends {\tt Runtime}. -Invoking the {\tt main()} method on this class is equivalent to -calling the {\tt main()} function within the binary; the {\tt String} -arguments to this function are copied into the binary's memory space -and made available as {\tt argv**} and {\tt argc}. - -The translated binary communicates with the rest of the VM by -executing MIPS {\tt SYSCALL} instructions, which are translated into -invocations of the {\tt syscall()} method. This calls back to the -native Java world, which can manipulate the binary's environment by -reading and writing to its memory space, checking its exit status, -pausing the VM, and restarting the VM. - - -\subsection{Virtualization} - -The {\tt Runtime} class implements the majority of the standard {\tt -libc} syscalls, providing a complete interface to the filesystem, -network socket library, time of day, (Brian: what else goes here?). - -\begin{itemize} -\item ability to provide the same interface to CNI code and mips2javaified code -\item security advantages (chroot the {\tt fork()}ed process) -\end{itemize} - -\section{Related Work} - -\subsection{Source-to-Source translators} - -A number of commercial products and research projects attempt to -translate C++ code to Java code, preserving the mapping of C++ classes -to Java classes. Unfortunately this is problematic since there is no -way to do pointer arithmetic except within arrays, and even in that -case, arithmetic cannot be done in terms of fractional objects. - -Mention gcc backend - -c2java - -Many of these products advise the user to tweak the code which results -from the translation. Unfortunately, hand-modifying machine-generated -code is generally a bad idea, since this modification cannot be -automated. This means that every time the origin code changes, the -code generator must be re-run, and the hand modifications must be -performed yet again. This is an error-prone process. - -Furthermore, Mips2Java does not attempt to read C code directly. This -frees it from the complex task of faithfully implementing the ANSI C -standard (or, in the case of non ANSI-C compliant code, some other -interface). This also saves the user the chore of altering their -build process to accomodate Mips2Java. - - -\section{Performance} - -\subsection{Charts} - -(Note that none of these libraries have pure-Java equivalents.) - -\begin{itemize} -\item libjpeg -\item libfreetype -\item libmspack -\end{itemize} - - -\subsection{Optimizations} - -Brian, can you write something to go here? Just mention which -optimizations helped and which ones hurt. - -\begin{itemize} -\item trampoline -\item optimal method size -\item -msingle-float -\item -mmemcpy -\item fastmem -\item local vars for registers (useless) -\item -fno-rename-registers -\item -ffast-math -\item -fno-trapping-math -\item -fsingle-precision-constant -\item -mfused-madd -\item -freg-struct-return -\item -freduce-all-givs -\item -fno-peephole -\item -fno-peephole2 -\item -fmove-all-movables -\item -fno-sched-spec-load -\item -fno-sched-spec -\item -fno-schedule-insns -\item -fno-schedule-insns2 -\item -fno-delayed-branch -\item -fno-function-cse -\item -ffunction-sections -\item -fdata-sections -\item array bounds checking -\item -falign-functions=n -\item -falign-labels=n -\item -falign-loops=n -\item -falign-jumps=n -\item -fno-function-cse -\end{itemize} - -\section{Future Directions} - -World domination. - -\section{Conclusion} - -We rock the hizzouse. - -\section{References} - -Yer mom. - -\end{document} - diff --git a/doc/nestedvm.ivme04.tex b/doc/nestedvm.ivme04.tex new file mode 100644 index 0000000..b4c96a1 --- /dev/null +++ b/doc/nestedvm.ivme04.tex @@ -0,0 +1,708 @@ +% +% FIXME: - Add something about size limits on the constant pool +% how we worked around that and the performance impact +% of -o lessconstants +% - Add something about encoding data sections as string constants +% and the UTF8 non-7-bit-ascii penalty +% + +\documentclass{acmconf} +\usepackage{graphicx} +\usepackage{amssymb,amsmath,epsfig,alltt} +\sloppy % better line breaks +\usepackage{palatino} +\usepackage{parskip} +\usepackage{tabularx} +\usepackage{alltt} +\bibliographystyle{alpha} + +\title{\textbf{\textsf{ +Running Legacy C/C++ Libraries in a Pure Java Environment +}}} +\date{} +\author{\begin{tabular}{@{}c@{}} + {\em {Brian Alliet}} \\ \\ + {{\it Affiliation Goes Here}}\relax + \end{tabular}\hskip 1in\begin{tabular}{@{}c@{}} + {\em {Adam Megacz}} \\ \\ + {UC Berkeley}\relax +\end{tabular}} +\begin{document} + +\maketitle + +\begin{abstract} +Abstract +\end{abstract} + +\section{Introduction} + +\subsection{Why would you want to do this?} + +The C programming language has been around for over 30 years now. +There is a huge library of software written in this language. By +contrast, Java has been around for less than ten years. Although it +offers substantial advantages over C, the set of libraries written in +this language still lags behind C/C++. + +The typical solution to this dilemma is to use JNI or CNI to invoke C +code from within a Java VM. Unfortunately, there are a number of +situations in which this is not an acceptable solution: + +\begin{itemize} +\item Java Applets are not permitted to invoke {\tt Runtime.loadLibrary()} +\item Java Servlet containers with a {\tt SecurityManager} will not + permit loading new JNI libraries. This configuration is popular + with {\it shared hosting} providers and corporate intranets + where a number of different parties contribute individual web + applications which are run together in a single container. +\item JNI requires the native library to be compiled ahead of time, + separately, for every architecture on which it will be deployed. + This is unworkable for situations in which the full set of + target architectures is not known at deployment time. +\item The increasingly popular J2ME platform does not support JNI or CNI. +\item Unlike Java Bytecode, JNI code is susceptible to buffer overflow + and heap corruption attacks. This can be a major security + vulnerability. +\item JNI often introduces undesirable added complexity to an + application. +\end{itemize} + +The technique we present here is based on using a typical ANSI C +compiler to compile C/C++ code into a MIPS binary, and then using a +tool to translate that binary on an instruction-by-instruction basis +into Java bytecode. + +The technique presented here is general; we anticipate that it can be +applied to other secure virtual machines such as Microsoft's .NET. + + +\section{Existing Work: Source-to-Source Translation} + +\begin{itemize} +\item c2java +\item commercial products +\end{itemize} + +\section{Mips2Java: Binary-to-Binary Translation} + +We present Mips2Java, a binary-to-binary translation tool to convert +MIPS binaries into Java bytecode files. + +The process of utilizing Mips2Java begins by using any compiler for +any language to compile the source library into a statically linked +MIPS binary. We used {\tt gcc 3.3.3}, but any compiler which can +target the MIPS platform should be acceptable. The binary is +statically linked with a system library (in the case of C code this is +{\tt libc}) which translates system requests (such as {\tt open()}, +{\tt read()}, or {\tt write()}) into appropriate invocations of the +MIPS {\tt SYSCALL} instruction. + +The statically linked MIPS binary is then fed to the Mips2Java tool +(which is itself written in Java), which emits a sequence of Java +Bytecodes in the form of a {\tt .class} file equivalent to the +provided binary. This {\tt .class} file contains a single class which +implements the {\tt Runtime} interface. This class may then be +instantiated; invoking the {\tt execute()} method is equivalent to +invoking the {\tt main()} function in the original binary. + + +\subsection{Why MIPS?} + +We chose MIPS as a source format for two primary reasons: the +availability of tools to translate legacy code into MIPS binaries, and +the close similarity between the MIPS ISA and the Java Virtual Machine. + +The MIPS architecture has been around for quite some time, and is well +supported by the GNU Compiler Collection, which is capable of +compiling C, C++, Java, Fortran, Pascal (with p2c), and Objective C +into MIPS binaries. + +The MIPS R1000 ISA bears a striking similarity to the Java Virtual +Machine. This early revision of the MIPS supports only 32-bit aligned +loads and stores from memory, which is precisely the access model +supported by Java for {\tt int[]}s. + +Cover dynamic page allocation. + +Cover stack setup. + +Brian, are there any other fortunate similarities we should mention +here? I seem to remember there being a bunch, but I can't recall them +right now; it's been a while since I dealt with this stuff in detail. + + +\subsection{Interpreter} + +\begin{itemize} +\item slow, but easy to write +\end{itemize} + + +\subsection{Compiling to Java Source} +\begin{itemize} +\item performance boost +\item pushes the limits of {\tt javac} and {\tt jikes} +\end{itemize} + +\subsection{Compiling directly to Java Bytecode} +\begin{itemize} +\item further performance boost (quantify) +\item brian, can you add any comments here? +\end{itemize} + +\subsection{Interfacing with Java Code} + +Java source code can create a copy of the translated binary by +instantiating the corresponding class, which extends {\tt Runtime}. +Invoking the {\tt main()} method on this class is equivalent to +calling the {\tt main()} function within the binary; the {\tt String} +arguments to this function are copied into the binary's memory space +and made available as {\tt argv**} and {\tt argc}. + +The translated binary communicates with the rest of the VM by +executing MIPS {\tt SYSCALL} instructions, which are translated into +invocations of the {\tt syscall()} method. This calls back to the +native Java world, which can manipulate the binary's environment by +reading and writing to its memory space, checking its exit status, +pausing the VM, and restarting the VM. + + +\subsection{Virtualization} + +The {\tt Runtime} class implements the majority of the standard {\tt +libc} syscalls, providing a complete interface to the filesystem, +network socket library, time of day, (Brian: what else goes here?). + +\begin{itemize} +\item ability to provide the same interface to CNI code and mips2javaified code +\item security advantages (chroot the {\tt fork()}ed process) +\end{itemize} + +\section{Related Work} + +\subsection{Source-to-Source translators} + +A number of commercial products and research projects attempt to +translate C++ code to Java code, preserving the mapping of C++ classes +to Java classes. Unfortunately this is problematic since there is no +way to do pointer arithmetic except within arrays, and even in that +case, arithmetic cannot be done in terms of fractional objects. + +Mention gcc backend + +c2java + +Many of these products advise the user to tweak the code which results +from the translation. Unfortunately, hand-modifying machine-generated +code is generally a bad idea, since this modification cannot be +automated. This means that every time the origin code changes, the +code generator must be re-run, and the hand modifications must be +performed yet again. This is an error-prone process. + +Furthermore, Mips2Java does not attempt to read C code directly. This +frees it from the complex task of faithfully implementing the ANSI C +standard (or, in the case of non ANSI-C compliant code, some other +interface). This also saves the user the chore of altering their +build process to accomodate Mips2Java. + + +\section{Performance} + +\subsection{Charts} + +(Note that none of these libraries have pure-Java equivalents.) + +\begin{itemize} +\item libjpeg +\item libfreetype +\item libmspack +\end{itemize} + + +\subsection{Optimizations} + +Brian, can you write something to go here? Just mention which +optimizations helped and which ones hurt. + +\begin{itemize} +\item trampoline +\item optimal method size +\item -msingle-float +\item -mmemcpy +\item fastmem +\item local vars for registers (useless) +\item -fno-rename-registers +\item -ffast-math +\item -fno-trapping-math +\item -fsingle-precision-constant +\item -mfused-madd +\item -freg-struct-return +\item -freduce-all-givs +\item -fno-peephole +\item -fno-peephole2 +\item -fmove-all-movables +\item -fno-sched-spec-load +\item -fno-sched-spec +\item -fno-schedule-insns +\item -fno-schedule-insns2 +\item -fno-delayed-branch +\item -fno-function-cse +\item -ffunction-sections +\item -fdata-sections +\item array bounds checking +\item -falign-functions=n +\item -falign-labels=n +\item -falign-loops=n +\item -falign-jumps=n +\item -fno-function-cse +\end{itemize} + +\section{Future Directions} + +World domination. + +\section{Conclusion} + +We rock the hizzouse. + +\section{References} + +Yer mom. + +\section{stuff} +\begin{verbatim} + +libjpeg (render thebride_1280.jpg) +Native - 0.235s +JavaSource - 1.86s +ClassFile - 1.37s + +freetype (rendering characters 32-127 of Comic.TTF at sizes from 8 to +48 incrementing by 4) +Native - 0.201s +JavaSource - 2.02s +ClassFile - 1.46s + +Section 3.2 - Interpreter +The Interpreter was the first part of Mips2Java to be written. This was the most +straightforward and simple way to run MIPS binaries inside the JVM. The simplicity of the +interpreter also made it very simple to debug. Debugging machine-generated code is a pain. +Most importantly, the interpreter provided a great reference to use when developing the +compiler. With known working implementations of each MIPS instruction in Java writing a +compiler became a matter of simple doing the same thing ahead of time. +With the completion of the compiler the interpreter in Mips2Java has become less useful. +However, it may still have some life left in it. One possible use is remote debugging with +GDB. Although debugging the compiler generated JVM code is theoretically possible, it +would be far easier to do in the interpreter. The interpreter may also be useful in cases +where size is far more important than speed. The interpreter is very small. The interpreter +and MIPS binary combined are smaller than the compiled classfiles. +Section 3.3 - Compiling to Java Source +The next major step in Mips2JavaÕs development was the Java source compiler. This +generated Java source code (compliable with javac or Jikes) from a MIPS binary. +Generating Java source code was preferred initially over JVM bytecode for two reasons. +The authors werenÕt very familiar with JVM bytecode and therefore generating Java source +code was simpler. Generating source code also eliminated the need to do trivial +optimizations in the Mips2java compiler that javac and Jikes already do. This mainly +includes 2+2=4 stuff. For example, the MIPS register r0 is immutable and always 0. This +register is represented by a static final int in the Java source compiler. Javac and Jikes +automatically handle optimizing this away when possible. In the JVM bytecode compiler +these optimizations needs to be done in Mips2Java. +Early versions of the Mips2Java compiler were very simple. All 32 MIPS GPRs and a +special PC register were fields in the generated java class. There was a run() method +containing all the instructions in the .text segment converted to Java source code. A switch +statement was used to allow jumps from instruction to instruction. The generated code +looked something like this. +private final static int r0 = 0; +private int r1, r2, r3,...,r30; +private int r31 = 0xdeadbeef; +private int pc = ENTRY_POINT; + +public void run() { + for(;;) { + switch(pc) { + case 0x10000: + r29 = r29 Ð 32; + case 0x10004: + r1 = r4 + r5; + case 0x10008: + if(r1 == r6) { + /* delay slot */ + r1 = r1 + 1; + pc = 0x10018: + continue; + } + case 0x1000C: + r1 = r1 + 1; + case 0x10010: + r31 = 0x10018; + pc = 0x10210; + continue; + case 0x10014: + /* nop */ + case 0x10018: + pc = r31; + continue; + ... + case 0xdeadbeef: + System.err.println("Exited from ENTRY_POINT"); + System.err.println("R2: " + r2); + System.exit(1); + } + } +} + +This worked fine for small binaries but as soon as anything substantial was fed to it the 64k +JVM method size limit was soon hit. The solution to this was to break up the code into +many smaller methods. This required a trampoline to dispatch jumps to the appropriate +method. With the addition of the trampoline the generated code looked something like this: +public void run_0x10000() { + for(;;) { + switch(pc) { + case 0x10000: + ... + case 0x10004: + ... + ... + case 0x10010: + r31 = 0x10018; + pc = 0x10210; + return; + ... + } + } +} + +pubic void run_0x10200() { + for(;;) { + switch(pc) { + case 0x10200: + ... + case 0x10204: + ... + } + } +} + +public void trampoline() { + for(;;) { + switch(pc&0xfffffe00) { + case 0x10000: run_0x10000(); break; + case 0x10200: run_0x10200(); break; + case 0xdeadbe00: + ... + } + } +} +With this trampoline in place somewhat large binaries could be handled without much +difficulty. There is no limit on the size of a classfile as a whole, just individual methods. +This method should scale well. However, there are other classfile limitations that will limit +the size of compiled binaries. +Another interesting problem that was discovered while creating the trampoline method was +javac and JikesÕ incredible stupidity when dealing with switch statements. The follow code +fragment gets compiled into a lookupswich by javac: +Switch(pc&0xffffff00) { + Case 0x00000100: run_100(); break; + Case 0x00000200: run_200(); break; + Case 0x00000300: run_300(); break; +} +while this nearly identical code fragment gets compiled to a tableswitch +switch(pc>>>8) { + case 0x1: run_100(); break + case 0x2: run_200(); break; + case 0x3: run_300(); break; +} +Javac isnÕt smart enough to see the patter in the case values and generates very suboptimal +bytecode. Manually doing the shifts convinces javac to emit a tableswitch statement, which +is significantly faster. This change alone nearly doubled the speed of the compiled binary. +Finding the optimal method size lead to the next big performance increase. It was +determined with experimentation that the optimal number of MIPS instructions per method +is 128 (considering only power of two options). Going above or below that lead to +performance decreases. This is most likely due to a combination of two factors. +_ The two levels of switch statements jumps have to pass though Ð The first switch +statement jumps go through is the trampoline switch statement. This is implemented +as a TABLESWITCH in JVM bytecode so it is very fast. The second level switch +statement in the individual run_ methods is implemented as a LOOKUPSWITCH, +which is much slower. Using smaller methods puts more burden on the faster +TABLESWITCH and less on the slower LOOKUPSWITCH. +_ JIT compilers probably favor smaller methods smaller methods are easier to compile +and are probably better candidates for JIT compilation than larger methods. +FIXME: Put a chart here +The next big optimization was eliminating unnecessary case statements. Having case +statements before each instruction prevents JIT compilers from being able to optimize +across instruction boundaries. In order to eliminate unnecessary case statements every +possible address that could be jumped to directly needed to be identified. The sources for +possible jump targets come from 3 places. +_ The .text segment Ð Every instruction in the text segment in scanned for jump +targets. Every branch instruction (BEQ, JAL, etc) has its destination added to the list +of possible branch targets. In addition, functions that set the link register have +theirpc+8 added to the list (the address that wouldÕve been put to the link register). +Finally, combinations of LUI (Load Upper Immediate) of ADDIU (Add Immediate +Unsigned) are scanned for possible addresses in the text segment. This combination +of instructions is often used to load a 32-bit word into a register. +_ The .data segment Ð When GCC generates switch() statements it often uses a jump +table stored in the .data segment. Unfortunately gcc doesnÕt identify these jump +tables in any way. Therefore, the entire .data segment is conservatively scanned for +possible addresses in the .text segment. +_ The symbol table Ð This is mainly used as a backup. Scanning the .text and .data +segments should identify any possible jump targets but adding every function in the +symbol table in the ELF binary doesnÕt hurt. This will also catch functions that are +never called directly from the MIPS binary (for example, functions called with the +call() method in the runtime). +Eliminating unnecessary case statements provided a 10-25% speed increase . +Despite all the above optimizations and workaround an impossible to workaround hard +classfile limit was eventually hit, the constant pool. The constant pool in classfiles is limited +to 65536 entries. Every Integer with a magnitude greater than 32767 requires an entry in the +constant pool. Every time the compiler emits a jump/branch instruction the PC field is set to +the branch target. This means nearly every branch instruction requires an entry in the +constant pool. Large binaries hit this limit fairly quickly. One workaround that was +employed in the Java source compiler was to express constants as offsets from a few central +values. For example: "pc = N_0x00010000 + 0x10" where N_0x000100000 is a non- +final field to prevent javac from inlining it. This was sufficient to get reasonable large +binaries to compile. It has a small (approximately 5%) performance impact on the generated +code. It also makes the generated classfile somewhat larger. Fortunately, the classfile +compiler eliminates this problem. +3.4 Ð Bytecode compiler +The next step in the evolution of Mips2Java was to compile directly to JVM bytecode +eliminating the intermediate javac step. This had several advantages +_ There are little tricks that can be done in JVM bytecode that canÕt be done in Java +source code. +_ Eliminates the time-consuming javac step Ð Javac takes a long time to parse and +compile the output from the java source compiler. +_ Allows for MIPS binaries to be compiled and loaded into a running VM using a +class loader. This eliminates the need to compile the binaries ahead of time. +By generating code at the bytecode level there are many areas where the compiler can be +smarter than javac. Most of the areas where improvements where made where in the +handling of branch instructions and in taking advantage of the JVM stack to eliminate +unnecessary LOADs and STOREs to local variables. +The first obvious optimization that generating bytecode allows for is the use of GOTO. +Despite the fact that java doesnÕt have a GOTO keyword a GOTO bytecode does exist and +is used heavily in the code generates by javac. Unfortunately the java language doesnÕt +provide any way to take advantage of this. As result of this jumps within a method were +implemented by setting the PC field to the new address and making a trip back to the initial +switch statement. In the classfile compiler these jumps are implemented as GOTOs directly +to the target instruction. This saves a costly trip back through the LOOKUPSWITCH +statement and is a huge win for small loops within a method. +Somewhat related to using GOTO is the ability to optimize branch statements. In the Java +source compiler branch statements are implemented as follows (delay slots are ignored for +the purpose of this example) +If(condition) { pc = TARGET; continue; } +This requires a branch in the JVM regardless of whether the MIPS branch is actually taken. +If condition is false the JVM has to jump over the code to set the PC and go back to the +switch block. If condition is true the JVM as to jump to the switch block. By generating +bytecode directly we can make the target of the JVM branch statement the actual bytecode +of the final destination. In the case where the branch isnÕt taken the JVM doesnÕt need to +branch at all. +A side affect of the above two optimizations is a solution to the excess constant pool entries +problem. When jumps are implemented as GOTOs and direct branches to the target the PC +field doesnÕt need to be set. This eliminates many of the constant pool entries the java +source compiler requires. The limit is still there however, and given a large enough binary it +will still be reached. +Delay slots are another area where things are done somewhat inefficiently in the Java source +compiler. In order to take advantage of instructions already in the pipeline MIPS cpu have a +"delay slot". That is, an instruction after a branch or jump instruction that is executed +regardless of whether the branch is taken. This is done because by the time the branch or +jump instruction is finished being processes the next instruction is already ready to be +executed and it is wasteful to discard it. (However, newer MIPS CPUs have pipelines that +are much larger than early MIPS CPUs so they have to discard many instructions anyway.) +As a result of this the instruction in the delay slot is actually executed BEFORE the branch +is taken. To make things even more difficult, values from the register file are loaded +BEFORE the delay slot is executed. Here is a small piece of MIPS assembly: +ADDIU r2,r0,-1 +BLTZ r2, target +ADDIU r2,r2,10 +... +:target +This piece of code is executed as follows +1. r2 is set to Ð1 +2. r2 is loaded from the register file by the BLTEZ instruction +3. 10 is added to r2 by the ADDIU instruction +4. The branch is taken because at the time the BLTZ instruction was executed r2 was +Ð1, but r2 is now 9 (-1 + 10) +There is a very element solution to this problem when using JVM bytecode. When a branch +instruction is encountered the registers needed for the comparison are pushed onto the stack +to prepare for the JVM branch instruction. Then, AFTER the values are on the stack the +delay slot is emitted, and then finally the actual JVM branch instruction. Because the values +were pushed to the stack before the delay slot was executed any changes the delay slot made +to the registers are not visible to the branch bytecode. This allows delay slots to be used +with no performance penalty or size penalty. +One final advantage that generating bytecode directly allows is smaller more compact +bytecode. All the optimization above lead to smaller bytecode as a side effect. There are also +a few other areas where the generated bytecode can be optimized for size with more +knowledge of the program as a whole. +When encountering the following switch block both javac and Jikes generate redundant +bytecode. +Switch(pc>>>8) { + Case 0x1: run_1(); break; + Case 0x2: run_2(); break + ... + case 0x100: run_100(); break; +} +The first bytecode in each case arm in the switch statement is ALOAD_0 to prepare for a +invoke special call. By simple moving this outside the switch statement each case arm was +reduced in size by one instruction. Similar optimizations were also done in other parts of the +compiler. + +Section 3 +- Adam - The method is run(), not execute. Execute() is only used when you need to +resume from a pause syscall. + +Section 3.1 +- Adam - Even the R1000 supports LB/SB/LH/SH/LWL/LWR Ð saying it only supports +32-bit aligned loads is incorrect. +- Other similarities +o All the branching instructions in MIPS directly map to single JVM instructions. +o Most of the ALU instructions map to single JVM instructions. + +(I can write up some stuff for each of these next several sections if you want) +Section 3.2 - Interpreter +- Originally written mainly to understand the MIPS instruction set +- Far easier to debug than an ahead of time compiler (no waiting, can throw in quick +hacks like if(pc >= 0xbadc0de && pc <= 0xbadcfff) debugOutput() ), donÕt need to +understand most of the ELF format) +- Still could be useful +o for GDB remote debugging +o cases where size is more important than speed (size of interpreter + size of mips +binary < size of compiled binary or size of compiler + mips binary) +o code which dynamically generates code (JIT compilers, etc). The ahead of time +compiler canÕt possibly handle this + +Section 3.3 Ð Compiling to Java Source +- First version of an ahead of time compiler +- Huge performance boost +- Java source output preferred for the 2+2=4 type optimizations java compilers do +- Worked well for small binaries Ð large MIPS binaries required ridiculous amounts of +memory to compile and often created invalid classfiles +- Too many constants Ð every jump operation required an entry in the constant pool (pc = +0xabcd1234; continue; ) + +Section 3.4 Ð Compiling directly to JVM Bytecode +- Another jump in performance +- More efficient code can be created at the bytecode level +o Information can be stored on the JVM stack rather than in local variables +_ Javac/jikes often unnecessarily use local variables +long tmp = a*b; +lo = (int)tmp; +hi = (int)(tmp>>>32) +does a store and two loads when a simple DUP would suffice +o GOTO can be used to branch directly to branch destinations in the same method +rather than going through the switch block again. +o BEQ, BGTZ, BLE, etc can jump directly to destination rather than doing +if(condition) { pc=0xabcd1234; continue; } +o Eliminates excess constant pool entries (only jump outside the current method +require a constant pool entry) +o Delay slots implemented more efficiently. +_ Java source compiler does: +if(condition) { /* delay slot /; pc = 0xabcd1234; continue; } +/* delay slot again */ +_ This is required because the delay slot can change the registers used in +condition. The registers need to be read BEFORE the delay slot in executed. +_ In the bytecode compiler the registers used in the condition are pushed to the +stack, then the delay slot is executed, and finally the comparison is done. +This eliminates the needs to output the delay slot twice. +- Smaller bytecode +o Everything mentioned above makes it smaller and faster +o Javac/jikes add redundant code +_ Switch(a) { + Case 1: Run_1000(); break; + Case 2: run_2000(); break; +} +Gets compiled into +1 ÐTableswitch É. +2 Ð ALOAD_0 +3 Ð invokespecial run_1000 +4 Ð goto end +5 Ð ALOAD_0 +6 Ð invokespecial run_2000 +10 Ð goto end +ALOAD_0 is unnecessarily put in each switch arm + +3.5 Interfacing with Java Code +- _call_java ()/Runtime.call() +o _call_java () - Call a java method from mips +o Runime.call() Ð call a mips method from java +o Easily allocate memory within the binaryÕs memory space by calling libcÕs malloc() +o Can go back and forth between mips and java (java calls mips which calls java which +calls back into mips) +- Java Strings can easily be converted to and from null terminated strings in the processÕ +memory space +- Java InputStreams, OutputStreams, and Sockets can easily be turned in to standard +UNIX file descriptors (and ANSI FILE*s) +- Can easily create custom filedescriptors and have full control over all operations on +them (read, write, seek, close, fstat, etc) +- Int User_info[] Ð optional chunk of memory can very easily be accessed from java +(Runtime.getUserInfo/setUserInfo) + +3.6 Virtualization +- Adam Ð we actually donÕt support sockets directly yet Ð you should probably take that +out. (But you can create a socket in java and expose it to mips as a filedescriptor) +- Runtime services +o Provides a easy to use interface to subclasses (Interpreter or compiles binaries) +_ Subclasses only know how to execute instructions +_ Runtime handles setting up registers/stack for execution to begin and +extracting return values and the exit status from the process +o Memory management +_ Sets up stack and guard pages +_ Allocates pages with the sbrk syscall +_ Provide easy an memcpy like interface for accessing the processes memory +from java +o Runtime.call() support Ð sets up registers,etc to prepare the process for a call into it +from java +o Filesystem Ð open/close/read/write/seek/fcntl s syscalls +o Time related functions Ð sleep, gettimeofday, times syscall +o UnixRuntime provides a more complete unix-like environment (Runtime smaller +though) +_ Supports fork() and waitpid() +_ Pipe() for IPC +_ More advocated filesystem interface +_ All filesystem operations abstracted away into a FileSystem class +o FileSystem class can be written that exposes a zip file, +directory on an http server, etc as a filesystem +_ Chdir/getcwd +_ /dev filesystem +_ stat() +_ directory listing support +5.1 Charts +IÕll put together some charts tonight +5.2 Optimizations +And finish this part + +Let me know if this was what you were looking for + + libjpeg libmspack libfreetype +Interpreted MIPS Binary 22.2 12.9 21.4 +Compled MIPS Binary (fastest options) 3.39 2.23 4.31 +Native -O3 0.235 0.084 0.201 + +Compled - with all case statements 3.50 2.30 4.99 +Compiled - with pruned case statement 3.39 2.23 4.31 + +Compiled - 512 instructions/method 62.7 27.7 56.9 +Compiled - 256 instructions/method 3.54 2.55 4.43 +Compiled - 128 instructions/method 3.39 2.23 4.31 +Compiled - 64 instructions/method 3.56 2.31 4.40 +Compiled - 32 instruction/method 3.71 2.46 4.64 + +Compild MIPS Binary (Server VM) 3.21 2.00 4.54 +Compiled MIPS Binary (Client VM) 3.39 2.23 4.31 + +All times are measured in seconds. These were all run on a dual 1ghz G4 +running OS X 10.3.1 with Apple's latest VM (JDK 1.4.1_01-27). Each test +was run 8 times within a single VM. The highest and lowest times were +removed and the remaining 6 were averaged. In each case only the first +run differed significantly from the rest. + +The libjpeg test consisted of decoding a 1280x1024 jpeg +(thebride_1280.jpg) and writing a tga. The mspack test consisted of +extracting all members from arial32.exe, comic32.exe, times32.exe, and +verdan32.exe. The freetype test consisted of rendering characters +32-127 of Comic.TTF at sizes from 8 to 48 incrementing by 4. (That is +about 950 individual glyphs). + +I can provide you with the source for any of these test if you'd like. + +-Brian +\end{verbatim} + +\end{document} + diff --git a/doc/performance.results.txt b/doc/performance.results.txt deleted file mode 100644 index f35f050..0000000 --- a/doc/performance.results.txt +++ /dev/null @@ -1,36 +0,0 @@ -Let me know if this was what you were looking for - - libjpeg libmspack libfreetype -Interpreted MIPS Binary 22.2 12.9 21.4 -Compled MIPS Binary (fastest options) 3.39 2.23 4.31 -Native -O3 0.235 0.084 0.201 - -Compled - with all case statements 3.50 2.30 4.99 -Compiled - with pruned case statement 3.39 2.23 4.31 - -Compiled - 512 instructions/method 62.7 27.7 56.9 -Compiled - 256 instructions/method 3.54 2.55 4.43 -Compiled - 128 instructions/method 3.39 2.23 4.31 -Compiled - 64 instructions/method 3.56 2.31 4.40 -Compiled - 32 instruction/method 3.71 2.46 4.64 - -Compild MIPS Binary (Server VM) 3.21 2.00 4.54 -Compiled MIPS Binary (Client VM) 3.39 2.23 4.31 - -All times are measured in seconds. These were all run on a dual 1ghz G4 -running OS X 10.3.1 with Apple's latest VM (JDK 1.4.1_01-27). Each test -was run 8 times within a single VM. The highest and lowest times were -removed and the remaining 6 were averaged. In each case only the first -run differed significantly from the rest. - -The libjpeg test consisted of decoding a 1280x1024 jpeg -(thebride_1280.jpg) and writing a tga. The mspack test consisted of -extracting all members from arial32.exe, comic32.exe, times32.exe, and -verdan32.exe. The freetype test consisted of rendering characters -32-127 of Comic.TTF at sizes from 8 to 48 incrementing by 4. (That is -about 950 individual glyphs). - -I can provide you with the source for any of these test if you'd like. - --Brian - -- 1.7.10.4