ship: Memory

== Ports ===========================================================
data  in:    inCBD
data  in:    inAddrRead
data  in:    inAddrWrite
data  in:    inDataWrite
data  in:    inStride
data  in:    inCount

data  out:   out

== TeX ==============================================================

The {\tt Memory} ship represents an interface to a storage space,
which can be read from or written to.  This storage space might be a
fast on-chip cache, off-chip DRAM, or perhaps even a disk drive.

There may be multiple {\tt Memory} ships which interface to the same
physical storage space.  An implementation of Fleet must provide
additional documentation to the programmer indicating which {\tt
Memory} ships correspond to which storage spaces.  A single {\tt
Memory} ship may also access a ``virtual storage space'' formed by
concatenating multiple physical storage spaces.

\subsection*{Code Bag Fetch}

When a word appears at the {\tt inCBD} port, it is treated as a {\it
code bag descriptor}, as shown below:

\begin{center}
\setlength{\bitwidth}{3mm}
{\tt
\begin{bytefield}{37}
  \bitheader[b]{36,6,5,0}\\
  \bitbox{31}{Address} 
  \bitbox{6}{size} 
\end{bytefield}
}
\end{center}

When a word arrives at the {\tt inCBD} port, it is treated as a memory
read with {\tt inAddrRead=Address}, {\tt inStride=1}, and {\tt
inCount=size}; the words fetched are dispatched as instructions.

\subsection*{Reading}

When a word is delivered to {\tt inAddrRead}, the word residing in
memory at that address is provided at {\tt out}.

\subsection*{Writing}

When words have been delivered to both {\tt inAddrWrite} and {\tt
inDataWrite}, the word at {\tt inDataWrite} is written to the address
specified by {\tt inAddrWrite}.  Once the word is successfully
committed to memory, the value {\tt inAddrWrite+inStride} is provided
at {\tt out} (that is, the address of the next word to be written).

\subsection*{To Do}

Stride and count are not implemented.

We need a way to do an ``unordered fetch'' -- a way to tell the memory
unit to retrieve some block of words in any order it likes.  This can
considerably accelerate fetches when the first word of the region is
not cached, but other parts are cached.  This can also be used for
dispatching codebags efficiently -- but how will we make sure that
instructions destined for a given pump are dispatched in the correct
order (source sequence guarantee)?

A more advanced form would be ``unordered fetch of ordered records''
-- the ability to specify a record size (in words), the offset of the
first record, and the number of records to be fetched.  The memory
unit would then fetch the records in any order it likes, but would be
sure to return the words comprising a record in the order in which
they appear in memory.  This feature could be used to solve the source
sequence guarantee problem mentioned in the previous paragraph.

== Fleeterpreter ====================================================
    // Backing store for the simulated memory; starts empty and is grown
    // on demand by writeMem().
    private long[] mem = new long[0];

    /**
     * Returns the word stored at {@code addr}.
     *
     * An address that has never been written reads as zero.  This holds
     * both for unwritten slots inside the backing array (Java zero-fills
     * new arrays) and for addresses beyond its current end, so the result
     * does not depend on how far earlier writes happened to grow the array.
     * A negative address still throws ArrayIndexOutOfBoundsException.
     */
    public long readMem(int addr) {
        if (addr >= mem.length) return 0;
        return mem[addr];
    }

    /**
     * Stores {@code val} at {@code addr}, growing the backing array to
     * {@code addr * 2 + 1} words when {@code addr} is past its end so that
     * a run of increasing addresses does not reallocate on every write.
     */
    public void writeMem(int addr, long val) {
        if (addr >= mem.length) {
            long[] newmem = new long[addr * 2 + 1];
            System.arraycopy(mem, 0, newmem, 0, mem.length);
            mem = newmem;
        }
        mem[addr] = val;
    }

    /**
     * Dispatches the {@code size} consecutive words starting at
     * {@code addr}: each word is read from memory, decoded into an
     * Instruction by the interpreter, and handed to the interpreter's
     * dispatch together with the address it was read from.
     */
    public void dispatch(int addr, int size) {
        int cursor = addr;
        while (cursor < addr + size) {
            Interpreter decoder = (Interpreter)getFleet();
            long rawWord = readMem(cursor);
            Instruction decoded = decoder.readInstruction(rawWord);
            Interpreter target = (Interpreter)getFleet();
            target.dispatch(decoded, cursor);
            cursor++;
        }
    }

    /**
     * Loads a program image into memory and launches it.
     *
     * The image is consumed six bytes at a time; each group is packed
     * most-significant byte first into one word and stored at consecutive
     * addresses starting from 0.  Word 0 is then decoded as a code bag
     * descriptor (size in bits 0-5, base address in the 18 bits above
     * that) and the code bag it names is dispatched.
     *
     * NOTE(review): a length that is not a multiple of six makes the inner
     * loop index past the end of the array -- presumably callers always
     * pass whole words; confirm.
     */
    public void boot(byte[] instructions) {
        final int BYTES_PER_WORD = 6;
        final int SIZE_BITS      = 6;
        final int SIZE_MASK      = 0x3f;     // low 6 bits
        final int BASE_MASK      = 0x3ffff;  // low 18 bits

        // load the iscratch and take note of the 0-address INCBD
        long launch = 0;
        for(int i=0; i<instructions.length; i+=BYTES_PER_WORD) {
            long word = 0;
            for(int j=0; j<BYTES_PER_WORD; j++)
                word = (word << 8) | (instructions[i+j] & 0xff);
            writeMem(i/BYTES_PER_WORD, word);
            if (i==0) launch = word;
        }

        // dispatch the 0-address INCBD
        int base = ((int)(launch >> SIZE_BITS)) & BASE_MASK;
        int size = ((int)launch) & SIZE_MASK;
        dispatch(base, size);
    }

    // Bookkeeping for the access currently in progress.
    private long stride = 0;        // added to addr after each word transferred
    private long count = 0;         // words still to transfer; 0 means idle
    private long addr = 0;          // address of the next word to transfer
    private boolean writing = false; // true: pending access is a write

    /**
     * Performs at most one step of work for this ship.
     *
     * A pending code bag descriptor on inCBD is always handled first.
     * After that, either the access in progress advances by one word, or,
     * when idle, a new single-word read (inAddrRead) or write (inAddrWrite)
     * is started; reads take precedence over writes.
     */
    public void service() {
        // Code bag fetch: size is the low 6 bits, address is everything above.
        if (box_inCBD.dataReadyForShip()) {
            long descriptor = box_inCBD.removeDataForShip();
            long bagAddr = descriptor >> 6;
            long bagSize = descriptor & 0x3f;
            dispatch((int)bagAddr, (int)bagSize);
        }

        // An access is in progress: move one word if the ports allow it.
        if (count > 0) {
            if (writing) {
                // A write needs its data word and room on out for the reply.
                if (!box_inDataWrite.dataReadyForShip()) return;
                if (!box_out.readyForDataFromShip()) return;
                writeMem((int)addr, box_inDataWrite.removeDataForShip());
                box_out.addDataFromShip(0);
            } else {
                if (!box_out.readyForDataFromShip()) return;
                box_out.addDataFromShip(readMem((int)addr));
            }
            count--;
            addr += stride;
            return;
        }

        // Idle: start a single-word read ...
        if (box_inAddrRead.dataReadyForShip()) {
            addr = box_inAddrRead.removeDataForShip();
            stride = 0;
            count = 1;
            writing = false;
            return;
        }

        // ... or a single-word write (its data word is consumed on a later step).
        if (box_inAddrWrite.dataReadyForShip()) {
            addr = box_inAddrWrite.peekPacketForShip().value;
            box_inAddrWrite.removeDataForShip();
            stride = 0;
            count = 1;
            writing = true;
        }
    }

== FleetSim ==============================================================

== FPGA ==============================================================
`include "macros.v"
`define BRAM_ADDR_WIDTH 14
`define BRAM_DATA_WIDTH `INSTRUCTION_WIDTH
`define BRAM_NAME some_bram

/* bram.inc */
// Dual-port RAM with one write/read port and one read-only port.
//   clk  - clock; all state changes on the rising edge
//   we   - write enable for the port addressed by a
//   a    - address of the write/read port
//   dpra - address of the read-only port
//   di   - data written to ram[a] when we is high
//   spo  - data read at the (registered) address a
//   dpo  - data read at the (registered) address dpra
// Both read addresses are captured in registers on the clock edge, so
// spo and dpo reflect the address presented on the previous edge.
module `BRAM_NAME(clk, we, a, dpra, di, spo, dpo); 
    input  clk; 
    input  we; 
    input  [(`BRAM_ADDR_WIDTH-1):0] a; 
    input  [(`BRAM_ADDR_WIDTH-1):0] dpra; 
    input  [(`BRAM_DATA_WIDTH-1):0] di; 
    output [(`BRAM_DATA_WIDTH-1):0] spo; 
    output [(`BRAM_DATA_WIDTH-1):0] dpo; 
    // Storage array: 2^BRAM_ADDR_WIDTH words of BRAM_DATA_WIDTH bits.
    reg    [(`BRAM_DATA_WIDTH-1):0] ram [((1<<(`BRAM_ADDR_WIDTH))-1):0];
    // Registered copies of the two read addresses.
    reg    [(`BRAM_ADDR_WIDTH-1):0] read_a; 
    reg    [(`BRAM_ADDR_WIDTH-1):0] read_dpra; 
    always @(posedge clk) begin 
        if (we) 
            ram[a] <= di; 
        read_a <= a; 
        read_dpra <= dpra; 
    end
    // Reads are combinational from the array, indexed by the registered addresses.
    assign spo = ram[read_a]; 
    assign dpo = ram[read_dpra]; 
endmodule 
/* bram.inc */

// Memory ship: wraps one dual-port block RAM and serves, in priority order,
// data reads, data writes, instruction output on ihorn, code bag fetches
// requested on cbd, and the initial program preload.
//
// Port groups follow the request/acknowledge/data convention set up by the
// input/output macros from macros.v.  The stride and count ports are
// declared but are not referenced by the logic below.
//
// All control registers that are read before their first assignment are
// given an explicit initial value of 0, so that simulation starts from the
// same state as the other initialised registers (preload_size, launched)
// rather than from X.
module memory (clk, 
               cbd_r,          cbd_a_,         cbd_d,
               in_addr_r,      in_addr_a_,     in_addr_d,
               write_addr_r,   write_addr_a_,  write_addr_d,
               write_data_r,   write_data_a_,  write_data_d,
               stride_r,       stride_a_,      stride_d,
               count_r,        count_a_,       count_d,
               out_r_,         out_a,          out_d_,
               preload_r,      preload_a_,     preload_d,
               ihorn_r_,       ihorn_a,        ihorn_d_,
               dhorn_r_,       dhorn_a,        dhorn_d_
              );

  input  clk;
  `input(in_addr_r,      in_addr_a,     in_addr_a_,     [(2+`DATAWIDTH-1):0],       in_addr_d)
  `input(write_addr_r,   write_addr_a,  write_addr_a_,  [(2+`DATAWIDTH-1):0],       write_addr_d)
  `input(write_data_r,   write_data_a,  write_data_a_,  [(`DATAWIDTH-1):0],         write_data_d)
  `input(stride_r,       stride_a,      stride_a_,      [(`DATAWIDTH-1):0],         stride_d)
  `input(count_r,        count_a,       count_a_,       [(`DATAWIDTH-1):0],         count_d)
  `output(out_r,         out_r_,        out_a,          [(`DATAWIDTH-1):0],         out_d_)
  `input(preload_r,      preload_a,     preload_a_,     [(`DATAWIDTH-1):0],         preload_d)
  `input(cbd_r,          cbd_a,         cbd_a_,         [(`DATAWIDTH-1):0],         cbd_d)
  `output(ihorn_r,       ihorn_r_,      ihorn_a,        [(`PACKET_WIDTH-1):0], ihorn_d_)
  `defreg(ihorn_d_,                                     [(`PACKET_WIDTH-1):0], ihorn_d)
  `output(dhorn_r,       dhorn_r_,      dhorn_a,        [(`PACKET_WIDTH-1):0],      dhorn_d_)
  `defreg(dhorn_d_,                                     [(`PACKET_WIDTH-1):0],      dhorn_d)

  // ihorn_full: an instruction packet is waiting in ihorn_d to be sent.
  reg ihorn_full;
  initial ihorn_full = 0;
  // dhorn_full is only ever cleared in this module, never set.
  reg dhorn_full;
  initial dhorn_full = 0;
  // command_valid: the word now on ramread is the instruction fetched for
  // the current code bag and may be turned into an ihorn packet.
  reg command_valid;
  initial command_valid = 0;

  // Preload state: number of words expected and index of the next one.
  reg [(`BRAM_ADDR_WIDTH-1):0]    preload_pos;
  initial preload_pos = 0;
  reg [(`BRAM_ADDR_WIDTH-1):0]    preload_size;
  initial preload_size = 0;

  // Address driven onto the read-only port of the block RAM; shared by
  // data reads and instruction fetches.
  reg [(`BRAM_ADDR_WIDTH-1):0]    current_instruction_read_from;
  // Descriptor fields captured from preload word 0, used at launch.
  reg [(`BRAM_ADDR_WIDTH-1):0]    temp_base;
  reg [(`CODEBAG_SIZE_BITS-1):0]  temp_size;
  // Code bag currently being dispatched: base address, length, and the
  // index of the next instruction to fetch.
  reg [(`BRAM_ADDR_WIDTH-1):0]    cbd_base;
  reg [(`CODEBAG_SIZE_BITS-1):0]  cbd_size;
  initial cbd_size = 0;
  reg [(`CODEBAG_SIZE_BITS-1):0]  cbd_pos;
  initial cbd_pos = 0;
  reg [(`INSTRUCTION_WIDTH-1):0]  command;
  // NOTE(review): this array is not referenced anywhere in this module;
  // the storage actually used lives inside mybram.
  reg [(`BRAM_DATA_WIDTH-1):0]    ram [((1<<(`BRAM_ADDR_WIDTH))-1):0];
  // send_done / send_read: a write / read has been accepted and its
  // reply on out is still to be handed over.
  reg                             send_done;
  initial send_done = 0;
  reg                             send_read;
  initial send_read = 0;

  reg [(`INSTRUCTION_WIDTH-(2+`DESTINATION_ADDRESS_BITS)):0] temp;
  reg [(`DATAWIDTH-1):0]                                     data;

  // Write port of the block RAM: enable, address and data.
  reg                             write_flag;
  reg [(`BRAM_ADDR_WIDTH-1):0]    in_addr;
  reg [(`BRAM_DATA_WIDTH-1):0]    write_data;

  wire [(`BRAM_DATA_WIDTH-1):0]   ramread;

  // Set for one cycle after an instruction fetch address is issued, to
  // wait out the registered read address inside the block RAM.
  reg command_valid_read;
  initial command_valid_read = 0;

  // Becomes 1 once the whole preload image has been written.
  reg launched;
  initial launched = 0;

  // NOTE(review): not_connected is an implicitly declared net tied to the
  // spo output, which is BRAM_DATA_WIDTH bits wide -- confirm the width
  // mismatch is intended.
  some_bram mybram(clk, write_flag, in_addr, current_instruction_read_from, write_data, not_connected, ramread);
  assign out_d_ = ramread;

  always @(posedge clk) begin

    // The write enable is a one-cycle pulse unless re-asserted below.
    write_flag <= 0;

    // Drop each acknowledge once its request has been withdrawn.
    if (!in_addr_r && in_addr_a) in_addr_a = 0;
    if (!write_data_r && write_data_a) write_data_a = 0;
    if (!write_addr_r && write_addr_a) write_addr_a = 0;

    // Exactly one of the following branches runs per clock, highest
    // priority first.
    if (command_valid_read) begin
      command_valid_read  <= 0;
      command_valid       <= 1;

    end else  if (send_done) begin
      `onwrite(out_r, out_a)
        send_done <= 0;
      end

    end else  if (send_read) begin
      `onwrite(out_r, out_a)
        send_read <= 0;
      end

    // Data read request: present the address to the read port.
    // NOTE(review): this reuses current_instruction_read_from and ranks
    // above the command_valid branch, so a read arriving between an
    // instruction fetch and its use looks like it would replace the
    // fetched instruction on ramread -- confirm.
    end else if (in_addr_r) begin
      in_addr_a                        = 1;
      send_read                       <= 1;
      current_instruction_read_from   <= in_addr_d[(`DATAWIDTH-1):0];

    // Data write request: needs both the address and the data word.
    end else if (write_addr_r && write_data_r) begin
      write_addr_a       = 1;
      write_data_a       = 1;
      send_done         <= 1;
      write_flag        <= 1;
      in_addr           <= write_addr_d[(`DATAWIDTH-1):0];
      write_data        <= write_data_d;

    // Instruction packets are held back until the preload has finished.
    end else if (ihorn_full && launched) begin
      `onwrite(ihorn_r, ihorn_a)
        ihorn_full <= 0;
      end

    end else if (dhorn_full) begin
      `onwrite(dhorn_r, dhorn_a)
        dhorn_full <= 0;
      end

    // A fetched instruction is available: build the ihorn packet from it.
    end else if (command_valid) begin
      command_valid <= 0;
      command = ramread;
      ihorn_full  <= 1;
      `packet_data(ihorn_d) <= `instruction_data(command);
      `packet_dest(ihorn_d) <= `instruction_dest(command);

    // More instructions remain in the current code bag: fetch the next.
    end else if (cbd_pos < cbd_size) begin
      current_instruction_read_from <= cbd_base+cbd_pos;
      command_valid_read            <= 1;
      cbd_pos                       <= cbd_pos + 1;

    end else begin
      // Idle: accept a new code bag descriptor, or else a preload word.
      `onread(cbd_r, cbd_a)
        cbd_pos       <= 0;
        cbd_size      <= cbd_d[(`CODEBAG_SIZE_BITS-1):0];
        cbd_base      <= cbd_d[(`INSTRUCTION_WIDTH-1):(`CODEBAG_SIZE_BITS)];

      end else begin
        `onread(preload_r, preload_a)
          // The first preload word is the image length; each later word is
          // written to the next RAM address until launch.
          if (preload_size == 0) begin
            preload_size     <= preload_d;
          end else if (!launched) begin
            write_flag <= 1;
            write_data <= preload_d;
            in_addr <= preload_pos;
            // Word 0 of the image supplies the first code bag descriptor.
            if (preload_pos == 0) begin
              temp_base = preload_d[(`INSTRUCTION_WIDTH-(3+`DESTINATION_ADDRESS_BITS)):(`CODEBAG_SIZE_BITS)];
              temp_size = preload_d[(`CODEBAG_SIZE_BITS-1):0];
            end
            // Last word of the image: start dispatching that code bag.
            if ((preload_pos+1) == preload_size) begin
              cbd_pos  <= 0;
              cbd_base <= temp_base;
              cbd_size <= temp_size;
              launched <= 1;
            end
            preload_pos      <= preload_pos + 1;
          end
        end
      end
    end
  end
endmodule

  
== Test ==============================================================
// expected output
#expect 12
#expect 13
#expect 14

// ships required in order to run this code
#ship debug          : Debug
#ship memory         : Memory

// instructions not in any codebag are part of the "root codebag"
// which is dispatched when the code is loaded

// root codebag: hand the descriptor of codebag BOB to the memory ship's
// inCBD port, which fetches and dispatches the instructions in BOB
memory.inCBD:
  literal BOB;
  deliver;

// BOB delivers the three literals listed under "expected output" above
// to the debug ship, in order
BOB: {
  debug.in:
    literal 12; deliver;
    literal 13; deliver;
    literal 14; deliver;
}


== Constants ========================================================

== Contributors =========================================================
Adam Megacz <megacz@cs.berkeley.edu>