This lesson starts at commit c2c6ad9e07ff149a1de3863f1d10db636e966997.

8. Memory

We'll start with a simple implementation of the memory subsystem, which we need for the load and store instructions. There is quite a lot which we'll need to do for this module, so we'll start on familiar ground and take small steps.

We'll start by implementing the store instructions, and specifically, the SW (store word) instruction. The familiar ground we're starting from is the decoder; we'll just do what we have done dozens of times before: add some decoding logic.

The RISC-V docs say this about the store instructions:

Load and store instructions transfer a value between the registers and memory. [...] The effective address is obtained by adding register rs1 to the sign-extended 12-bit offset. [...] Stores copy the value in register rs2 to memory.

We'll use the first operand to store the address and the second operand to store the value. For now, I'll assume that stores are aligned to a multiple of 4 bytes. The RISC-V specification allows raising exceptions for misaligned memory access (but for now, we will stick to implementing aligned stores, and leave exceptions for later).

src/core/decode_write.vhd CHANGED
@@ -38,6 +38,7 @@ begin
38
  variable j_imm: std_logic_vector(20 downto 0);
39
  variable j_imm_s: std_logic_vector(31 downto 0);
40
  variable s_imm: std_logic_vector(11 downto 0);
 
41
  variable u_imm: std_logic_vector(31 downto 0);
42
 
43
  variable v_decode_output: decode_output_t;
@@ -67,6 +68,7 @@ begin
67
  b_imm_s := std_logic_vector(resize(signed(b_imm), 32));
68
  i_imm_s := std_logic_vector(resize(signed(i_imm), 32));
69
  j_imm_s := std_logic_vector(resize(signed(j_imm), 32));
 
70
 
71
  v_decode_output := DEFAULT_DECODE_OUTPUT;
72
 
@@ -141,12 +143,17 @@ begin
141
  v_decode_output.is_invalid := '1';
142
  end if;
143
  elsif opcode = "0100011" then
 
 
 
 
144
  if funct3 = "000" then
145
  -- TODO: SB
146
  elsif funct3 = "001" then
147
  -- TODO: SH
148
  elsif funct3 = "010" then
149
- -- TODO: SW
 
150
  else
151
  v_decode_output.is_invalid := '1';
152
  end if;
 
38
  variable j_imm: std_logic_vector(20 downto 0);
39
  variable j_imm_s: std_logic_vector(31 downto 0);
40
  variable s_imm: std_logic_vector(11 downto 0);
41
+ variable s_imm_s: std_logic_vector(31 downto 0);
42
  variable u_imm: std_logic_vector(31 downto 0);
43
 
44
  variable v_decode_output: decode_output_t;
 
68
  b_imm_s := std_logic_vector(resize(signed(b_imm), 32));
69
  i_imm_s := std_logic_vector(resize(signed(i_imm), 32));
70
  j_imm_s := std_logic_vector(resize(signed(j_imm), 32));
71
+ s_imm_s := std_logic_vector(resize(signed(s_imm), 32));
72
 
73
  v_decode_output := DEFAULT_DECODE_OUTPUT;
74
 
 
143
  v_decode_output.is_invalid := '1';
144
  end if;
145
  elsif opcode = "0100011" then
146
+ -- store instructions
147
+ v_decode_output.operand1 := std_logic_vector(unsigned(reg(to_integer(unsigned(rs1)))) + unsigned(s_imm_s));
148
+ v_decode_output.operand2 := reg(to_integer(unsigned(rs2)));
149
+
150
  if funct3 = "000" then
151
  -- TODO: SB
152
  elsif funct3 = "001" then
153
  -- TODO: SH
154
  elsif funct3 = "010" then
155
+ -- SW
156
+ v_decode_output.operation := OP_SW;
157
  else
158
  v_decode_output.is_invalid := '1';
159
  end if;
src/core/types.vhd CHANGED
@@ -3,7 +3,27 @@ use ieee.std_logic_1164.all;
3
 
4
 
5
  package core_types is
6
- type operation_t is (OP_ADD, OP_SLT, OP_SLTU, OP_XOR, OP_OR, OP_AND, OP_SLL, OP_SRL, OP_SRA, OP_SUB, OP_JAL, OP_BEQ, OP_BNE, OP_BLT, OP_BGE, OP_BLTU, OP_BGEU, OP_LED);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  type fetch_output_t is record
9
  is_active: std_logic;
 
3
 
4
 
5
  package core_types is
6
+ type operation_t is (
7
+ OP_ADD,
8
+ OP_SLT,
9
+ OP_SLTU,
10
+ OP_XOR,
11
+ OP_OR,
12
+ OP_AND,
13
+ OP_SLL,
14
+ OP_SRL,
15
+ OP_SRA,
16
+ OP_SUB,
17
+ OP_JAL,
18
+ OP_BEQ,
19
+ OP_BNE,
20
+ OP_BLT,
21
+ OP_BGE,
22
+ OP_BLTU,
23
+ OP_BGEU,
24
+ OP_SW,
25
+ OP_LED
26
+ );
27
 
28
  type fetch_output_t is record
29
  is_active: std_logic;

Now we want to start implementing the OP_SW operation in the execute stage.

src/core/execute.vhd CHANGED
@@ -133,6 +133,8 @@ begin
133
  v_jump := '1';
134
  v_jump_address := input.operand3;
135
  end if;
 
 
136
  elsif input.operation = OP_LED then
137
  led <= input.operand1(7 downto 0);
138
  else
 
133
  v_jump := '1';
134
  v_jump_address := input.operand3;
135
  end if;
136
+ elsif input.operation = OP_SW then
137
+ -- TODO: implement
138
  elsif input.operation = OP_LED then
139
  led <= input.operand1(7 downto 0);
140
  else

Hm, we're a bit stuck here. We want to talk to some kind of memory interface or wrapper, which I'll pompously call the "memory subsystem". The execute stage will need to output at least:

  • An indicator value to indicate we want to write
  • The address to write to
  • The value to write

The memory subsystem will be placed outside the core, since there are other components that want to "talk" to the memory. So, I'll make a record for these signals, but place it outside of the core folder.

src/constants.vhd ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ library ieee;
2
+ use ieee.std_logic_1164.all;
3
+
4
+ use work.types.all;
5
+
6
+
7
+ package constants is
8
+ constant DEFAULT_MEM_REQ: mem_req_t := (
9
+ active => '0',
10
+ address => (others => '0'),
11
+ value => (others => '0')
12
+ );
13
+ end package constants;
src/types.vhd ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ library ieee;
2
+ use ieee.std_logic_1164.all;
3
+
4
+
5
+ package types is
6
+ type mem_req_t is record
7
+ active: std_logic;
8
+ address: std_logic_vector(31 downto 0);
9
+ value: std_logic_vector(31 downto 0);
10
+ end record mem_req_t;
11
+ end package types;

Now, we want to make a new module for the memory subsystem.

src/mem_subsys.vhd ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ library ieee;
2
+ use ieee.std_logic_1164.all;
3
+ use ieee.numeric_std.all;
4
+
5
+ use work.types.all;
6
+ use work.constants.all;
7
+
8
+
9
+ entity mem_subsys is
10
+ port (
11
+ clk: in std_logic;
12
+ req: in mem_req_t
13
+ );
14
+ end mem_subsys;
15
+
16
+
17
+ architecture rtl of mem_subsys is
18
+ begin
19
+ end rtl;

Now, we want to instantiate the mem_subsys module in the top_level, and route the signals from the execute stage to the memory subsystem, crossing the interface of the core module. So, here we go.

src/core.vhd CHANGED
@@ -2,6 +2,8 @@ library ieee;
2
  use ieee.std_logic_1164.all;
3
  use ieee.numeric_std.all;
4
 
 
 
5
  use work.core_types.all;
6
  use work.core_constants.all;
7
 
@@ -9,6 +11,7 @@ use work.core_constants.all;
9
  entity core is
10
  port (
11
  clk: in std_logic;
 
12
  led: out std_logic_vector(7 downto 0)
13
  );
14
  end core;
@@ -48,6 +51,7 @@ architecture rtl of core is
48
  clk: in std_logic;
49
  input: in decode_output_t;
50
  output: out execute_output_t;
 
51
  jump: out std_logic := '0';
52
  jump_address: out std_logic_vector(31 downto 0);
53
  led: out std_logic_vector(7 downto 0)
@@ -67,7 +71,7 @@ begin
67
 
68
  decode_write_inst: decode_write port map(clk => clk, decode_input => fetch_output, decode_output => decode_output, write_input => memory_output, pipeline_ready => pipeline_ready);
69
 
70
- execute_inst: execute port map(clk => clk, input => decode_output, output => execute_output, jump => jump, jump_address => jump_address, led => led);
71
 
72
  memory_inst: memory port map(clk => clk, input => execute_output, output => memory_output);
73
 
 
2
  use ieee.std_logic_1164.all;
3
  use ieee.numeric_std.all;
4
 
5
+ use work.types.all;
6
+
7
  use work.core_types.all;
8
  use work.core_constants.all;
9
 
 
11
  entity core is
12
  port (
13
  clk: in std_logic;
14
+ mem_req: out mem_req_t;
15
  led: out std_logic_vector(7 downto 0)
16
  );
17
  end core;
 
51
  clk: in std_logic;
52
  input: in decode_output_t;
53
  output: out execute_output_t;
54
+ mem_req: out mem_req_t;
55
  jump: out std_logic := '0';
56
  jump_address: out std_logic_vector(31 downto 0);
57
  led: out std_logic_vector(7 downto 0)
 
71
 
72
  decode_write_inst: decode_write port map(clk => clk, decode_input => fetch_output, decode_output => decode_output, write_input => memory_output, pipeline_ready => pipeline_ready);
73
 
74
+ execute_inst: execute port map(clk => clk, input => decode_output, output => execute_output, mem_req => mem_req, jump => jump, jump_address => jump_address, led => led);
75
 
76
  memory_inst: memory port map(clk => clk, input => execute_output, output => memory_output);
77
 
src/core/execute.vhd CHANGED
@@ -2,6 +2,9 @@ library ieee;
2
  use ieee.std_logic_1164.all;
3
  use ieee.numeric_std.all;
4
 
 
 
 
5
  use work.core_types.all;
6
  use work.core_constants.all;
7
 
@@ -11,6 +14,7 @@ entity execute is
11
  clk: in std_logic;
12
  input: in decode_output_t;
13
  output: out execute_output_t := DEFAULT_EXECUTE_OUTPUT;
 
14
  jump: out std_logic := '0';
15
  jump_address: out std_logic_vector(31 downto 0) := (others => '0');
16
  led: out std_logic_vector(7 downto 0) := (others => '0')
 
2
  use ieee.std_logic_1164.all;
3
  use ieee.numeric_std.all;
4
 
5
+ use work.types.all;
6
+ use work.constants.all;
7
+
8
  use work.core_types.all;
9
  use work.core_constants.all;
10
 
 
14
  clk: in std_logic;
15
  input: in decode_output_t;
16
  output: out execute_output_t := DEFAULT_EXECUTE_OUTPUT;
17
+ mem_req: out mem_req_t := DEFAULT_MEM_REQ;
18
  jump: out std_logic := '0';
19
  jump_address: out std_logic_vector(31 downto 0) := (others => '0');
20
  led: out std_logic_vector(7 downto 0) := (others => '0')
src/top_level.vhd CHANGED
@@ -2,6 +2,8 @@ library ieee;
2
  use ieee.std_logic_1164.all;
3
  use ieee.numeric_std.all;
4
 
 
 
5
 
6
  entity top_level is
7
  port (
@@ -12,17 +14,27 @@ end top_level;
12
 
13
 
14
  architecture rtl of top_level is
15
- signal count: unsigned(31 downto 0) := (others => '0');
16
 
17
  component core is
18
  port (
19
  clk: in std_logic;
 
20
  led: out std_logic_vector(7 downto 0)
21
  );
22
  end component;
23
 
 
 
 
 
 
 
 
24
  begin
25
 
26
- core_inst: core port map(clk => clk, led => led);
 
 
27
 
28
  end rtl;
 
2
  use ieee.std_logic_1164.all;
3
  use ieee.numeric_std.all;
4
 
5
+ use work.types.all;
6
+
7
 
8
  entity top_level is
9
  port (
 
14
 
15
 
16
  architecture rtl of top_level is
17
+ signal mem_req: mem_req_t;
18
 
19
  component core is
20
  port (
21
  clk: in std_logic;
22
+ mem_req: out mem_req_t;
23
  led: out std_logic_vector(7 downto 0)
24
  );
25
  end component;
26
 
27
+ component mem_subsys is
28
+ port (
29
+ clk: in std_logic;
30
+ req: in mem_req_t;
31
+ );
32
+ end component;
33
+
34
  begin
35
 
36
+ core_inst: core port map(clk => clk, mem_req => mem_req, led => led);
37
+
38
+ mem_subsys_inst: mem_subsys port map(clk => clk, req => mem_req);
39
 
40
  end rtl;

Now implementing OP_SW in the execute stage is simple.

src/core/execute.vhd CHANGED
@@ -30,11 +30,13 @@ begin
30
  variable v_sign: std_logic_vector(31 downto 0);
31
  variable v_jump: std_logic;
32
  variable v_jump_address: std_logic_vector(31 downto 0);
 
33
 
34
  begin
35
  if rising_edge(clk) then
36
  v_output := DEFAULT_EXECUTE_OUTPUT;
37
  v_output.is_active := input.is_active;
 
38
  v_jump := '0';
39
  v_jump_address := (others => '0');
40
 
@@ -138,7 +140,9 @@ begin
138
  v_jump_address := input.operand3;
139
  end if;
140
  elsif input.operation = OP_SW then
141
- -- TODO: implement
 
 
142
  elsif input.operation = OP_LED then
143
  led <= input.operand1(7 downto 0);
144
  else
@@ -148,10 +152,12 @@ begin
148
  v_output.destination_reg := input.destination_reg;
149
  end if;
150
 
 
 
 
 
151
  jump <= v_jump;
152
  jump_address <= v_jump_address(31 downto 1) & "0";
153
-
154
- output <= v_output;
155
  end if;
156
  end process;
157
 
 
30
  variable v_sign: std_logic_vector(31 downto 0);
31
  variable v_jump: std_logic;
32
  variable v_jump_address: std_logic_vector(31 downto 0);
33
+ variable v_mem_req: mem_req_t;
34
 
35
  begin
36
  if rising_edge(clk) then
37
  v_output := DEFAULT_EXECUTE_OUTPUT;
38
  v_output.is_active := input.is_active;
39
+ v_mem_req := DEFAULT_MEM_REQ;
40
  v_jump := '0';
41
  v_jump_address := (others => '0');
42
 
 
140
  v_jump_address := input.operand3;
141
  end if;
142
  elsif input.operation = OP_SW then
143
+ v_mem_req.active := '1';
144
+ v_mem_req.address := input.operand1;
145
+ v_mem_req.value := input.operand2;
146
  elsif input.operation = OP_LED then
147
  led <= input.operand1(7 downto 0);
148
  else
 
152
  v_output.destination_reg := input.destination_reg;
153
  end if;
154
 
155
+ output <= v_output;
156
+
157
+ mem_req <= v_mem_req;
158
+
159
  jump <= v_jump;
160
  jump_address <= v_jump_address(31 downto 1) & "0";
 
 
161
  end if;
162
  end process;
163
 

Now we need to implement the memory subsystem itself. In the spirit of "doing the simplest thing that could work", we can just make a vector of std_logic_vectors like we did for the registers. Let's make it 4KB big, which means it's 1024 words, since a word consists of 4 bytes.

src/mem_subsys.vhd CHANGED
@@ -15,5 +15,17 @@ end mem_subsys;
15
 
16
 
17
  architecture rtl of mem_subsys is
 
 
 
18
  begin
 
 
 
 
 
 
 
 
 
19
  end rtl;
 
15
 
16
 
17
  architecture rtl of mem_subsys is
18
+ type ram_t is array (0 to 1023) of std_logic_vector(31 downto 0);
19
+ signal ram: ram_t := (others => (others => '0'));
20
+
21
  begin
22
+
23
+ process (clk)
24
+ begin
25
+ if rising_edge(clk) then
26
+ if req.active = '1' then
27
+ ram(to_integer(unsigned(req.address(11 downto 2)))) <= req.value;
28
+ end if;
29
+ end if;
30
+ end process;
31
  end rtl;

Now, let's write a simple program that increments a counter, and uses the counter as both the address and the value to write. Since the address is in bytes but we're writing words, we'll shift the address to the left by two bits, which makes sure the address is a multiple of 4 so that our stores are aligned.

loop:
sll x2, x1, 2
sw x1, 0(x2)
addi x1, x1, 1
j loop

This assembles to

00209113
00112023
00108093
ff5ff06f
src/core/fetch.vhd CHANGED
@@ -20,7 +20,7 @@ end fetch;
20
  architecture rtl of fetch is
21
  type instruction_memory_t is array(0 to 15) of std_logic_vector(31 downto 0);
22
  signal imem: instruction_memory_t := (
23
- X"00a00193", X"00100113", X"00208133", X"002080b3", X"ffe18193", X"fe304ae3", X"00018463", X"00010093",
24
  X"0000006f", X"00000000", X"00000000", X"00000000", X"00000000", X"00000000", X"00000000", X"00000000"
25
  );
26
 
 
20
  architecture rtl of fetch is
21
  type instruction_memory_t is array(0 to 15) of std_logic_vector(31 downto 0);
22
  signal imem: instruction_memory_t := (
23
+ X"00112023", X"00108093", X"00209113", X"ff5ff06f", X"00000000", X"00000000", X"00000000", X"00000000",
24
  X"0000006f", X"00000000", X"00000000", X"00000000", X"00000000", X"00000000", X"00000000", X"00000000"
25
  );
26
 
src/mem_subsys.vhd CHANGED
@@ -26,6 +26,6 @@ begin
26
  if req.active = '1' then
27
  ram(to_integer(unsigned(req.address(11 downto 2)))) <= req.value;
28
  end if;
29
- end if;
30
  end process;
31
  end rtl;
 
26
  if req.active = '1' then
27
  ram(to_integer(unsigned(req.address(11 downto 2)))) <= req.value;
28
  end if;
29
+ end if;
30
  end process;
31
  end rtl;

And... This looks good! Our memory gets filled, word by word. Simulation waveforms

Now, I want to proceed by implementing the LW (load word) instruction. This is somewhat similar to storing a word, in that the execute stage will signal an address to the memory subsystem, and the memory subsystem will act on it.

However, the memory subsystem needs to know if it has to perform a read or a write command. So let's add a type and field for it.

src/constants.vhd CHANGED
@@ -7,6 +7,7 @@ use work.types.all;
7
  package constants is
8
  constant DEFAULT_MEM_REQ: mem_req_t := (
9
  active => '0',
 
10
  address => (others => '0'),
11
  value => (others => '0')
12
  );
 
7
  package constants is
8
  constant DEFAULT_MEM_REQ: mem_req_t := (
9
  active => '0',
10
+ cmd => MEM_CMD_READ,
11
  address => (others => '0'),
12
  value => (others => '0')
13
  );
src/types.vhd CHANGED
@@ -3,8 +3,11 @@ use ieee.std_logic_1164.all;
3
 
4
 
5
  package types is
 
 
6
  type mem_req_t is record
7
  active: std_logic;
 
8
  address: std_logic_vector(31 downto 0);
9
  value: std_logic_vector(31 downto 0);
10
  end record mem_req_t;
 
3
 
4
 
5
  package types is
6
+ type mem_cmd_t is (MEM_CMD_READ, MEM_CMD_WRITE);
7
+
8
  type mem_req_t is record
9
  active: std_logic;
10
+ cmd: mem_cmd_t;
11
  address: std_logic_vector(31 downto 0);
12
  value: std_logic_vector(31 downto 0);
13
  end record mem_req_t;

Now, we still need to set the proper command in the execute stage.

src/core/execute.vhd CHANGED
@@ -141,6 +141,7 @@ begin
141
  end if;
142
  elsif input.operation = OP_SW then
143
  v_mem_req.active := '1';
 
144
  v_mem_req.address := input.operand1;
145
  v_mem_req.value := input.operand2;
146
  elsif input.operation = OP_LED then
 
141
  end if;
142
  elsif input.operation = OP_SW then
143
  v_mem_req.active := '1';
144
+ v_mem_req.cmd := MEM_CMD_WRITE;
145
  v_mem_req.address := input.operand1;
146
  v_mem_req.value := input.operand2;
147
  elsif input.operation = OP_LED then

We are now ready to start implementing LW. First, we add an operation for it.

src/core/types.vhd CHANGED
@@ -22,6 +22,7 @@ package core_types is
22
  OP_BLTU,
23
  OP_BGEU,
24
  OP_SW,
 
25
  OP_LED
26
  );
27
 
 
22
  OP_BLTU,
23
  OP_BGEU,
24
  OP_SW,
25
+ OP_LW,
26
  OP_LED
27
  );
28
 

We are now ready to decode LW instructions. The address computation is the same as for the SW instruction, but this time we need to set the destination register.

src/core/decode_write.vhd CHANGED
@@ -129,12 +129,17 @@ begin
129
  v_decode_output.is_invalid := '1';
130
  end if;
131
  elsif opcode = "0000011" then
 
 
 
 
132
  if funct3 = "000" then
133
  -- TODO: LB
134
  elsif funct3 = "001" then
135
  -- TODO: LH
136
  elsif funct3 = "010" then
137
- -- TODO: LW
 
138
  elsif funct3 = "100" then
139
  -- TODO: LBU
140
  elsif funct3 = "101" then
 
129
  v_decode_output.is_invalid := '1';
130
  end if;
131
  elsif opcode = "0000011" then
132
+ -- load instructions
133
+ v_decode_output.operand1 := std_logic_vector(unsigned(reg(to_integer(unsigned(rs1)))) + unsigned(i_imm_s));
134
+ v_decode_output.destination_reg := rd;
135
+
136
  if funct3 = "000" then
137
  -- TODO: LB
138
  elsif funct3 = "001" then
139
  -- TODO: LH
140
  elsif funct3 = "010" then
141
+ -- LW
142
+ v_decode_output.operation := OP_LW;
143
  elsif funct3 = "100" then
144
  -- TODO: LBU
145
  elsif funct3 = "101" then

Now, from the execute stage, we can tell the memory subsystem to perform a read.

src/core/execute.vhd CHANGED
@@ -144,6 +144,10 @@ begin
144
  v_mem_req.cmd := MEM_CMD_WRITE;
145
  v_mem_req.address := input.operand1;
146
  v_mem_req.value := input.operand2;
 
 
 
 
147
  elsif input.operation = OP_LED then
148
  led <= input.operand1(7 downto 0);
149
  else
 
144
  v_mem_req.cmd := MEM_CMD_WRITE;
145
  v_mem_req.address := input.operand1;
146
  v_mem_req.value := input.operand2;
147
+ elsif input.operation = OP_LW then
148
+ v_mem_req.active := '1';
149
+ v_mem_req.cmd := MEM_CMD_READ;
150
+ v_mem_req.address := input.operand1;
151
  elsif input.operation = OP_LED then
152
  led <= input.operand1(7 downto 0);
153
  else

We still need to implement reading in the memory subsystem. I'll add an output named res (for "response").

src/mem_subsys.vhd CHANGED
@@ -9,7 +9,8 @@ use work.constants.all;
9
  entity mem_subsys is
10
  port (
11
  clk: in std_logic;
12
- req: in mem_req_t
 
13
  );
14
  end mem_subsys;
15
 
@@ -24,7 +25,13 @@ begin
24
  begin
25
  if rising_edge(clk) then
26
  if req.active = '1' then
27
- ram(to_integer(unsigned(req.address(11 downto 2)))) <= req.value;
 
 
 
 
 
 
28
  end if;
29
  end if;
30
  end process;
 
9
  entity mem_subsys is
10
  port (
11
  clk: in std_logic;
12
+ req: in mem_req_t;
13
+ res: out std_logic_vector(31 downto 0)
14
  );
15
  end mem_subsys;
16
 
 
25
  begin
26
  if rising_edge(clk) then
27
  if req.active = '1' then
28
+ if req.cmd = MEM_CMD_WRITE then
29
+ ram(to_integer(unsigned(req.address(11 downto 2)))) <= req.value;
30
+ else
31
+ res <= ram(to_integer(unsigned(req.address(11 downto 2))));
32
+ end if;
33
+ else
34
+ res <= (others => '0');
35
  end if;
36
  end if;
37
  end process;
src/top_level.vhd CHANGED
@@ -15,6 +15,7 @@ end top_level;
15
 
16
  architecture rtl of top_level is
17
  signal mem_req: mem_req_t;
 
18
 
19
  component core is
20
  port (
@@ -28,6 +29,7 @@ architecture rtl of top_level is
28
  port (
29
  clk: in std_logic;
30
  req: in mem_req_t;
 
31
  );
32
  end component;
33
 
@@ -35,6 +37,6 @@ begin
35
 
36
  core_inst: core port map(clk => clk, mem_req => mem_req, led => led);
37
 
38
- mem_subsys_inst: mem_subsys port map(clk => clk, req => mem_req);
39
 
40
  end rtl;
 
15
 
16
  architecture rtl of top_level is
17
  signal mem_req: mem_req_t;
18
+ signal mem_res: std_logic_vector(31 downto 0);
19
 
20
  component core is
21
  port (
 
29
  port (
30
  clk: in std_logic;
31
  req: in mem_req_t;
32
+ res: out std_logic_vector(31 downto 0)
33
  );
34
  end component;
35
 
 
37
 
38
  core_inst: core port map(clk => clk, mem_req => mem_req, led => led);
39
 
40
+ mem_subsys_inst: mem_subsys port map(clk => clk, req => mem_req, res => mem_res);
41
 
42
  end rtl;

This output needs to be routed back to the core.

src/mem_subsys.vhd CHANGED
@@ -9,7 +9,8 @@ use work.constants.all;
9
  entity mem_subsys is
10
  port (
11
  clk: in std_logic;
12
- req: in mem_req_t
 
13
  );
14
  end mem_subsys;
15
 
@@ -24,7 +25,13 @@ begin
24
  begin
25
  if rising_edge(clk) then
26
  if req.active = '1' then
27
- ram(to_integer(unsigned(req.address(11 downto 2)))) <= req.value;
 
 
 
 
 
 
28
  end if;
29
  end if;
30
  end process;
 
9
  entity mem_subsys is
10
  port (
11
  clk: in std_logic;
12
+ req: in mem_req_t;
13
+ res: out std_logic_vector(31 downto 0)
14
  );
15
  end mem_subsys;
16
 
 
25
  begin
26
  if rising_edge(clk) then
27
  if req.active = '1' then
28
+ if req.cmd = MEM_CMD_WRITE then
29
+ ram(to_integer(unsigned(req.address(11 downto 2)))) <= req.value;
30
+ else
31
+ res <= ram(to_integer(unsigned(req.address(11 downto 2))));
32
+ end if;
33
+ else
34
+ res <= (others => '0');
35
  end if;
36
  end if;
37
  end process;
src/top_level.vhd CHANGED
@@ -15,6 +15,7 @@ end top_level;
15
 
16
  architecture rtl of top_level is
17
  signal mem_req: mem_req_t;
 
18
 
19
  component core is
20
  port (
@@ -28,6 +29,7 @@ architecture rtl of top_level is
28
  port (
29
  clk: in std_logic;
30
  req: in mem_req_t;
 
31
  );
32
  end component;
33
 
@@ -35,6 +37,6 @@ begin
35
 
36
  core_inst: core port map(clk => clk, mem_req => mem_req, led => led);
37
 
38
- mem_subsys_inst: mem_subsys port map(clk => clk, req => mem_req);
39
 
40
  end rtl;
 
15
 
16
  architecture rtl of top_level is
17
  signal mem_req: mem_req_t;
18
+ signal mem_res: std_logic_vector(31 downto 0);
19
 
20
  component core is
21
  port (
 
29
  port (
30
  clk: in std_logic;
31
  req: in mem_req_t;
32
+ res: out std_logic_vector(31 downto 0)
33
  );
34
  end component;
35
 
 
37
 
38
  core_inst: core port map(clk => clk, mem_req => mem_req, led => led);
39
 
40
+ mem_subsys_inst: mem_subsys port map(clk => clk, req => mem_req, res => mem_res);
41
 
42
  end rtl;

Now, we want to route it back to some stage. When the execute stage writes its output, the memory stage is running (for one cycle). At the same time, the memory subsystem is also doing the read. So, the output from the read will not arrive in time for the memory stage; we can only use it in the writeback stage. So, we are not doing anything in the memory stage, except just adding a single-cycle delay to make sure the value that is read from the memory arrives in time for the writeback stage.

src/core.vhd CHANGED
@@ -12,6 +12,7 @@ entity core is
12
  port (
13
  clk: in std_logic;
14
  mem_req: out mem_req_t;
 
15
  led: out std_logic_vector(7 downto 0)
16
  );
17
  end core;
@@ -42,6 +43,7 @@ architecture rtl of core is
42
  decode_input: in fetch_output_t;
43
  decode_output: out decode_output_t;
44
  write_input: in memory_output_t;
 
45
  pipeline_ready: out std_logic
46
  );
47
  end component;
@@ -69,7 +71,7 @@ architecture rtl of core is
69
  begin
70
  fetch_inst: fetch port map(clk => clk, pipeline_ready => pipeline_ready, jump => jump, jump_address => jump_address, output => fetch_output);
71
 
72
- decode_write_inst: decode_write port map(clk => clk, decode_input => fetch_output, decode_output => decode_output, write_input => memory_output, pipeline_ready => pipeline_ready);
73
 
74
  execute_inst: execute port map(clk => clk, input => decode_output, output => execute_output, mem_req => mem_req, jump => jump, jump_address => jump_address, led => led);
75
 
 
12
  port (
13
  clk: in std_logic;
14
  mem_req: out mem_req_t;
15
+ mem_res: in std_logic_vector(31 downto 0);
16
  led: out std_logic_vector(7 downto 0)
17
  );
18
  end core;
 
43
  decode_input: in fetch_output_t;
44
  decode_output: out decode_output_t;
45
  write_input: in memory_output_t;
46
+ mem_res: in std_logic_vector(31 downto 0);
47
  pipeline_ready: out std_logic
48
  );
49
  end component;
 
71
  begin
72
  fetch_inst: fetch port map(clk => clk, pipeline_ready => pipeline_ready, jump => jump, jump_address => jump_address, output => fetch_output);
73
 
74
+ decode_write_inst: decode_write port map(clk => clk, decode_input => fetch_output, decode_output => decode_output, write_input => memory_output, mem_res => mem_res, pipeline_ready => pipeline_ready);
75
 
76
  execute_inst: execute port map(clk => clk, input => decode_output, output => execute_output, mem_req => mem_req, jump => jump, jump_address => jump_address, led => led);
77
 
src/core/decode_write.vhd CHANGED
@@ -14,6 +14,7 @@ entity decode_write is
14
  decode_output: out decode_output_t := DEFAULT_DECODE_OUTPUT;
15
 
16
  write_input: in memory_output_t;
 
17
  pipeline_ready: out std_logic := '1'
18
  );
19
  end decode_write;
 
14
  decode_output: out decode_output_t := DEFAULT_DECODE_OUTPUT;
15
 
16
  write_input: in memory_output_t;
17
+ mem_res: in std_logic_vector(31 downto 0);
18
  pipeline_ready: out std_logic := '1'
19
  );
20
  end decode_write;
src/top_level.vhd CHANGED
@@ -21,6 +21,7 @@ architecture rtl of top_level is
21
  port (
22
  clk: in std_logic;
23
  mem_req: out mem_req_t;
 
24
  led: out std_logic_vector(7 downto 0)
25
  );
26
  end component;
@@ -35,7 +36,7 @@ architecture rtl of top_level is
35
 
36
  begin
37
 
38
- core_inst: core port map(clk => clk, mem_req => mem_req, led => led);
39
 
40
  mem_subsys_inst: mem_subsys port map(clk => clk, req => mem_req, res => mem_res);
41
 
 
21
  port (
22
  clk: in std_logic;
23
  mem_req: out mem_req_t;
24
+ mem_res: in std_logic_vector(31 downto 0);
25
  led: out std_logic_vector(7 downto 0)
26
  );
27
  end component;
 
36
 
37
  begin
38
 
39
+ core_inst: core port map(clk => clk, mem_req => mem_req, mem_res => mem_res, led => led);
40
 
41
  mem_subsys_inst: mem_subsys port map(clk => clk, req => mem_req, res => mem_res);
42
 

Now, as a last step, the execute stage needs to tell the writeback stage that it has to store the response from the memory in the destination register, instead of the result output from the execute stage. For this, I add a use_mem flag to the output of the execute stage. It needs to be routed through the memory stage, so I'll add it to the output of the memory stage as well.

src/core/constants.vhd CHANGED
@@ -23,12 +23,14 @@ package core_constants is
23
 
24
  constant DEFAULT_EXECUTE_OUTPUT: execute_output_t := (
25
  is_active => '0',
 
26
  result => (others => '0'),
27
  destination_reg => (others => '0')
28
  );
29
 
30
  constant DEFAULT_MEMORY_OUTPUT: memory_output_t := (
31
  is_active => '0',
 
32
  result => (others => '0'),
33
  destination_reg => (others => '0')
34
  );
 
23
 
24
  constant DEFAULT_EXECUTE_OUTPUT: execute_output_t := (
25
  is_active => '0',
26
+ use_mem => '0',
27
  result => (others => '0'),
28
  destination_reg => (others => '0')
29
  );
30
 
31
  constant DEFAULT_MEMORY_OUTPUT: memory_output_t := (
32
  is_active => '0',
33
+ use_mem => '0',
34
  result => (others => '0'),
35
  destination_reg => (others => '0')
36
  );
src/core/types.vhd CHANGED
@@ -44,12 +44,14 @@ package core_types is
44
 
45
  type execute_output_t is record
46
  is_active: std_logic;
 
47
  result: std_logic_vector(31 downto 0);
48
  destination_reg: std_logic_vector(4 downto 0);
49
  end record execute_output_t;
50
 
51
  type memory_output_t is record
52
  is_active: std_logic;
 
53
  result: std_logic_vector(31 downto 0);
54
  destination_reg: std_logic_vector(4 downto 0);
55
  end record memory_output_t;
 
44
 
45
  type execute_output_t is record
46
  is_active: std_logic;
47
+ use_mem: std_logic;
48
  result: std_logic_vector(31 downto 0);
49
  destination_reg: std_logic_vector(4 downto 0);
50
  end record execute_output_t;
51
 
52
  type memory_output_t is record
53
  is_active: std_logic;
54
+ use_mem: std_logic;
55
  result: std_logic_vector(31 downto 0);
56
  destination_reg: std_logic_vector(4 downto 0);
57
  end record memory_output_t;

Now, we need to set this flag in the execute stage whenever we perform a read.

src/core/execute.vhd CHANGED
@@ -145,6 +145,7 @@ begin
145
  v_mem_req.address := input.operand1;
146
  v_mem_req.value := input.operand2;
147
  elsif input.operation = OP_LW then
 
148
  v_mem_req.active := '1';
149
  v_mem_req.cmd := MEM_CMD_READ;
150
  v_mem_req.address := input.operand1;
 
145
  v_mem_req.address := input.operand1;
146
  v_mem_req.value := input.operand2;
147
  elsif input.operation = OP_LW then
148
+ v_output.use_mem := '1';
149
  v_mem_req.active := '1';
150
  v_mem_req.cmd := MEM_CMD_READ;
151
  v_mem_req.address := input.operand1;

Finally, we need to update the writeback stage to actually write back the memory response when the use_mem flag is set.

src/core/decode_write.vhd CHANGED
@@ -47,7 +47,11 @@ begin
47
  if rising_edge(clk) then
48
  -- write back result if the destination register is not x0 (which always stays 0)
49
  if write_input.destination_reg /= "00000" then
50
- reg(to_integer(unsigned(write_input.destination_reg))) <= write_input.result;
 
 
 
 
51
  end if;
52
 
53
  pipeline_ready <= write_input.is_active;
 
47
  if rising_edge(clk) then
48
  -- write back result if the destination register is not x0 (which always stays 0)
49
  if write_input.destination_reg /= "00000" then
50
+ if write_input.use_mem = '1' then
51
+ reg(to_integer(unsigned(write_input.destination_reg))) <= mem_res;
52
+ else
53
+ reg(to_integer(unsigned(write_input.destination_reg))) <= write_input.result;
54
+ end if;
55
  end if;
56
 
57
  pipeline_ready <= write_input.is_active;

That's it, I guess? We can adapt our program from before by adding a load of the same address immediately after the store.

loop:
sll x2, x1, 2
sw x1, 0(x2)
lw x5, 0(x2)
addi x1, x1, 1
j loop

This assembles to

00209113
00112023
00012283
00108093
ff1ff06f

So we'll put this in the instruction memory.

src/core/fetch.vhd CHANGED
@@ -20,7 +20,7 @@ end fetch;
20
  architecture rtl of fetch is
21
  type instruction_memory_t is array(0 to 15) of std_logic_vector(31 downto 0);
22
  signal imem: instruction_memory_t := (
23
- X"00112023", X"00108093", X"00209113", X"ff5ff06f", X"00000000", X"00000000", X"00000000", X"00000000",
24
  X"0000006f", X"00000000", X"00000000", X"00000000", X"00000000", X"00000000", X"00000000", X"00000000"
25
  );
26
 
 
20
  architecture rtl of fetch is
21
  type instruction_memory_t is array(0 to 15) of std_logic_vector(31 downto 0);
22
  signal imem: instruction_memory_t := (
23
+ X"00209113", X"00112023", X"00012283", X"00108093", X"ff1ff06f", X"00000000", X"00000000", X"00000000",
24
  X"0000006f", X"00000000", X"00000000", X"00000000", X"00000000", X"00000000", X"00000000", X"00000000"
25
  );
26
 

When we simulate this... It doesn't work?

After tracing the signals, it becomes obvious we forgot to pass the use_mem flag in the memory stage. We can just update it to also copy this flag:

src/core/memory.vhd CHANGED
@@ -22,6 +22,7 @@ begin
22
  begin
23
  if rising_edge(clk) then
24
  output.is_active <= input.is_active;
 
25
  output.result <= input.result;
26
  output.destination_reg <= input.destination_reg;
27
  end if;
 
22
  begin
23
  if rising_edge(clk) then
24
  output.is_active <= input.is_active;
25
+ output.use_mem <= input.use_mem;
26
  output.result <= input.result;
27
  output.destination_reg <= input.destination_reg;
28
  end if;

Actually, since the memory stage does nothing, we can just remove memory_output_t, because it is exactly the same as execute_output_t. So let's do a bit of cleanup: remove memory_output_t and its associated constants, and replace it with execute_output_t wherever it's used.

cpu.xpr CHANGED
@@ -7,7 +7,7 @@
7
  <Project Product="Vivado" Version="7" Minor="70" Path="/home/ruben/projects/cpucourse2/cpu/cpu.xpr">
8
  <DefaultLaunch Dir="$PRUNDIR"/>
9
  <Configuration>
10
- <Option Name="Id" Val="043dee4e333b4e5db3f851a5c4563b0f"/>
11
  <Option Name="Part" Val="xc7a50tfgg484-1"/>
12
  <Option Name="CompiledLibDir" Val="$PCACHEDIR/compile_simlib"/>
13
  <Option Name="CompiledLibDirXSim" Val=""/>
@@ -58,7 +58,7 @@
58
  <Option Name="IPUserFilesDir" Val="$PIPUSERFILESDIR"/>
59
  <Option Name="IPStaticSourceDir" Val="$PIPUSERFILESDIR/ipstatic"/>
60
  <Option Name="EnableBDX" Val="FALSE"/>
61
- <Option Name="WTXSimLaunchSim" Val="2"/>
62
  <Option Name="WTModelSimLaunchSim" Val="0"/>
63
  <Option Name="WTQuestaLaunchSim" Val="0"/>
64
  <Option Name="WTIesLaunchSim" Val="0"/>
@@ -89,55 +89,73 @@
89
  <FileSets Version="1" Minor="32">
90
  <FileSet Name="sources_1" Type="DesignSrcs" RelSrcDir="$PSRCDIR/sources_1" RelGenDir="$PGENDIR/sources_1">
91
  <Filter Type="Srcs"/>
92
- <File Path="$PPRDIR/src/top_level.vhd">
93
  <FileInfo>
94
  <Attr Name="UsedIn" Val="synthesis"/>
95
  <Attr Name="UsedIn" Val="simulation"/>
96
  </FileInfo>
97
  </File>
98
- <File Path="$PPRDIR/src/core/constants.vhd">
99
  <FileInfo>
100
- <Attr Name="AutoDisabled" Val="1"/>
101
  <Attr Name="UsedIn" Val="synthesis"/>
102
  <Attr Name="UsedIn" Val="simulation"/>
103
  </FileInfo>
104
  </File>
105
- <File Path="$PPRDIR/src/core/execute.vhd">
106
  <FileInfo>
107
- <Attr Name="AutoDisabled" Val="1"/>
108
  <Attr Name="UsedIn" Val="synthesis"/>
109
  <Attr Name="UsedIn" Val="simulation"/>
110
  </FileInfo>
111
  </File>
112
- <File Path="$PPRDIR/src/core/memory.vhd">
113
  <FileInfo>
114
- <Attr Name="AutoDisabled" Val="1"/>
115
  <Attr Name="UsedIn" Val="synthesis"/>
116
  <Attr Name="UsedIn" Val="simulation"/>
117
  </FileInfo>
118
  </File>
119
- <File Path="$PPRDIR/src/core/types.vhd">
120
  <FileInfo>
121
- <Attr Name="AutoDisabled" Val="1"/>
122
  <Attr Name="UsedIn" Val="synthesis"/>
123
  <Attr Name="UsedIn" Val="simulation"/>
124
  </FileInfo>
125
  </File>
126
  <File Path="$PPRDIR/src/core/decode_write.vhd">
127
  <FileInfo>
128
- <Attr Name="AutoDisabled" Val="1"/>
 
 
 
 
 
129
  <Attr Name="UsedIn" Val="synthesis"/>
130
  <Attr Name="UsedIn" Val="simulation"/>
131
  </FileInfo>
132
  </File>
133
  <File Path="$PPRDIR/src/core/fetch.vhd">
134
  <FileInfo>
135
- <Attr Name="AutoDisabled" Val="1"/>
136
  <Attr Name="UsedIn" Val="synthesis"/>
137
  <Attr Name="UsedIn" Val="simulation"/>
138
  </FileInfo>
139
  </File>
140
- <File Path="$PPRDIR/src/core.vhd">
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  <FileInfo>
142
  <Attr Name="AutoDisabled" Val="1"/>
143
  <Attr Name="UsedIn" Val="synthesis"/>
@@ -163,13 +181,13 @@
163
  </FileSet>
164
  <FileSet Name="sim_1" Type="SimulationSrcs" RelSrcDir="$PSRCDIR/sim_1" RelGenDir="$PGENDIR/sim_1">
165
  <Filter Type="Srcs"/>
166
- <File Path="$PPRDIR/sim/core_tb.vhd">
167
  <FileInfo>
168
  <Attr Name="UsedIn" Val="synthesis"/>
169
  <Attr Name="UsedIn" Val="simulation"/>
170
  </FileInfo>
171
  </File>
172
- <File Path="$PPRDIR/sim/top_level_tb.vhd">
173
  <FileInfo>
174
  <Attr Name="AutoDisabled" Val="1"/>
175
  <Attr Name="UsedIn" Val="synthesis"/>
@@ -178,9 +196,8 @@
178
  </File>
179
  <Config>
180
  <Option Name="DesignMode" Val="RTL"/>
181
- <Option Name="TopModule" Val="core_tb"/>
182
  <Option Name="TopLib" Val="xil_defaultlib"/>
183
- <Option Name="TopAutoSet" Val="TRUE"/>
184
  <Option Name="TransportPathDelay" Val="0"/>
185
  <Option Name="TransportIntDelay" Val="0"/>
186
  <Option Name="SelectedSimModel" Val="rtl"/>
@@ -224,11 +241,12 @@
224
  </Simulator>
225
  </Simulators>
226
  <Runs Version="1" Minor="22">
227
- <Run Id="synth_1" Type="Ft3:Synth" SrcSet="sources_1" Part="xc7a50tfgg484-1" ConstrsSet="constrs_1" Description="Vivado Synthesis Defaults" AutoIncrementalCheckpoint="true" WriteIncrSynthDcp="false" State="current" IncludeInArchive="true" IsChild="false" AutoIncrementalDir="$PSRCDIR/utils_1/imports/synth_1" AutoRQSDir="$PSRCDIR/utils_1/imports/synth_1" ParallelReportGen="true">
228
  <Strategy Version="1" Minor="2">
229
  <StratHandle Name="Vivado Synthesis Defaults" Flow="Vivado Synthesis 2025"/>
230
  <Step Id="synth_design"/>
231
  </Strategy>
 
232
  <ReportStrategy Name="Vivado Synthesis Default Reports" Flow="Vivado Synthesis 2025"/>
233
  <Report Name="ROUTE_DESIGN.REPORT_METHODOLOGY" Enabled="1"/>
234
  <RQSFiles/>
 
7
  <Project Product="Vivado" Version="7" Minor="70" Path="/home/ruben/projects/cpucourse2/cpu/cpu.xpr">
8
  <DefaultLaunch Dir="$PRUNDIR"/>
9
  <Configuration>
10
+ <Option Name="Id" Val="4a9cfec0f8464be581feae96340e3ce2"/>
11
  <Option Name="Part" Val="xc7a50tfgg484-1"/>
12
  <Option Name="CompiledLibDir" Val="$PCACHEDIR/compile_simlib"/>
13
  <Option Name="CompiledLibDirXSim" Val=""/>
 
58
  <Option Name="IPUserFilesDir" Val="$PIPUSERFILESDIR"/>
59
  <Option Name="IPStaticSourceDir" Val="$PIPUSERFILESDIR/ipstatic"/>
60
  <Option Name="EnableBDX" Val="FALSE"/>
61
+ <Option Name="WTXSimLaunchSim" Val="5"/>
62
  <Option Name="WTModelSimLaunchSim" Val="0"/>
63
  <Option Name="WTQuestaLaunchSim" Val="0"/>
64
  <Option Name="WTIesLaunchSim" Val="0"/>
 
89
  <FileSets Version="1" Minor="32">
90
  <FileSet Name="sources_1" Type="DesignSrcs" RelSrcDir="$PSRCDIR/sources_1" RelGenDir="$PGENDIR/sources_1">
91
  <Filter Type="Srcs"/>
92
+ <File Path="$PPRDIR/src/types.vhd">
93
  <FileInfo>
94
  <Attr Name="UsedIn" Val="synthesis"/>
95
  <Attr Name="UsedIn" Val="simulation"/>
96
  </FileInfo>
97
  </File>
98
+ <File Path="$PPRDIR/src/constants.vhd">
99
  <FileInfo>
 
100
  <Attr Name="UsedIn" Val="synthesis"/>
101
  <Attr Name="UsedIn" Val="simulation"/>
102
  </FileInfo>
103
  </File>
104
+ <File Path="$PPRDIR/src/core/types.vhd">
105
  <FileInfo>
 
106
  <Attr Name="UsedIn" Val="synthesis"/>
107
  <Attr Name="UsedIn" Val="simulation"/>
108
  </FileInfo>
109
  </File>
110
+ <File Path="$PPRDIR/src/core/constants.vhd">
111
  <FileInfo>
 
112
  <Attr Name="UsedIn" Val="synthesis"/>
113
  <Attr Name="UsedIn" Val="simulation"/>
114
  </FileInfo>
115
  </File>
116
+ <File Path="$PPRDIR/src/core.vhd">
117
  <FileInfo>
 
118
  <Attr Name="UsedIn" Val="synthesis"/>
119
  <Attr Name="UsedIn" Val="simulation"/>
120
  </FileInfo>
121
  </File>
122
  <File Path="$PPRDIR/src/core/decode_write.vhd">
123
  <FileInfo>
124
+ <Attr Name="UsedIn" Val="synthesis"/>
125
+ <Attr Name="UsedIn" Val="simulation"/>
126
+ </FileInfo>
127
+ </File>
128
+ <File Path="$PPRDIR/src/core/execute.vhd">
129
+ <FileInfo>
130
  <Attr Name="UsedIn" Val="synthesis"/>
131
  <Attr Name="UsedIn" Val="simulation"/>
132
  </FileInfo>
133
  </File>
134
  <File Path="$PPRDIR/src/core/fetch.vhd">
135
  <FileInfo>
 
136
  <Attr Name="UsedIn" Val="synthesis"/>
137
  <Attr Name="UsedIn" Val="simulation"/>
138
  </FileInfo>
139
  </File>
140
+ <File Path="$PPRDIR/src/mem_subsys.vhd">
141
+ <FileInfo>
142
+ <Attr Name="UsedIn" Val="synthesis"/>
143
+ <Attr Name="UsedIn" Val="simulation"/>
144
+ </FileInfo>
145
+ </File>
146
+ <File Path="$PPRDIR/src/core/memory.vhd">
147
+ <FileInfo>
148
+ <Attr Name="UsedIn" Val="synthesis"/>
149
+ <Attr Name="UsedIn" Val="simulation"/>
150
+ </FileInfo>
151
+ </File>
152
+ <File Path="$PPRDIR/src/top_level.vhd">
153
+ <FileInfo>
154
+ <Attr Name="UsedIn" Val="synthesis"/>
155
+ <Attr Name="UsedIn" Val="simulation"/>
156
+ </FileInfo>
157
+ </File>
158
+ <File Path="$PPRDIR/src/bram.vhd">
159
  <FileInfo>
160
  <Attr Name="AutoDisabled" Val="1"/>
161
  <Attr Name="UsedIn" Val="synthesis"/>
 
181
  </FileSet>
182
  <FileSet Name="sim_1" Type="SimulationSrcs" RelSrcDir="$PSRCDIR/sim_1" RelGenDir="$PGENDIR/sim_1">
183
  <Filter Type="Srcs"/>
184
+ <File Path="$PPRDIR/sim/top_level_tb.vhd">
185
  <FileInfo>
186
  <Attr Name="UsedIn" Val="synthesis"/>
187
  <Attr Name="UsedIn" Val="simulation"/>
188
  </FileInfo>
189
  </File>
190
+ <File Path="$PPRDIR/sim/core_tb.vhd">
191
  <FileInfo>
192
  <Attr Name="AutoDisabled" Val="1"/>
193
  <Attr Name="UsedIn" Val="synthesis"/>
 
196
  </File>
197
  <Config>
198
  <Option Name="DesignMode" Val="RTL"/>
199
+ <Option Name="TopModule" Val="top_level_tb"/>
200
  <Option Name="TopLib" Val="xil_defaultlib"/>
 
201
  <Option Name="TransportPathDelay" Val="0"/>
202
  <Option Name="TransportIntDelay" Val="0"/>
203
  <Option Name="SelectedSimModel" Val="rtl"/>
 
241
  </Simulator>
242
  </Simulators>
243
  <Runs Version="1" Minor="22">
244
+ <Run Id="synth_1" Type="Ft3:Synth" SrcSet="sources_1" Part="xc7a50tfgg484-1" ConstrsSet="constrs_1" Description="Vivado Synthesis Defaults" AutoIncrementalCheckpoint="true" WriteIncrSynthDcp="false" State="current" Dir="$PRUNDIR/synth_1" IncludeInArchive="true" IsChild="false" AutoIncrementalDir="$PSRCDIR/utils_1/imports/synth_1" AutoRQSDir="$PSRCDIR/utils_1/imports/synth_1" ParallelReportGen="true">
245
  <Strategy Version="1" Minor="2">
246
  <StratHandle Name="Vivado Synthesis Defaults" Flow="Vivado Synthesis 2025"/>
247
  <Step Id="synth_design"/>
248
  </Strategy>
249
+ <GeneratedRun Dir="$PRUNDIR" File="gen_run.xml"/>
250
  <ReportStrategy Name="Vivado Synthesis Default Reports" Flow="Vivado Synthesis 2025"/>
251
  <Report Name="ROUTE_DESIGN.REPORT_METHODOLOGY" Enabled="1"/>
252
  <RQSFiles/>
src/core.vhd CHANGED
@@ -22,7 +22,7 @@ architecture rtl of core is
22
  signal fetch_output: fetch_output_t;
23
  signal decode_output: decode_output_t;
24
  signal execute_output: execute_output_t;
25
- signal memory_output: memory_output_t;
26
  signal pipeline_ready: std_logic;
27
  signal jump: std_logic;
28
  signal jump_address: std_logic_vector(31 downto 0);
@@ -42,7 +42,7 @@ architecture rtl of core is
42
  clk: in std_logic;
43
  decode_input: in fetch_output_t;
44
  decode_output: out decode_output_t;
45
- write_input: in memory_output_t;
46
  mem_res: in std_logic_vector(31 downto 0);
47
  pipeline_ready: out std_logic
48
  );
@@ -64,7 +64,7 @@ architecture rtl of core is
64
  port (
65
  clk: in std_logic;
66
  input: in execute_output_t;
67
- output: out memory_output_t
68
  );
69
  end component;
70
 
 
22
  signal fetch_output: fetch_output_t;
23
  signal decode_output: decode_output_t;
24
  signal execute_output: execute_output_t;
25
+ signal memory_output: execute_output_t;
26
  signal pipeline_ready: std_logic;
27
  signal jump: std_logic;
28
  signal jump_address: std_logic_vector(31 downto 0);
 
42
  clk: in std_logic;
43
  decode_input: in fetch_output_t;
44
  decode_output: out decode_output_t;
45
+ write_input: in execute_output_t;
46
  mem_res: in std_logic_vector(31 downto 0);
47
  pipeline_ready: out std_logic
48
  );
 
64
  port (
65
  clk: in std_logic;
66
  input: in execute_output_t;
67
+ output: out execute_output_t
68
  );
69
  end component;
70
 
src/core/constants.vhd CHANGED
@@ -27,11 +27,4 @@ package core_constants is
27
  result => (others => '0'),
28
  destination_reg => (others => '0')
29
  );
30
-
31
- constant DEFAULT_MEMORY_OUTPUT: memory_output_t := (
32
- is_active => '0',
33
- use_mem => '0',
34
- result => (others => '0'),
35
- destination_reg => (others => '0')
36
- );
37
  end package core_constants;
 
27
  result => (others => '0'),
28
  destination_reg => (others => '0')
29
  );
 
 
 
 
 
 
 
30
  end package core_constants;
src/core/decode_write.vhd CHANGED
@@ -13,7 +13,7 @@ entity decode_write is
13
  decode_input: in fetch_output_t;
14
  decode_output: out decode_output_t := DEFAULT_DECODE_OUTPUT;
15
 
16
- write_input: in memory_output_t;
17
  mem_res: in std_logic_vector(31 downto 0);
18
  pipeline_ready: out std_logic := '1'
19
  );
 
13
  decode_input: in fetch_output_t;
14
  decode_output: out decode_output_t := DEFAULT_DECODE_OUTPUT;
15
 
16
+ write_input: in execute_output_t;
17
  mem_res: in std_logic_vector(31 downto 0);
18
  pipeline_ready: out std_logic := '1'
19
  );
src/core/memory.vhd CHANGED
@@ -10,7 +10,7 @@ entity memory is
10
  port (
11
  clk: in std_logic;
12
  input: in execute_output_t;
13
- output: out memory_output_t := DEFAULT_MEMORY_OUTPUT
14
  );
15
  end memory;
16
 
 
10
  port (
11
  clk: in std_logic;
12
  input: in execute_output_t;
13
+ output: out execute_output_t := DEFAULT_EXECUTE_OUTPUT
14
  );
15
  end memory;
16
 
src/core/types.vhd CHANGED
@@ -48,11 +48,4 @@ package core_types is
48
  result: std_logic_vector(31 downto 0);
49
  destination_reg: std_logic_vector(4 downto 0);
50
  end record execute_output_t;
51
-
52
- type memory_output_t is record
53
- is_active: std_logic;
54
- use_mem: std_logic;
55
- result: std_logic_vector(31 downto 0);
56
- destination_reg: std_logic_vector(4 downto 0);
57
- end record memory_output_t;
58
  end package core_types;
 
48
  result: std_logic_vector(31 downto 0);
49
  destination_reg: std_logic_vector(4 downto 0);
50
  end record execute_output_t;
 
 
 
 
 
 
 
51
  end package core_types;

The memory stage can now be simplified.

src/core/memory.vhd CHANGED
@@ -21,10 +21,7 @@ begin
21
  process (clk)
22
  begin
23
  if rising_edge(clk) then
24
- output.is_active <= input.is_active;
25
- output.use_mem <= input.use_mem;
26
- output.result <= input.result;
27
- output.destination_reg <= input.destination_reg;
28
  end if;
29
  end process;
30
 
 
21
  process (clk)
22
  begin
23
  if rising_edge(clk) then
24
+ output <= input;
 
 
 
25
  end if;
26
  end process;
27
 

We now want to simulate this. From now on, we'll always want to use top_level_tb.vhd, because just the core is not enough. We might as well delete core_tb.vhd to avoid confusion.

sim/core_tb.vhd DELETED
@@ -1,31 +0,0 @@
1
- library ieee;
2
- use ieee.std_logic_1164.all;
3
- use ieee.numeric_std.all;
4
-
5
-
6
- entity core_tb is
7
- end core_tb;
8
-
9
-
10
- architecture behavioral of core_tb is
11
- constant clk_period: time := 10 ns;
12
- signal clk: std_logic := '1';
13
-
14
- component core is
15
- port (
16
- clk: in std_logic
17
- );
18
- end component;
19
-
20
- begin
21
- clk_process :process
22
- begin
23
- clk <= '1';
24
- wait for clk_period / 2;
25
- clk <= '0';
26
- wait for clk_period / 2;
27
- end process;
28
-
29
- core_inst: core port map(clk => clk);
30
-
31
- end behavioral;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

If we now simulate for 500ns and watch the x5 register, we can see the successive values getting loaded.

Simulation waveforms

Next, we're going to implement byte and halfword stores, which require us to write only some of the bytes, instead of always the whole 32-bit word.

To support this, I am going to copy and edit some code from AMD's docs that is supposed to infer a block RAM. This code supports a "write enable" input, which I want to use.

src/bram.vhd ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ library ieee;
2
+ use ieee.std_logic_1164.all;
3
+ use ieee.std_logic_unsigned.all;
4
+
5
+ entity bram is
6
+ generic(
7
+ SIZE: integer := 1024;
8
+ ADDR_WIDTH: integer := 10;
9
+ COL_WIDTH: integer := 8;
10
+ NB_COL: integer := 4
11
+ );
12
+ port(
13
+ clka: in std_logic;
14
+ ena: in std_logic;
15
+ wea: in std_logic_vector(NB_COL - 1 downto 0);
16
+ addra: in std_logic_vector(ADDR_WIDTH - 1 downto 0);
17
+ dia: in std_logic_vector(NB_COL * COL_WIDTH - 1 downto 0);
18
+ doa: out std_logic_vector(NB_COL * COL_WIDTH - 1 downto 0)
19
+ -- clkb: in std_logic;
20
+ -- enb: in std_logic;
21
+ -- web: in std_logic_vector(NB_COL - 1 downto 0);
22
+ -- addrb: in std_logic_vector(ADDR_WIDTH - 1 downto 0);
23
+ -- dib: in std_logic_vector(NB_COL * COL_WIDTH - 1 downto 0);
24
+ -- dob: out std_logic_vector(NB_COL * COL_WIDTH - 1 downto 0)
25
+ );
26
+ end bram;
27
+
28
+ architecture rtl of bram is
29
+ type ram_type is array (0 to SIZE - 1) of std_logic_vector(NB_COL * COL_WIDTH - 1 downto 0);
30
+ -- shared variable RAM: ram_type := (others => (others => '0'));
31
+ signal RAM: ram_type := (others => (others => '0'));
32
+
33
+ begin
34
+
35
+ -- port A
36
+ process(clka)
37
+ begin
38
+ if rising_edge(clka) then
39
+ if ena = '1' then
40
+ for i in 0 to NB_COL - 1 loop
41
+ if wea(i) = '1' then
42
+ RAM(conv_integer(addra))((i + 1) * COL_WIDTH - 1 downto i * COL_WIDTH) <= dia((i + 1) * COL_WIDTH - 1 downto i * COL_WIDTH);
43
+ end if;
44
+ end loop;
45
+ doa <= RAM(conv_integer(addra));
46
+ end if;
47
+ end if;
48
+ end process;
49
+
50
+ -- port B
51
+ -- process(clkb)
52
+ -- begin
53
+ -- if rising_edge(clkb) then
54
+ -- if enb = '1' then
55
+ -- for i in 0 to NB_COL - 1 loop
56
+ -- if web(i) = '1' then
57
+ -- RAM(conv_integer(addrb))((i + 1) * COL_WIDTH - 1 downto i * COL_WIDTH) := dib((i + 1) * COL_WIDTH - 1 downto i * COL_WIDTH);
58
+ -- end if;
59
+ -- end loop;
60
+ -- dob <= RAM(conv_integer(addrb));
61
+ -- end if;
62
+ -- end if;
63
+ -- end process;
64
+ end rtl;

Now, we'll hook up the mem_subsys code to use this bram.

src/constants.vhd CHANGED
@@ -7,7 +7,7 @@ use work.types.all;
7
  package constants is
8
  constant DEFAULT_MEM_REQ: mem_req_t := (
9
  active => '0',
10
- cmd => MEM_CMD_READ,
11
  address => (others => '0'),
12
  value => (others => '0')
13
  );
 
7
  package constants is
8
  constant DEFAULT_MEM_REQ: mem_req_t := (
9
  active => '0',
10
+ write => '0',
11
  address => (others => '0'),
12
  value => (others => '0')
13
  );
src/core/execute.vhd CHANGED
@@ -141,13 +141,13 @@ begin
141
  end if;
142
  elsif input.operation = OP_SW then
143
  v_mem_req.active := '1';
144
- v_mem_req.cmd := MEM_CMD_WRITE;
145
  v_mem_req.address := input.operand1;
146
  v_mem_req.value := input.operand2;
147
  elsif input.operation = OP_LW then
148
  v_output.use_mem := '1';
149
  v_mem_req.active := '1';
150
- v_mem_req.cmd := MEM_CMD_READ;
151
  v_mem_req.address := input.operand1;
152
  elsif input.operation = OP_LED then
153
  led <= input.operand1(7 downto 0);
 
141
  end if;
142
  elsif input.operation = OP_SW then
143
  v_mem_req.active := '1';
144
+ v_mem_req.write := '1';
145
  v_mem_req.address := input.operand1;
146
  v_mem_req.value := input.operand2;
147
  elsif input.operation = OP_LW then
148
  v_output.use_mem := '1';
149
  v_mem_req.active := '1';
150
+ v_mem_req.write := '0';
151
  v_mem_req.address := input.operand1;
152
  elsif input.operation = OP_LED then
153
  led <= input.operand1(7 downto 0);
src/mem_subsys.vhd CHANGED
@@ -16,23 +16,24 @@ end mem_subsys;
16
 
17
 
18
  architecture rtl of mem_subsys is
19
- type ram_t is array (0 to 1023) of std_logic_vector(31 downto 0);
20
- signal ram: ram_t := (others => (others => '0'));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  begin
 
23
 
24
- process (clk)
25
- begin
26
- if rising_edge(clk) then
27
- if req.active = '1' then
28
- if req.cmd = MEM_CMD_WRITE then
29
- ram(to_integer(unsigned(req.address(11 downto 2)))) <= req.value;
30
- else
31
- res <= ram(to_integer(unsigned(req.address(11 downto 2))));
32
- end if;
33
- else
34
- res <= (others => '0');
35
- end if;
36
- end if;
37
- end process;
38
  end rtl;
 
16
 
17
 
18
  architecture rtl of mem_subsys is
19
+ component bram is
20
+ generic(
21
+ SIZE: integer := 1024;
22
+ ADDR_WIDTH: integer := 10;
23
+ COL_WIDTH: integer := 8;
24
+ NB_COL: integer := 4
25
+ );
26
+ port(
27
+ clka: in std_logic;
28
+ ena: in std_logic;
29
+ wea: in std_logic_vector(NB_COL - 1 downto 0);
30
+ addra: in std_logic_vector(ADDR_WIDTH - 1 downto 0);
31
+ dia: in std_logic_vector(NB_COL * COL_WIDTH - 1 downto 0);
32
+ doa: out std_logic_vector(NB_COL * COL_WIDTH - 1 downto 0)
33
+ );
34
+ end component;
35
 
36
  begin
37
+ bram_inst: bram port map(clka => clk, ena => req.active, wea => (others => req.write), addra => req.address(11 downto 2), dia => req.value, doa => res);
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  end rtl;
src/types.vhd CHANGED
@@ -7,7 +7,7 @@ package types is
7
 
8
  type mem_req_t is record
9
  active: std_logic;
10
- cmd: mem_cmd_t;
11
  address: std_logic_vector(31 downto 0);
12
  value: std_logic_vector(31 downto 0);
13
  end record mem_req_t;
 
7
 
8
  type mem_req_t is record
9
  active: std_logic;
10
+ write: std_logic;
11
  address: std_logic_vector(31 downto 0);
12
  value: std_logic_vector(31 downto 0);
13
  end record mem_req_t;

In simulation we see that our memory subsystem works just as before. However, we now have a wea signal that we can use to implement writes that only write some bytes. We want to pass this directly from the execute stage so that we can implement halfword- and byte-sized loads and stores.

src/constants.vhd CHANGED
@@ -7,7 +7,7 @@ use work.types.all;
7
  package constants is
8
  constant DEFAULT_MEM_REQ: mem_req_t := (
9
  active => '0',
10
- write => '0',
11
  address => (others => '0'),
12
  value => (others => '0')
13
  );
 
7
  package constants is
8
  constant DEFAULT_MEM_REQ: mem_req_t := (
9
  active => '0',
10
+ write_enable => "0000",
11
  address => (others => '0'),
12
  value => (others => '0')
13
  );
src/core/execute.vhd CHANGED
@@ -141,13 +141,13 @@ begin
141
  end if;
142
  elsif input.operation = OP_SW then
143
  v_mem_req.active := '1';
144
- v_mem_req.write := '1';
145
  v_mem_req.address := input.operand1;
146
  v_mem_req.value := input.operand2;
147
  elsif input.operation = OP_LW then
148
  v_output.use_mem := '1';
149
  v_mem_req.active := '1';
150
- v_mem_req.write := '0';
151
  v_mem_req.address := input.operand1;
152
  elsif input.operation = OP_LED then
153
  led <= input.operand1(7 downto 0);
 
141
  end if;
142
  elsif input.operation = OP_SW then
143
  v_mem_req.active := '1';
144
+ v_mem_req.write_enable := "1111";
145
  v_mem_req.address := input.operand1;
146
  v_mem_req.value := input.operand2;
147
  elsif input.operation = OP_LW then
148
  v_output.use_mem := '1';
149
  v_mem_req.active := '1';
150
+ v_mem_req.write_enable := "0000";
151
  v_mem_req.address := input.operand1;
152
  elsif input.operation = OP_LED then
153
  led <= input.operand1(7 downto 0);
src/mem_subsys.vhd CHANGED
@@ -34,6 +34,6 @@ architecture rtl of mem_subsys is
34
  end component;
35
 
36
  begin
37
- bram_inst: bram port map(clka => clk, ena => req.active, wea => (others => req.write), addra => req.address(11 downto 2), dia => req.value, doa => res);
38
 
39
  end rtl;
 
34
  end component;
35
 
36
  begin
37
+ bram_inst: bram port map(clka => clk, ena => req.active, wea => req.write_enable, addra => req.address(11 downto 2), dia => req.value, doa => res);
38
 
39
  end rtl;
src/types.vhd CHANGED
@@ -7,7 +7,7 @@ package types is
7
 
8
  type mem_req_t is record
9
  active: std_logic;
10
- write: std_logic;
11
  address: std_logic_vector(31 downto 0);
12
  value: std_logic_vector(31 downto 0);
13
  end record mem_req_t;
 
7
 
8
  type mem_req_t is record
9
  active: std_logic;
10
+ write_enable: std_logic_vector(3 downto 0);
11
  address: std_logic_vector(31 downto 0);
12
  value: std_logic_vector(31 downto 0);
13
  end record mem_req_t;