wsnの博客

记录学习经历和一点点日常

0%

计组实验——流水线CPU及两种数据冒险和阻塞的处理

流水线CPU相关概念

  1. CPU执行一条指令时分为五个阶段:(1)从内存取指令(2)根据指令读寄存器(3)利用寄存器中的数据进行ALU运算(4)访问内存(5)写寄存器。一般是这五个阶段,但并不是每条指令在这五个阶段都有事情要做。比如add,它只用到四个阶段,不涉及内存的访问;而有的指令五个阶段都要做事情,比如lw。既然是通用的CPU,就要尽可能支持更多的指令,所以阶段数要按最长的指令来设计,这是一种短板效应。

  2. CPU如果是一条一条地执行指令,那么就会出现这种情况:比如add,当它执行到第二阶段时,第一阶段的取指令部件就空闲了下来;同理,随着指令向后推进,越来越多的部件被空闲。这显然是不行的,对于追求效率的CPU是不能容忍的,于是受工厂流水线的启发,提出了以流水线形式工作的CPU。

  1. 图例:

1

​ 两种数据冒险

​ EXE级冒险 1:

2

​ EXE级冒险 2:

3

​ 阻塞:

image-20210619135550955

实现过程

pc模块

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
// Program counter register.
//   reset    : asynchronous, active low; loads 32'h00003000.
//   pc_write : when 0 the counter holds its value, otherwise it loads npc
//              on the rising clock edge.
module pc(
    output reg [31:0] pc,
    input             clock,
    input             reset,
    input      [31:0] npc,
    input             pc_write
);
    always @(posedge clock or negedge reset) begin
        if (!reset) begin
            pc <= 32'h00003000;
        end else begin
            // Compare against 0 (not "if (pc_write)") so that an unknown
            // pc_write still loads npc, as the original code does.
            if (pc_write == 1'b0)
                pc <= pc;
            else
                pc <= npc;
        end
    end
endmodule

alu模块

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
`include "header.v"
// Combinational ALU. The operation is chosen by aluop using the encodings
// from header.v; any unlisted aluop value yields 0.
module alu(
    output reg [31:0] c,
    input      [31:0] a,
    input      [31:0] b,
    input      [3:0]  aluop
);
    always @(*) begin
        case (aluop)
            // A 32-bit sum has the same bit pattern whether the operands are
            // viewed as signed or unsigned, so both adds share one arm.
            `ADDU,
            `ADD  : c = a + b;
            `SUB  : c = a - b;
            `AND  : c = a & b;
            `OR   : c = a | b;
            // Signed less-than, zero-extended to 32 bits.
            `SLT  : c = {31'b0, ($signed(a) < $signed(b))};
            // Low half of b moved into the upper half, lower half cleared.
            `LUI  : c = {b[15:0], 16'b0};
            default: c = 32'b0;
        endcase
    end
endmodule

bypass 模块

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
// Forwarding (bypass) control.
//   forwardA / forwardB encode the source for the two ALU operands:
//     2'b00 : no forwarding
//     2'b10 : take the value associated with num_write_1 / regwrite_1
//     2'b01 : take the value associated with num_write_2 / regwrite_2
//   The *_1 producer wins when both match. A producer whose destination is
//   register 0 never forwards.
module bypass(
    output reg [1:0] forwardA,
    output reg [1:0] forwardB,
    input      [4:0] rs,
    input      [4:0] rt,
    input      [4:0] num_write_1,
    input      [4:0] num_write_2,
    input            regwrite_1,
    input            regwrite_2
);
    // A producer is usable only if it really writes a non-zero register.
    wire src1_valid = (regwrite_1 == 1) && (num_write_1 != 0);
    wire src2_valid = (regwrite_2 == 1) && (num_write_2 != 0);

    always @(*) begin
        if (src1_valid && (rs == num_write_1))
            forwardA = 2'b10;
        else if (src2_valid && (rs == num_write_2))
            forwardA = 2'b01;
        else
            forwardA = 2'b00;

        if (src1_valid && (rt == num_write_1))
            forwardB = 2'b10;
        else if (src2_valid && (rt == num_write_2))
            forwardB = 2'b01;
        else
            forwardB = 2'b00;
    end
endmodule

ctrl模块

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
`include "header.v"
// Main control decoder: maps op/funct to the datapath control signals.
// Any opcode not listed leaves every control output at 0 and aluop unknown.
module ctrl(
    input      [5:0] op,
    input      [5:0] funct,
    output reg       reg_write,
    output reg [3:0] aluop,
    output reg       s_ext,
    output reg       mem_write,
    output reg       s_data_write,
    output reg       s_b,
    output reg       s_num_write
);
    always @(*) begin
        // Bundle order used below:
        //   {reg_write, s_b, s_ext, s_num_write, mem_write, s_data_write}
        {reg_write, s_b, s_ext, s_num_write, mem_write, s_data_write} = 6'b000000;
        aluop = 4'bxxxx;

        case (op)
            `op_R: begin
                {reg_write, s_b, s_ext, s_num_write, mem_write, s_data_write} = 6'b100000;
                // An unrecognised funct keeps the unknown aluop set above.
                case (funct)
                    `F_ADDU: aluop = `ADDU;
                    `F_SUBU: aluop = `SUB;
                    `F_ADD : aluop = `ADD;
                    `F_AND : aluop = `AND;
                    `F_OR  : aluop = `OR;
                    `F_SLT : aluop = `SLT;
                endcase
            end

            `op_addi, `op_addiu: begin
                {reg_write, s_b, s_ext, s_num_write, mem_write, s_data_write} = 6'b111100;
                aluop = `ADD;
            end

            `op_andi: begin
                {reg_write, s_b, s_ext, s_num_write, mem_write, s_data_write} = 6'b110100;
                aluop = `AND;
            end

            `op_ori: begin
                {reg_write, s_b, s_ext, s_num_write, mem_write, s_data_write} = 6'b110100;
                aluop = `OR;
            end

            `op_lui: begin
                {reg_write, s_b, s_ext, s_num_write, mem_write, s_data_write} = 6'b110100;
                aluop = `LUI;
            end

            `op_lw: begin
                {reg_write, s_b, s_ext, s_num_write, mem_write, s_data_write} = 6'b111101;
                aluop = `ADD;
            end

            `op_sw: begin
                {reg_write, s_b, s_ext, s_num_write, mem_write, s_data_write} = 6'b011010;
                aluop = `ADD;
            end
        endcase
    end
endmodule

dm模块

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
// Data memory: 1024 words of 32 bits.
// Reads are combinational; writes happen on the rising clock edge when
// mem_write is set. Only address bits [11:2] select the word.
module dm(
    output [31:0] data_out,
    input         clock,
    input         mem_write,
    input  [31:0] address,
    input  [31:0] data_in
);
    reg [31:0] data_memory[1023:0]; // 4K data memory

    wire [9:0] word_index = address[11:2];

    assign data_out = data_memory[word_index];

    always @(posedge clock) begin
        if (mem_write)
            data_memory[word_index] <= data_in;
    end
endmodule

exe_mem寄存器

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
// EXE/MEM pipeline register.
//   n_*   : values produced in the EXE stage.
//   c, b  : registered ALU result and registered n_b value.
//   num_write, regwrite, mem_write, s_data_write : registered destination
//           register number and control bits.
//   reset : asynchronous, active low. Every output is cleared, including the
//           control bits, so no memory or register write enable is left
//           undefined when the design comes out of reset.
module exe_mem (
    n_c, n_b, n_num_write, n_mem_write, n_s_data_write,
    c, b, num_write, mem_write, s_data_write,
    n_regwrite, regwrite, clock, reset
);
    input  [31:0] n_c, n_b;
    input  [4:0]  n_num_write;
    input         n_s_data_write, n_mem_write, n_regwrite, clock, reset;
    output reg [31:0] c, b;
    output reg [4:0]  num_write;
    output reg        mem_write, s_data_write, regwrite;

    always @(posedge clock or negedge reset) begin
        if (reset == 0) begin
            c            <= 32'b0;
            b            <= 32'b0;
            num_write    <= 5'b0;
            // Control bits must be reset as well: leaving them out keeps them
            // at X after reset and mixes reset and non-reset flops in one
            // process.
            s_data_write <= 1'b0;
            mem_write    <= 1'b0;
            regwrite     <= 1'b0;
        end else begin
            c            <= n_c;
            b            <= n_b;
            num_write    <= n_num_write;
            s_data_write <= n_s_data_write;
            mem_write    <= n_mem_write;
            regwrite     <= n_regwrite;
        end
    end
endmodule

ext模块

1
2
3
4
5
6
7
8
9
10
11
12
13
// 16-bit to 32-bit immediate extender.
//   extop = 1 : sign-extend (replicate immediate[15])
//   otherwise : zero-extend
module ext(immediate,extop,extimmediate);
    input      [15:0] immediate;
    input             extop;
    output reg [31:0] extimmediate;

    // Purely combinational logic, so blocking assignments are used;
    // non-blocking assignments here only add a delta-cycle delay in
    // simulation and are flagged by lint tools.
    always @(*) begin
        if (extop)
            extimmediate = {{16{immediate[15]}}, immediate[15:0]};
        else
            extimmediate = {{16{1'b0}}, immediate[15:0]};
    end
endmodule

gpr模块

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
// General-purpose register file: 32 registers of 32 bits, two combinational
// read ports (a for rs, b for rt) and one write port clocked on the rising
// edge.
//
// Changes from the earlier version:
//   * Read-during-write bypass: when the register being read is also being
//     written in this cycle, the read port returns data_write. The write only
//     reaches the array at the clock edge, so without this a reader that
//     samples the port at that same edge captures the stale value.
//   * Register 0 is never written; it reads as 0 and its storage stays 0.
//   * The "always @(*) gp_registers[0] <= 0" process was removed: it had no
//     inputs, so it was never re-evaluated, and it was a second driver of the
//     array.
module gpr(a,b,clock,reg_write,num_write,rs,rt,data_write);
    output [31:0] a;
    output [31:0] b;
    input         clock;
    input         reg_write;
    input  [4:0]  rs;          // read register 1
    input  [4:0]  rt;          // read register 2
    input  [4:0]  num_write;   // write register
    input  [31:0] data_write;  // write data

    reg [31:0] gp_registers[31:0]; // 32 registers

    // Keep the storage word for register 0 at a defined value.
    initial gp_registers[0] = 32'b0;

    // A write is effective only when enabled and not aimed at register 0.
    wire write_active = reg_write && (num_write != 5'd0);

    assign a = (rs == 5'd0)                      ? 32'b0 :
               (write_active && num_write == rs) ? data_write :
                                                   gp_registers[rs];

    assign b = (rt == 5'd0)                      ? 32'b0 :
               (write_active && num_write == rt) ? data_write :
                                                   gp_registers[rt];

    always @(posedge clock) begin
        if (write_active)
            gp_registers[num_write] <= data_write;
    end
endmodule

宏定义模块

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49



// Shared encodings used by the ALU, control unit and multiplexers.
// The include guard keeps the macros from being redefined when this file is
// included by more than one source file in the same compilation.
`ifndef HEADER_V
`define HEADER_V

// ALU operation codes
`define ADDU 4'b0001
`define SUB 4'b0011
`define ADD 4'b0000
`define AND 4'b0100
`define OR 4'b0101
`define SLT 4'b1010
`define LUI 4'b1011
`define EQB 4'b1100
// R-type opcode
`define op_R 6'b000000

// R-type funct field
`define F_ADDU 6'b100001
`define F_SUBU 6'b100011
`define F_ADD 6'b100000
`define F_AND 6'b100100
`define F_OR 6'b100101
`define F_SLT 6'b101010
`define F_JR 6'b001000

// I-type opcode field
`define op_addi 6'b001000
`define op_addiu 6'b001001
`define op_andi 6'b001100
`define op_ori 6'b001101
`define op_lui 6'b001111

// Memory-access opcode field
`define op_sw 6'b101011
`define op_lw 6'b100011

// Branch / jump opcodes
`define op_beq 6'b000100
`define op_j 6'b000010
`define op_jal 6'b000011


// mux4to1 select values
`define FIRST 2'b00
`define SECOND 2'b01
`define THIRD 2'b10
`define FOUR 2'b11

`endif // HEADER_V



id_exe 寄存器

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
// ID/EXE pipeline register.
//   n_*          : values produced in the ID stage.
//   a, b, ext_imm: registered operand values and extended immediate.
//   rd, rs, rt   : registered register numbers.
//   aluop, s_b, mem_write, s_data_write, regwrite, s_num_write :
//                  registered control signals.
//   id_exe_flash : when set, the operands and control signals are replaced
//                  by a bubble (all zero, aluop unknown) on the next edge;
//                  rd/rs/rt are still captured.
//   reset        : asynchronous, active low. Every output is cleared,
//                  including the control signals, so no write enable is left
//                  undefined when the design comes out of reset.
module id_exe(n_b,b,n_a,a,ext_imm,n_ext_imm,n_aluop,
              aluop,n_rd,rd,n_s_num_write,s_num_write,n_s_b,s_b,
              n_mem_write, mem_write, n_s_data_write, s_data_write,
              n_regwrite, regwrite,n_rs,n_rt,rs,rt,
              id_exe_flash,clock,reset);
    input  [31:0] n_b, n_a, n_ext_imm;
    input  [4:0]  n_rd, n_rs, n_rt;
    input  [3:0]  n_aluop;
    input         clock, reset, n_s_b, n_s_data_write, n_mem_write,
                  n_regwrite, n_s_num_write, id_exe_flash;
    output reg [31:0] a, b, ext_imm;
    output reg [4:0]  rd, rs, rt;
    output reg [3:0]  aluop;
    output reg        s_b, s_data_write, mem_write, regwrite, s_num_write;

    always @(posedge clock or negedge reset) begin
        if (reset == 0) begin
            a            <= 32'b0;
            b            <= 32'b0;
            ext_imm      <= 32'b0;
            rd           <= 5'b0;
            rs           <= 5'b0;
            rt           <= 5'b0;
            // Control signals are reset too; 4'b0000 is the ADD encoding in
            // header.v, which with zeroed operands produces 0.
            aluop        <= 4'b0000;
            s_b          <= 1'b0;
            mem_write    <= 1'b0;
            s_data_write <= 1'b0;
            regwrite     <= 1'b0;
            s_num_write  <= 1'b0;
        end else begin
            // Register numbers are captured whether or not a bubble is inserted.
            rd <= n_rd;
            rs <= n_rs;
            rt <= n_rt;
            if (id_exe_flash) begin
                // Bubble: no register write, no memory write.
                a            <= 32'b0;
                b            <= 32'b0;
                ext_imm      <= 32'b0;
                aluop        <= 4'bxxxx;
                s_b          <= 1'b0;
                mem_write    <= 1'b0;
                s_data_write <= 1'b0;
                regwrite     <= 1'b0;
                s_num_write  <= 1'b0;
            end else begin
                a            <= n_a;
                b            <= n_b;
                ext_imm      <= n_ext_imm;
                aluop        <= n_aluop;
                s_b          <= n_s_b;
                mem_write    <= n_mem_write;
                s_data_write <= n_s_data_write;
                regwrite     <= n_regwrite;
                s_num_write  <= n_s_num_write;
            end
        end
    end
endmodule

if_id寄存器

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
// IF/ID pipeline register holding the fetched instruction word.
//   reset       : asynchronous, active low; clears the instruction to 0.
//   if_id_write : when 0 the register keeps its value, otherwise it captures
//                 n_instruction on the rising clock edge.
module if_id(
    output reg [31:0] instruction,
    input             clock,
    input             reset,
    input      [31:0] n_instruction,
    input             if_id_write
);
    always @(posedge clock or negedge reset) begin
        if (!reset) begin
            instruction <= 32'h00000000;
        end else begin
            // Compare against 0 so that an unknown enable still captures the
            // new instruction, as the original code does.
            if (if_id_write == 1'b0)
                instruction <= instruction;
            else
                instruction <= n_instruction;
        end
    end
endmodule

im模块

1
2
3
4
5
6
7
8
9
10
// Instruction memory: 1024 words of 32 bits, read combinationally.
// Only pc bits [11:2] select the word. The array has no initialiser in this
// module, so it has to be loaded from outside.
module im(
    output [31:0] instruction,
    input  [31:0] pc
);
    reg [31:0] ins_memory[1023:0]; // 4K instruction memory

    wire [9:0] word_index = pc[11:2];

    assign instruction = ins_memory[word_index];
endmodule

mem_wb寄存器

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
// MEM/WB pipeline register.
//   n_*         : values produced in the MEM stage.
//   c, data_out : registered copies of n_c and n_data_out.
//   num_write, regwrite, s_data_write : registered destination register
//                 number and control bits.
//   reset       : asynchronous, active low. Every output is cleared,
//                 including regwrite and s_data_write, so the write enable
//                 driven from this register is not undefined after reset.
module mem_wb(n_c, c, n_data_out, data_out, n_s_data_write, s_data_write,
              n_num_write, num_write, n_regwrite, regwrite, clock, reset);
    input  [31:0] n_c, n_data_out;
    input  [4:0]  n_num_write;
    input         n_s_data_write, n_regwrite, clock, reset;
    output reg [31:0] c, data_out;
    output reg [4:0]  num_write;
    output reg        s_data_write, regwrite;

    always @(posedge clock or negedge reset) begin
        if (reset == 0) begin
            c            <= 32'b0;
            data_out     <= 32'b0;
            num_write    <= 5'b0;
            // Control bits must be reset as well: leaving them out keeps them
            // at X after reset and mixes reset and non-reset flops in one
            // process.
            s_data_write <= 1'b0;
            regwrite     <= 1'b0;
        end else begin
            c            <= n_c;
            data_out     <= n_data_out;
            num_write    <= n_num_write;
            s_data_write <= n_s_data_write;
            regwrite     <= n_regwrite;
        end
    end
endmodule

mux2模块

1
2
3
4
5
6
7
8
9
10
11
12
13
// Two-input multiplexer.
//   s_flag = 1 : out = data1
//   otherwise  : out = data2
// WIDTH sets the data width and defaults to 32, so existing 32-bit
// instances are unaffected; narrower instances can be written "mux2 #(5)".
module mux2(out,data1,data2,s_flag);
    parameter WIDTH = 32;

    output reg [WIDTH-1:0] out;
    input      [WIDTH-1:0] data1;
    input      [WIDTH-1:0] data2;
    input                  s_flag;

    // Purely combinational logic, so blocking assignments are used;
    // non-blocking assignments here only add a delta-cycle delay in
    // simulation and are flagged by lint tools.
    always @(*) begin
        if (s_flag)
            out = data1;
        else
            out = data2;
    end
endmodule

mux3模块

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
`include"header.v"
// Three-input multiplexer.
//   s_flag = 2'b00 : out = data1
//   s_flag = 2'b01 : out = data2
//   s_flag = 2'b10 : out = data3
//   any other value: out is unknown (all X)
// WIDTH sets the data width and defaults to 32, so existing 32-bit
// instances are unaffected.
module mux3 (out,data1,data2,data3,s_flag);
    parameter WIDTH = 32;

    output reg [WIDTH-1:0] out;
    input      [WIDTH-1:0] data1;
    input      [WIDTH-1:0] data2;
    input      [WIDTH-1:0] data3;
    input      [1:0]       s_flag;

    // Purely combinational logic, so blocking assignments are used;
    // non-blocking assignments here only add a delta-cycle delay in
    // simulation and are flagged by lint tools.
    always @(*) begin
        case (s_flag)
            2'b00:   out = data1;
            2'b01:   out = data2;
            2'b10:   out = data3;
            default: out = {WIDTH{1'bx}};
        endcase
    end
endmodule

顶层模块

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
// Top level of the pipelined CPU. It instantiates and wires together the
// program counter, instruction memory, the IF/ID, ID/EXE, EXE/MEM and MEM/WB
// pipeline registers, the control decoder, immediate extender, register
// file, stall and bypass units, the ALU with its operand multiplexers, and
// the data memory.
//   clock : clock for every sequential element.
//   reset : passed to the PC and all four pipeline registers.
// Naming: a signal with an n/n1/n2/n3 prefix is the same quantity at an
// earlier pipeline position than the un-prefixed one.
// NOTE(review): the `stall` module instantiated below is not defined in the
// visible part of this file; its behaviour is assumed from its port names
// only — confirm against its source.
module pipeline_cpu(clock,reset);
input clock,reset;
wire [31:0] pc;
wire [31:0] npc;
wire [4:0] num_write,n1_num_write,n2_num_write,n3_num_write;
wire [31:0] data_write;
wire [31:0] instruction,n_instruction;
// Control signals
wire [3:0] aluop,n_aluop;
wire extop;// selects the extension mode (1 = sign-extend in ext, else zero-extend)
wire regwrite, n1_regwrite,n2_regwrite,n3_regwrite;
wire s_b, n_s_b;
wire mem_write, n1_mem_write, n2_mem_write;
wire s_num_write,n_s_num_write;
wire s_data_write, n1_s_data_write, n2_s_data_write, n3_s_data_write;
wire [15:0] imm;
wire [31:0] ext_imm,n_ext_imm;
wire [4:0] rs,rt,rd,n_rs,n_rt,n_rd;
wire [31:0] a, n_a,f_a, b,f_b,n1_b,n2_b, n1_c, n2_c, c, bb, n_data_out, data_out;
wire [1:0] forwardA,forwardB;
// The next PC is always pc + 4; no other source feeds npc in this module.
assign npc = pc + 4;
wire pc_write,if_id_write,id_exe_flash;

// Part 1: PC, instruction memory and the IF/ID register.
pc PC(.pc(pc),.clock(clock),.reset(reset),.npc(npc),.pc_write(pc_write));
im IM(.instruction(n_instruction),.pc(pc));
if_id IF_ID(.instruction(instruction),.clock(clock),.reset(reset),.n_instruction(n_instruction),.if_id_write(if_id_write));
// Part 2: instruction fields, control, immediate extension, register file
// and the ID/EXE register.
assign n_rs = instruction[25:21];
assign n_rt = instruction[20:16];
assign n_rd = instruction[15:11];
assign imm = instruction[15:0];

ctrl CTRL(.op(instruction[31:26]),.funct(instruction[5:0]),.reg_write(n1_regwrite),.aluop(n_aluop),
.s_ext(extop),.mem_write(n1_mem_write),.s_data_write(n1_s_data_write),.s_b(n_s_b),.s_num_write(n_s_num_write));

// mux2 #(5) MUX_NUM_WRITE(.out(n1_num_write),.data1(rd),.data2(rt),.s_flag(s_num_write));

ext EXT(.immediate(imm),.extop(extop),.extimmediate(n_ext_imm));
// The register file is written from the MEM/WB outputs (regwrite, num_write,
// data_write) and read with the register numbers of the instruction in IF/ID.
gpr GPR(.a(n_a),.b(n1_b),.clock(clock),.reg_write(regwrite),.num_write(num_write),.rs(n_rs),.rt(n_rt),
.data_write(data_write));
id_exe ID_EXE(.n_b(n1_b),.b(n2_b),.n_a(n_a),.a(a),.ext_imm(ext_imm),.n_ext_imm(n_ext_imm),.n_aluop(n_aluop),
.aluop(aluop),.n_s_b(n_s_b),.s_b(s_b),.n_rd(n_rd),.rd(rd),.n_s_num_write(n_s_num_write),
.s_num_write(s_num_write),.n_mem_write(n1_mem_write),.mem_write(n2_mem_write),
.n_s_data_write(n1_s_data_write),.s_data_write(n2_s_data_write),.n_regwrite(n1_regwrite),
.regwrite(n2_regwrite), .n_rs(n_rs),.n_rt(n_rt),.rs(rs),.rt(rt),.id_exe_flash(id_exe_flash),
.clock(clock),.reset(reset));

// Destination register number: rt when s_num_write is 1, otherwise rd
// (both taken from the ID/EXE outputs).
assign n2_num_write = s_num_write ? rt : rd;
// Stall unit: drives pc_write, if_id_write and id_exe_flash from the ID/EXE
// s_data_write bit and rt, and the IF/ID rs/rt fields.
stall STALL(.pc_write(pc_write),.if_id_write(if_id_write),.id_exe_flash(id_exe_flash),.mem_read(n2_s_data_write),
.rt_f(rt),.rt(n_rt),.rs(n_rs));
// Bypass unit: source 1 is the EXE/MEM output (n3_*), source 2 is the
// MEM/WB output; both are compared with the ID/EXE rs/rt.
bypass BYPASS(.forwardA(forwardA),.forwardB(forwardB),.rs(rs),.rt(rt),.num_write_1(n3_num_write),
.num_write_2(num_write),.regwrite_1(n3_regwrite),.regwrite_2(regwrite));
// Part 3: operand forwarding multiplexers, ALU and the EXE/MEM register.
// MUX_A / MUX_B: 00 = ID/EXE value, 01 = data_write, 10 = n2_c (EXE/MEM result).
mux3 MUX_A(.out(f_a),.data1(a),.data2(data_write),.data3(n2_c),.s_flag(forwardA));
mux3 MUX_B(.out(f_b),.data1(n2_b),.data2(data_write),.data3(n2_c),.s_flag(forwardB));
// Second ALU operand: ext_imm when s_b is 1, otherwise the forwarded f_b.
mux2 MUX_BB(.out(bb),.data1(ext_imm),.data2(f_b),.s_flag(s_b));
alu ALU(.c(n1_c),.a(f_a),.b(bb),.aluop(aluop));
exe_mem EXE_MEM(.n_c(n1_c), .n_b(f_b),.n_num_write(n2_num_write), .n_mem_write(n2_mem_write), .n_s_data_write(n2_s_data_write),
.c(n2_c), .b(b), .num_write(n3_num_write), .mem_write(mem_write), .s_data_write(n3_s_data_write),.n_regwrite(n2_regwrite),
.regwrite(n3_regwrite), .clock(clock),.reset(reset));

// Part 4: data memory, the MEM/WB register and the write-back multiplexer.
// data_in is declared but not connected to anything in this module.
wire [31:0] data_in;

// The data memory is addressed by the EXE/MEM result n2_c and written with b.
dm DM(.data_out(n_data_out),.clock(clock),.mem_write(mem_write),.address(n2_c),.data_in(b));
mem_wb MEM_WB(.n_c(n2_c), .c(c), .n_data_out(n_data_out), .data_out(data_out), .n_s_data_write(n3_s_data_write), .s_data_write(s_data_write),
.n_num_write(n3_num_write),.num_write(num_write),.n_regwrite(n3_regwrite),.regwrite(regwrite),.clock(clock),.reset(reset));

// Write-back value: the memory data when s_data_write is 1, otherwise c.
mux2 MUX_DATA_WRITE(.out(data_write),.data1(data_out),.data2(c),.s_flag(s_data_write));



endmodule

测试代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
`timescale 10ns / 1ns
// Testbench for pipeline_cpu.
// Loads the program from "code.txt" into the instruction memory, presets
// register i to the value i and the first 32 data-memory words to 0, runs
// the CPU, then prints all 32 registers and the first 32 data-memory words.
//
// Timing: the clock rises at t = 5, 15, ..., 105. Reset is released at t = 2
// and the dump happens at t = 107, so neither coincides with a rising clock
// edge. Releasing reset at t = 5 and dumping at t = 105 (as before) raced
// with the clock and made the number of executed cycles depend on the
// simulator's event ordering. With this timing all eleven rising edges from
// t = 5 through t = 105 execute before the dump.
module pipeline_cpu_test;

    reg CLOCK, RESET;

    pipeline_cpu PIPELINE_CPU(CLOCK, RESET);

    // Free-running clock, period 10 time units.
    initial begin
        CLOCK = 0;
        forever #5 CLOCK = ~CLOCK;
    end

    // Preload memories and registers at time 0 (own loop variable, so this
    // block shares no state with the reporting block below).
    integer init_idx;
    initial begin
        $readmemh("code.txt", PIPELINE_CPU.IM.ins_memory);
        for (init_idx = 0; init_idx < 32; init_idx = init_idx + 1)
            PIPELINE_CPU.GPR.gp_registers[init_idx] = init_idx;
        for (init_idx = 0; init_idx < 32; init_idx = init_idx + 1)
            PIPELINE_CPU.DM.data_memory[init_idx] = 0;
    end

    // Reset sequence, run, and state dump.
    integer i;
    initial begin
        RESET = 0;
        #2 RESET = 1;
        #105;
        for (i = 0; i < 32; i = i + 1)
            $display("gp_registers[%2d] = %h", i,PIPELINE_CPU.GPR.gp_registers[i]);
        for (i = 0; i < 32; i = i + 1)
            $display("data_memory[%2d] = %8h",i,PIPELINE_CPU.DM.data_memory[i]);

        $stop;
    end

endmodule