CoreMark移植

功能验证和CSR都做完了,该跑个分了。

CoreMark 是由 EEMBC 推出的一个处理器基准测试(benchmark),主要用来衡量嵌入式处理器、MCU 和 CPU 内核的整数运算性能。它的设计目标是尽量只测试“处理器核心本身”的能力,而不过多受操作系统、外设或平台差异影响,因此常被用来做不同芯片之间的横向比较。

源代码获取

源代码可以在 GitHub 上的 eembc/coremark 仓库获取。

目录结构

CoreMark的库底下有很多文件。我们只需要关心barebones文件夹下的内容即可,这是适用于裸机的移植方案。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
├── barebones             # barebones 移植
│ │ # 这两个文件需要修改
│ ├── core_portme.c # coremark 的平台移植层实现模板
│ ├── core_portme.h # 核心可移植层头文件
│ │
│ ├── core_portme.mak # Makefile
│ ├── cvt.c # 用于浮点数到字符串转换的实现 可不需要
│ └── ee_printf.c # printf 格式化实现

├── barebones_porting.md # barebones移植说明

├── core_list_join.c # 以下六个为 coremark 核心代码
├── core_main.c # 原则上不需要修改
├── core_matrix.c # 修改也禁止修改算法相关
├── core_state.c # 需保证计算后的校验值一致
├── core_util.c #
├── coremark.h #

├── Makefile # Makefile
└── README.md # 移植说明

soft_div.c

我们的 CPU 只实现了 Zmmul 扩展(M 扩展中的乘法子集),即只有硬件乘法、没有硬件除法。CoreMark 的运算需要用到除法,编译器遇到 / 和 % 时会生成对 __udivsi3、__divsi3 等函数的调用,因此我们要补上软件除法和取模的实现:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
/*
* 软件除法和取模实现
* 适用于只有硬件乘法的 RISC-V CPU
*/

/*
 * Unsigned 32-bit division using shift-and-subtract long division.
 *
 * Walks the bit positions from 31 down to 0; whenever the divisor shifted
 * to that position fits into what is left of the dividend, it is subtracted
 * and the matching quotient bit is set.
 *
 * Returns dividend / divisor, or 0 when divisor is 0.
 */
unsigned int __udivsi3(unsigned int dividend, unsigned int divisor)
{
    unsigned int q = 0;
    unsigned int rest = dividend;

    if (divisor == 0)
    {
        return 0; /* division-by-zero guard */
    }

    for (int shift = 31; shift >= 0; --shift)
    {
        /* Skip positions where divisor << shift would lose high bits. */
        if (divisor > (0xFFFFFFFFU >> shift))
        {
            continue;
        }

        unsigned int candidate = divisor << shift;
        if (candidate <= rest)
        {
            rest -= candidate;
            q |= 1U << shift;
        }
    }

    return q;
}

/*
 * Unsigned 32-bit remainder using shift-and-subtract long division.
 *
 * Same walk as the division routine, but only the running remainder is
 * kept; no quotient is accumulated.
 *
 * Returns dividend % divisor, or dividend unchanged when divisor is 0.
 */
unsigned int __umodsi3(unsigned int dividend, unsigned int divisor)
{
    unsigned int rest = dividend;

    if (divisor == 0)
    {
        return dividend; /* division-by-zero guard */
    }

    for (int shift = 31; shift >= 0; --shift)
    {
        /* Skip positions where divisor << shift would lose high bits. */
        if (divisor > (0xFFFFFFFFU >> shift))
        {
            continue;
        }

        unsigned int candidate = divisor << shift;
        if (candidate <= rest)
        {
            rest -= candidate;
        }
    }

    return rest;
}

/*
 * Signed 32-bit division built on top of __udivsi3.
 *
 * The magnitudes of both operands are computed in unsigned arithmetic, so
 * the most negative int is handled without signed-overflow undefined
 * behavior. The quotient is negated (again in unsigned arithmetic) when
 * exactly one operand is negative, which gives truncation toward zero.
 *
 * Returns dividend / divisor. A zero divisor yields 0, because __udivsi3
 * returns 0 in that case. The final unsigned-to-int conversion relies on the
 * usual two's-complement wrap (INT_MIN / -1 therefore yields INT_MIN).
 */
int __divsi3(int dividend, int divisor)
{
    unsigned int num = (unsigned int)dividend;
    unsigned int den = (unsigned int)divisor;
    int negative = 0;
    unsigned int result;

    /* Strip the signs, remembering whether the result must be negative. */
    if (dividend < 0)
    {
        num = 0U - num;
        negative = !negative;
    }
    if (divisor < 0)
    {
        den = 0U - den;
        negative = !negative;
    }

    result = __udivsi3(num, den);

    if (negative)
    {
        result = 0U - result;
    }
    return (int)result;
}

/*
 * Signed 32-bit remainder built on top of __umodsi3.
 *
 * Operand magnitudes are computed in unsigned arithmetic so the most
 * negative int does not trigger signed-overflow undefined behavior. The
 * result takes the sign of the dividend.
 *
 * Returns dividend % divisor. A zero divisor yields the dividend itself,
 * because __umodsi3 returns its dividend unchanged in that case.
 */
int __modsi3(int dividend, int divisor)
{
    unsigned int num = (unsigned int)dividend;
    unsigned int den = (unsigned int)divisor;
    unsigned int result;

    if (dividend < 0)
    {
        num = 0U - num;
    }
    if (divisor < 0)
    {
        den = 0U - den;
    }

    result = __umodsi3(num, den);

    /* The remainder carries the dividend's sign. */
    if (dividend < 0)
    {
        result = 0U - result;
    }
    return (int)result;
}

/*
 * Unsigned 64-bit division using shift-and-subtract long division.
 *
 * Walks the bit positions from 63 down to 0; whenever the divisor shifted
 * to that position fits into what is left of the dividend, it is subtracted
 * and the matching quotient bit is set.
 *
 * Returns dividend / divisor, or 0 when divisor is 0.
 */
unsigned long long __udivdi3(unsigned long long dividend, unsigned long long divisor)
{
    unsigned long long q = 0;
    unsigned long long rest = dividend;

    if (divisor == 0)
    {
        return 0; /* division-by-zero guard */
    }

    for (int shift = 63; shift >= 0; --shift)
    {
        /* Skip positions where divisor << shift would lose high bits. */
        if (divisor > (0xFFFFFFFFFFFFFFFFULL >> shift))
        {
            continue;
        }

        unsigned long long candidate = divisor << shift;
        if (candidate <= rest)
        {
            rest -= candidate;
            q |= 1ULL << shift;
        }
    }

    return q;
}

/*
 * Unsigned 64-bit remainder using shift-and-subtract long division.
 *
 * Same walk as the 64-bit division routine, but only the running remainder
 * is kept.
 *
 * Returns dividend % divisor, or dividend unchanged when divisor is 0.
 */
unsigned long long __umoddi3(unsigned long long dividend, unsigned long long divisor)
{
    unsigned long long rest = dividend;

    if (divisor == 0)
    {
        return dividend; /* division-by-zero guard */
    }

    for (int shift = 63; shift >= 0; --shift)
    {
        /* Skip positions where divisor << shift would lose high bits. */
        if (divisor > (0xFFFFFFFFFFFFFFFFULL >> shift))
        {
            continue;
        }

        unsigned long long candidate = divisor << shift;
        if (candidate <= rest)
        {
            rest -= candidate;
        }
    }

    return rest;
}

/*
 * Signed 64-bit division built on top of __udivdi3.
 *
 * Operand magnitudes are computed in unsigned arithmetic so the most
 * negative long long does not trigger signed-overflow undefined behavior.
 * The quotient is negated (in unsigned arithmetic) when exactly one operand
 * is negative, which gives truncation toward zero.
 *
 * Returns dividend / divisor. A zero divisor yields 0, because __udivdi3
 * returns 0 in that case. The final unsigned-to-signed conversion relies on
 * the usual two's-complement wrap.
 */
long long __divdi3(long long dividend, long long divisor)
{
    unsigned long long num = (unsigned long long)dividend;
    unsigned long long den = (unsigned long long)divisor;
    int negative = 0;
    unsigned long long result;

    /* Strip the signs, remembering whether the result must be negative. */
    if (dividend < 0)
    {
        num = 0ULL - num;
        negative = !negative;
    }
    if (divisor < 0)
    {
        den = 0ULL - den;
        negative = !negative;
    }

    result = __udivdi3(num, den);

    if (negative)
    {
        result = 0ULL - result;
    }
    return (long long)result;
}

/*
 * Signed 64-bit remainder built on top of __umoddi3.
 *
 * Operand magnitudes are computed in unsigned arithmetic so the most
 * negative long long does not trigger signed-overflow undefined behavior.
 * The result takes the sign of the dividend.
 *
 * Returns dividend % divisor. A zero divisor yields the dividend itself,
 * because __umoddi3 returns its dividend unchanged in that case.
 */
long long __moddi3(long long dividend, long long divisor)
{
    unsigned long long num = (unsigned long long)dividend;
    unsigned long long den = (unsigned long long)divisor;
    unsigned long long result;

    if (dividend < 0)
    {
        num = 0ULL - num;
    }
    if (divisor < 0)
    {
        den = 0ULL - den;
    }

    result = __umoddi3(num, den);

    /* The remainder carries the dividend's sign. */
    if (dividend < 0)
    {
        result = 0ULL - result;
    }
    return (long long)result;
}

core_portme.h

这个文件是核心可移植层的头文件,定义了平台能力宏、基本数据类型别名、定时/计时类型等关键参数。

我们要在最开始加上#include <stddef.h>

此外,因为我们是RV32I,不支持浮点数,要将HAS_FLOAT设置为0;同理,我们不使用time.h库,全靠裸机的寄存器读取来判断当前时间,因此设置HAS_TIME_H为0。

为了方便后续操作,再新定义一个数据类型ee_u64,和long long一致。

最后,我们是裸机直接运行,用iverilog进行仿真,因此无法传入参数,需要将MAIN_HAS_NOARGC设置为1。

core_portme.c

这个文件是 coremark 的平台移植层实现模板,需要修改定时接口等基本框架。

别忘了上面的软件除法:

1
2
3
4
5
6
/* Prototypes of the 32-bit unsigned software division helpers. */
unsigned int __udivsi3(unsigned int dividend, unsigned int divisor);
unsigned int __umodsi3(unsigned int dividend, unsigned int divisor);
/* Prototypes of the 64-bit unsigned software division helpers. */
unsigned long long __udivdi3(unsigned long long dividend, unsigned long long divisor);
unsigned long long __umoddi3(unsigned long long dividend, unsigned long long divisor);

然后,根据我们的周期计数器 mcycle 和 mcycleh,自行定义获取时间计数的函数,并将其绑定到 barebones_clock 内:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
/* Read mcycle CSR (cycle counter) - returns full 64 bits
* Uses atomic read sequence to handle overflow between mcycleh and mcycle reads.
* mcycle: 0xB00 (low 32 bits)
* mcycleh: 0xB80 (high 32 bits)
*/
static inline ee_u64 read_mcycle64(void)
{
ee_u32 hi1, lo, hi2;
/* Atomic read sequence: read high, low, high again.
* If high changed, low overflowed, so retry. */
do
{
__asm__ volatile("csrr %0, mcycleh" : "=r"(hi1));
__asm__ volatile("csrr %0, mcycle" : "=r"(lo));
__asm__ volatile("csrr %0, mcycleh" : "=r"(hi2));
} while (hi1 != hi2);
return ((ee_u64)hi1 << 32) | (ee_u64)lo;
}

/* Clock hook of the CoreMark port layer: returns whatever read_mcycle64() reports. */
CORETIMETYPE // ee_u64 is used as the time type because the two counter registers are combined into 64 bits
barebones_clock()
{
return read_mcycle64();
}

接着,设置秒数计算的函数time_in_secs

1
2
3
4
5
6
7
8
9
/*
 * Convert a tick count into whole seconds.
 *
 * The division is carried out in 64 bits against EE_TICKS_PER_SEC and the
 * quotient is then narrowed to secs_ret (described as ee_u32 by the original
 * comment; the typedef itself is not visible here).
 */
secs_ret
time_in_secs(CORE_TICKS ticks)
{
    return (secs_ret)(ee_u64)(ticks / (ee_u64)EE_TICKS_PER_SEC);
}

ee_printf.c

我们需要使用iverilog进行运行,并输出。怎么做呢?我们可以划分一个独特的内存地址,并进行监控,当该内存地址写入值之后,将值打印出来。这就是WRITE_TOHOST的监控方式。

在开头设定:#define TOHOST (*(volatile ee_u32*)0x0D000720)

然后内部的HAS_FLOAT相关内容都可以删去。

之后,我们重新定义字符发送函数,和ee_printf函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#include <stdint.h>

/*
 * Emit one character by storing it to the TOHOST location.
 * The character is first reduced to its 8-bit unsigned value and then
 * widened to 32 bits, so only the low byte of the stored word is non-zero.
 */
static inline void putch(char c)
{
    uint8_t byte = (uint8_t)c;
    uint32_t word = (uint32_t)byte;

    TOHOST = word;
}

/*
 * Print v as exactly `width` lowercase hexadecimal digits via putch().
 *
 * width <= 0 selects the default of 8 digits. If width is smaller than the
 * number of significant digits, only the low `width` digits are printed.
 * Digit positions above the 8th are printed as '0': shifting a 32-bit value
 * by 32 or more bits is undefined behavior, so those positions are never
 * computed with a shift.
 */
static void print_hex32(uint32_t v, int width)
{
    static const char hex[] = "0123456789abcdef";

    if (width <= 0)
    {
        width = 8;
    }

    for (int nibble = width - 1; nibble >= 0; nibble--)
    {
        uint32_t digit = 0;

        if (nibble < 8)
        {
            digit = (v >> (nibble * 4)) & 0xFU;
        }
        putch(hex[digit]);
    }
}

/*
 * Print x in unsigned decimal via putch().
 * Digits are produced least-significant first into a small buffer and then
 * emitted in reverse; the value 0 prints as a single '0'.
 */
static void print_udec32(uint32_t x)
{
    char digits[16];
    int count = 0;

    do
    {
        digits[count++] = '0' + (x % 10);
        x /= 10;
    } while (x != 0);

    while (count > 0)
    {
        putch(digits[--count]);
    }
}

/*
 * Print v in signed decimal via putch()/print_udec32().
 * The magnitude is computed in unsigned arithmetic so that INT32_MIN is
 * printed correctly instead of overflowing on negation.
 */
static void print_dec32(int32_t v)
{
    uint32_t magnitude = (uint32_t)v;

    if (v < 0)
    {
        putch('-');
        magnitude = (uint32_t)(0U - magnitude);
    }
    print_udec32(magnitude);
}

/*
 * Minimal printf replacement that emits characters through putch().
 *
 * Supported conversions: %c, %s, %x, %u, %d and %%, each optionally
 * preceded by a decimal width and a single 'l' length modifier.
 *   - The width is honored only by %x (digit count, zero-filled); a missing
 *     or zero width prints 8 hex digits. %u and %d ignore the width.
 *   - A leading '0' flag needs no separate handling: it is consumed as part
 *     of the width digits and contributes nothing to the value.
 *   - 'l' arguments are fetched as long / unsigned long and narrowed to
 *     32 bits.
 *   - A NULL %s argument prints "(null)".
 *   - Unknown conversions are echoed as '%', optional 'l', and the character.
 *   - If the format string ends in the middle of a conversion, the pending
 *     '%' (and 'l') is echoed and parsing stops, so the terminating NUL is
 *     never stepped over.
 *
 * Always returns 0 (not the number of characters written).
 */
int ee_printf(const char* fmt, ...)
{
    va_list ap;
    va_start(ap, fmt);

    while (*fmt)
    {
        if (*fmt != '%')
        {
            putch(*fmt++);
            continue;
        }

        fmt++; // skip '%'

        // "%%"
        if (*fmt == '%')
        {
            putch('%');
            fmt++;
            continue;
        }

        // Optional width; a leading '0' flag is absorbed here as well.
        int width = 0;
        while (*fmt >= '0' && *fmt <= '9')
        {
            width = width * 10 + (*fmt - '0');
            fmt++;
        }

        // Optional length modifier: only 'l' is supported.
        int is_long = 0;
        if (*fmt == 'l')
        {
            is_long = 1;
            fmt++;
        }

        char f = *fmt;
        if (f == '\0')
        {
            // Truncated conversion at end of string: echo it and stop
            // without advancing past the terminator.
            putch('%');
            if (is_long)
                putch('l');
            break;
        }
        fmt++;

        switch (f)
        {
            case 'c':
            {
                char c = (char)va_arg(ap, int);
                putch(c);
                break;
            }
            case 's':
            {
                const char* s = va_arg(ap, const char*);
                if (!s)
                    s = "(null)";
                while (*s) putch(*s++);
                break;
            }
            case 'x':
            {
                uint32_t v;
                if (is_long)
                    v = (uint32_t)va_arg(ap, unsigned long); // RV32: 32-bit
                else
                    v = (uint32_t)va_arg(ap, unsigned int);

                if (width == 0)
                    width = 8;
                print_hex32(v, width);
                break;
            }
            case 'u':
            {
                uint32_t v;
                if (is_long)
                    v = (uint32_t)va_arg(ap, unsigned long);
                else
                    v = (uint32_t)va_arg(ap, unsigned int);

                print_udec32(v);
                break;
            }
            case 'd':
            {
                int32_t v;
                if (is_long)
                    v = (int32_t)va_arg(ap, long);
                else
                    v = (int32_t)va_arg(ap, int);

                print_dec32(v);
                break;
            }
            default:
                // unknown specifier
                putch('%');
                if (is_long)
                    putch('l');
                putch(f);
                break;
        }
    }

    va_end(ap);
    return 0;
}

至此,输出部分修改完毕。

core_main.c

这里就是关键的主入口了。我们需要修改一部分函数。

在开头加上自定义的运行完毕返回函数:

1
2
3
4
5
6
7
8
#define TOHOST (*(volatile ee_u32*)0x0D000720)
/*
 * Report the final status code by storing it to TOHOST, then spin forever
 * so that execution never continues past this point.
 * code = 1 means PASS.
 */
static void exit_sim(ee_u32 code)
{
    TOHOST = code;
    for (;;)
    {
        /* park here */
    }
}

随后继续加上read_mcycle64

1
2
3
4
5
6
7
8
9
10
11
12
13
static inline ee_u64 read_mcycle64(void)
{
ee_u32 hi1, lo, hi2;
/* Atomic read sequence: read high, low, high again.
* If high changed, low overflowed, so retry. */
do
{
__asm__ volatile("csrr %0, mcycleh" : "=r"(hi1));
__asm__ volatile("csrr %0, mcycle" : "=r"(lo));
__asm__ volatile("csrr %0, mcycleh" : "=r"(hi2));
} while (hi1 != hi2);
return ((ee_u64)hi1 << 32) | (ee_u64)lo;
}

关键的计算校验值部分在282行开始的地方:

1
2
3
4
5
6
7
stop_time();
total_time = get_time();
/* get a function of the input to report */
seedcrc = crc16(results[0].seed1, seedcrc);
seedcrc = crc16(results[0].seed2, seedcrc);
seedcrc = crc16(results[0].seed3, seedcrc);
seedcrc = crc16(results[0].size, seedcrc);

这里,如果不在每个crc更新之间插入一个ee_printf,会导致最终的校验值不正常,非常奇怪。原因尚不明确。

1
2
3
4
5
6
7
seedcrc    = crc16(results[0].seed1, seedcrc);
ee_printf(""); // 手动插入空的 ee_printf
seedcrc = crc16(results[0].seed2, seedcrc);
ee_printf("");
seedcrc = crc16(results[0].seed3, seedcrc);
ee_printf("");
seedcrc = crc16(results[0].size, seedcrc);

因为我们是iverilog仿真,不需要跑很多轮,因此可以将367行的total_errors++注释掉。

最后,需要在415行开始的程序运行结束部分加入自己的运行完毕返回函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
if (total_errors > 0)
{
ee_printf("Errors detected\n");
exit_sim(2);
}
if (total_errors < 0)
{
ee_printf(
"Cannot validate operation for these seed values, please compare "
"with results on a known platform.\n");
exit_sim(2);
}

#if (MEM_METHOD == MEM_MALLOC)
for (i = 0; i < MULTITHREAD; i++)
portable_free(results[i].memblock[0]);
#endif
/* And last call any target specific code for finalizing */
portable_fini(&(results[0].port));
exit_sim(1);
return MAIN_RETURN_VAL;

至此,CoreMark库修改完毕。

编译与链接相关

start.S

编译时需要一个启动文件 start.S,它提供链接脚本 link.ld 中 ENTRY(_start) 所指定的程序入口 _start。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
    # Startup code: loads the stack pointer, calls main, and spins forever if main returns.
    .section .text.init
.globl _start
_start:
# Point sp at the _stack_top symbol.
la sp, _stack_top

# Clear BSS (skipped for now unless __bss_start/__bss_end are defined)
# la t0, __bss_start
# la t1, __bss_end
# ...

# call print_test
call main
# If main ever returns, loop here forever.
1:
j 1b

链接脚本link.ld用于告诉链接器最终生成的程序该怎么排布到内存里。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
/* Linker script: places every section back to back starting at address 0, with _start as the entry symbol. */
OUTPUT_ARCH("riscv")
ENTRY(_start)

SECTIONS
{
/* The CPU starts fetching instructions from address 0 after reset */
. = 0x00000000;

/* Startup entry code, placed first */
.text.init : {
*(.text.init)
}

/* Do not add ALIGN(0x1000) here, otherwise .text would be pushed out to 0x1000 */
.text : {
*(.text .text.*)
*(.rodata .rodata.*)
}

.data : {
*(.data .data.*)
}

.bss : {
*(.bss .bss.*)
*(COMMON)
}

/* _stack_top is a fixed address; _end marks the end of the placed sections */
_stack_top = 0x0000FF00;
_end = .;
}

/* Fail the link unless _end lies more than 0x1000 bytes below _stack_top */
ASSERT(_end < _stack_top - 0x1000, "ERROR: program too large / stack overlap!");

core_portme.mak

自己写一个:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# Copyright 2018 Embedded Microprocessor Benchmark Consortium (EEMBC)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Original Author: Shay Gal-on

#File : core_portme.mak

# Flag : OUTFLAG
#	Use this flag to define how to get an executable (e.g -o)
OUTFLAG= -o
# Flag : CC
#	Use this flag to define the compiler to use
# NOTE(review): plain "gcc" is the host compiler; a cross toolchain is
# presumably selected elsewhere (e.g. CC=... on the make command line) - confirm.
CC = gcc
# Flag : LD
#	Use this flag to define the linker to use
LD = gld
# Flag : AS
#	Use this flag to define the assembler to use
AS = gas
# Flag : CFLAGS
#	Use this flag to define compiler options. Note, you can add compiler options from the command line using XCFLAGS="other flags"
PORT_CFLAGS = -O0 -g
FLAGS_STR = "$(PORT_CFLAGS) $(XCFLAGS) $(XLFLAGS) $(LFLAGS_END)"
CFLAGS = $(PORT_CFLAGS) -I$(PORT_DIR) -I. -DFLAGS_STR=\"$(FLAGS_STR)\"
# Flag : LFLAGS_END
#	Define any libraries needed for linking or other flags that should come at the end of the link line (e.g. linker scripts).
#	Note : On certain platforms, the default clock_gettime implementation is supported but requires linking of librt.
# Flag : SEPARATE_COMPILE
#	You must also define below how to create an object file, and how to link.
SEPARATE_COMPILE=1
OBJOUT = -o
LFLAGS =
ASFLAGS =
OFLAG = -o
COUT = -c

LFLAGS_END =
# Flag : PORT_SRCS
#	Port specific source files can be added here
#	You may also need cvt.c if the fcvt functions are not provided as intrinsics by your compiler!
PORT_SRCS = $(PORT_DIR)/core_portme.c $(PORT_DIR)/ee_printf.c
vpath %.c $(PORT_DIR)
vpath %.s $(PORT_DIR)

# Flag : LOAD
#	Command used to load the executable onto the target; here only a placeholder message.

# Flag : RUN
#	Command used to run the executable on the target; here only a placeholder message.

LOAD = echo "Please set LOAD to the process of loading the executable to the flash"
RUN = echo "Please set RUN to the process of running the executable (e.g. via jtag, or board reset)"

OEXT = .o
EXE = .bin

# Pattern rules: compile C sources and assemble .s sources into objects.
# Recipe lines must start with a TAB character.
$(OPATH)$(PORT_DIR)/%$(OEXT) : %.c
	$(CC) $(CFLAGS) $(XCFLAGS) $(COUT) $< $(OBJOUT) $@

$(OPATH)%$(OEXT) : %.c
	$(CC) $(CFLAGS) $(XCFLAGS) $(COUT) $< $(OBJOUT) $@

$(OPATH)$(PORT_DIR)/%$(OEXT) : %.s
	$(AS) $(ASFLAGS) $< $(OBJOUT) $@

# Target : port_pre% and port_post%
# For the purpose of this simple port, no pre or post steps needed.

.PHONY : port_prebuild port_postbuild port_prerun port_postrun port_preload port_postload
port_pre% port_post% :

# FLAG : OPATH
# Path to the output folder. Default - current folder.
OPATH = ./
MKDIR = mkdir -p

顶层模块

我们需要写一个针对CoreMark测试的顶层模块。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
`timescale 1ns / 1ps

`include "../src/CPU_TOP.sv"
`define DEBUG

`define REG_FILE u_CPU_TOP.u_registerf
// verilog_format: off
// Simulation top level for the CoreMark run: instantiates the instruction ROM and the CPU,
// generates clock and reset, watches stores to the tohost address to print characters or
// end the simulation, reports exceptions, and stops the run after a fixed timeout.
module coremark;

// Clock and reset signals
logic clk;
logic rst_n;

// IROM signals
logic [13:0] irom_addr;
logic [31:0] irom_data;

// Instantiate the IROM (instruction memory)
IROM #(
.ADDR_WIDTH(14)
) u_IROM (
.a (irom_addr),
.spo(irom_data)
);

// Instantiate CPU_TOP
CPU_TOP u_CPU_TOP (
.clk (clk),
.rst_n(rst_n),
.instr(irom_data),
.pc (irom_addr)
);

// Register-file monitor signals
logic [31:0] x0, x1, x2, x3, x4, x5, x6, x7,
x8, x9, x10, x11, x12, x13, x14, x15,
x16, x17, x18, x19, x20, x21, x22, x23,
x24, x25, x26, x27, x28, x29, x30, x31;

// Mirror each register-file entry into a named signal
always_comb begin
x0 = `REG_FILE.rf_in[0];
x1 = `REG_FILE.rf_in[1];
x2 = `REG_FILE.rf_in[2];
x3 = `REG_FILE.rf_in[3];
x4 = `REG_FILE.rf_in[4];
x5 = `REG_FILE.rf_in[5];
x6 = `REG_FILE.rf_in[6];
x7 = `REG_FILE.rf_in[7];
x8 = `REG_FILE.rf_in[8];
x9 = `REG_FILE.rf_in[9];
x10 = `REG_FILE.rf_in[10];
x11 = `REG_FILE.rf_in[11];
x12 = `REG_FILE.rf_in[12];
x13 = `REG_FILE.rf_in[13];
x14 = `REG_FILE.rf_in[14];
x15 = `REG_FILE.rf_in[15];
x16 = `REG_FILE.rf_in[16];
x17 = `REG_FILE.rf_in[17];
x18 = `REG_FILE.rf_in[18];
x19 = `REG_FILE.rf_in[19];
x20 = `REG_FILE.rf_in[20];
x21 = `REG_FILE.rf_in[21];
x22 = `REG_FILE.rf_in[22];
x23 = `REG_FILE.rf_in[23];
x24 = `REG_FILE.rf_in[24];
x25 = `REG_FILE.rf_in[25];
x26 = `REG_FILE.rf_in[26];
x27 = `REG_FILE.rf_in[27];
x28 = `REG_FILE.rf_in[28];
x29 = `REG_FILE.rf_in[29];
x30 = `REG_FILE.rf_in[30];
x31 = `REG_FILE.rf_in[31];
end

// Clock generation (100 MHz, 10 ns period)
// verilog_format: on
initial begin
clk = 0;
forever #5 clk = ~clk;
end


localparam integer TOHOST_ADDR = 32'h0d000720; // NOTE: adjust to match your linker script!

// tohost monitor: acts when the MEM-stage write enable is set and the ALU result equals TOHOST_ADDR
always_ff @(posedge clk) begin
if (u_CPU_TOP.dram_we_MEM && u_CPU_TOP.alu_result_MEM == TOHOST_ADDR) begin
// Data being written to tohost
logic [31:0] tohost_data;
tohost_data = u_CPU_TOP.rf_rd2_MEM;

// Exit-code check: 1 ends the run as PASS, 2 ends it as FAIL
if (tohost_data == 32'd1) begin
$display("%10t| [PASS] | Finished ", $time);
$finish;
end else if (tohost_data == 32'd2) begin
$display("%10t| [FAIL] | Finished ", $time);
$finish;
end else begin
// Anything else is printed as an ordinary character (low byte only)
$write("%c", tohost_data[7:0]);
$fflush();
end
end
end

// Reset and test control
initial begin
// Waveform dump setup (enabled with +DUMPWAVE=1)
integer dumpwave;
if ($value$plusargs("DUMPWAVE=%d", dumpwave)) begin
if (dumpwave == 1) begin
`ifdef VCD_FILEPATH
$dumpfile({"../../", `VCD_FILEPATH});
`else
$dumpfile("wave.vcd");
`endif
$dumpvars;
end
end

// Initialize signals
rst_n = 0;
// Reset the CPU
#5; // NOTE(review): the original comment said "hold reset for 25ns", but this delay is 5 time units - confirm the intended reset length
rst_n = 1;
end

// Optional +TESTCASE=<name> plusarg; the value is only stored
string testcase;
initial begin
if ($value$plusargs("TESTCASE=%s", testcase)) begin
end
end

// Exception detection
always_ff @(posedge clk) begin
if (u_CPU_TOP.exception_valid) begin
$display("%10t| [EXCEPTION] PC=0x%08h, cause=%d, tval=0x%08h", $time,
u_CPU_TOP.exception_pc, u_CPU_TOP.exception_cause, u_CPU_TOP.exception_tval);
end
end

// Timeout guard
initial begin
#100000000; // NOTE(review): the original comment said "1ms timeout", but 100000000 time units is far longer than 1 ms at a 1ns time unit - confirm
$display("%10t| [EROR] | TimeOut! ", $time);
$finish;
end

// Extra clock with twice the period of clk, for easier waveform inspection
logic slow_clk;
int unsigned count;
initial slow_clk = 0;

// Toggle slow_clk and advance count on every rising edge of clk
always_ff @(posedge clk) begin
slow_clk <= ~slow_clk;
count <= count + 1;
end


endmodule

仿真测试

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
_end=0x00004404 stack_top=0x0000ff00 sp=0x0000fe70

2K performance run parameters for coremark.
CoreMark Size : 666
Total ticks : 4598265
Total time (secs): 229
Iterations/Sec : 0
Iterations : 10
Compiler version : GCC15.1.0
Compiler flags : -O2 -g -DPERFORMANCE_RUN=1
Memory location : STATIC
seedcrc : 0xe9f5
[0]crclist : 0xe714
[0]crcmatrix : 0x1fd7
[0]crcstate : 0x8e3a
[0]crcfinal : 0xfcaf
Correct operation validated. See README.md for run and reporting rules.
46305505000| [PASS] | Finished

和Linux主机运行结果一致。完美。