从零开始学RISC：第十四篇

CoreMark移植

功能验证和CSR都做完了，该跑个分了。

CoreMark 是由 EEMBC 推出的一个处理器基准测试（benchmark），主要用来衡量嵌入式处理器、MCU 和 CPU 内核的整数运算性能。它的设计目标是尽量只测试“处理器核心本身”的能力，而不过多受操作系统、外设或平台差异影响，因此很常被用来做不同芯片之间的横向比较。

源代码获取

源代码可以在eembc/coremark获取。

目录结构

CoreMark的库底下有很多文件。我们只需要关心barebones文件夹下的内容即可，这是适用于裸机的移植方案。

├── barebones             # barebones 移植
│  │                      # 这两个文件需要修改
│  ├── core_portme.c      # coremark 的平台移植层实现模板
│  ├── core_portme.h      # 核心可移植层头文件
│  │
│  ├── core_portme.mak    # Makefile
│  ├── cvt.c              # 用于浮点数到字符串转换的实现 可不需要
│  └── ee_printf.c        # printf 格式化实现
│
├── barebones_porting.md  # barebones移植说明
│
├── core_list_join.c      # 以下六个为 coremark 核心代码
├── core_main.c           # 原则上不需要修改
├── core_matrix.c         # 修改也禁止修改算法相关
├── core_state.c          # 需保证计算后的校验值一致
├── core_util.c           # 
├── coremark.h            # 
│
├── Makefile              # Makefile
└── README.md             # 移植说明

soft_div.c

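我们的 CPU 只实现了 Zmmul 扩展（M 扩展中的乘法子集），即只有硬件乘法指令，没有硬件除法和取模指令。CoreMark 的运算需要用到除法；在没有除法指令的目标上，编译器会把 `/` 和 `%` 编译成对 libgcc 例程（如 __udivsi3、__umodsi3、__udivdi3 等）的调用，因此我们要自己提供这些软件除法和取模实现：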

/*
 * 软件除法和取模实现
 * 适用于只有硬件乘法的 RISC-V CPU
 */

/*
 * Unsigned 32-bit division by shift-and-subtract (long division).
 *
 * Walks the 32 possible quotient bits from the most significant one down.
 * For each bit position the divisor is shifted left by that amount and, if
 * it still fits into what is left of the dividend, it is subtracted and the
 * corresponding quotient bit is set.
 *
 * Returns the quotient; a zero divisor yields 0 instead of trapping.
 */
unsigned int __udivsi3(unsigned int dividend, unsigned int divisor)
{
    unsigned int rest = dividend;
    unsigned int q    = 0;
    int shift         = 32;

    if (divisor == 0)
    {
        return 0;  // division-by-zero guard
    }

    while (shift-- > 0)
    {
        unsigned int candidate;

        // Skip positions where (divisor << shift) would lose high bits;
        // such a shifted value could never fit into the remainder anyway.
        if (divisor > (0xFFFFFFFFU >> shift))
        {
            continue;
        }

        candidate = divisor << shift;
        if (candidate <= rest)
        {
            rest -= candidate;
            q    |= 1U << shift;
        }
    }

    return q;
}

/*
 * Unsigned 32-bit remainder by shift-and-subtract.
 *
 * Same walk as the unsigned division routine, but only the running
 * remainder is kept. A zero divisor returns the dividend unchanged.
 */
unsigned int __umodsi3(unsigned int dividend, unsigned int divisor)
{
    unsigned int rest = dividend;
    int shift         = 32;

    if (divisor == 0)
    {
        return dividend;  // division-by-zero guard
    }

    while (shift-- > 0)
    {
        unsigned int candidate;

        // Skip positions where (divisor << shift) would lose high bits.
        if (divisor > (0xFFFFFFFFU >> shift))
        {
            continue;
        }

        candidate = divisor << shift;
        if (candidate <= rest)
        {
            rest -= candidate;
        }
    }

    return rest;
}

/*
 * Signed 32-bit division (truncating toward zero), built on __udivsi3.
 *
 * The operand magnitudes are formed in unsigned arithmetic: "0U - x" is
 * well defined for every bit pattern, whereas negating INT_MIN as a signed
 * int overflows and is undefined behaviour. The quotient is negated the same
 * way when exactly one operand is negative.
 *
 * Returns dividend / divisor. A zero divisor yields 0 (inherited from
 * __udivsi3). INT_MIN / -1 wraps back to INT_MIN.
 */
int __divsi3(int dividend, int divisor)
{
    unsigned int num = (unsigned int)dividend;
    unsigned int den = (unsigned int)divisor;
    unsigned int result;
    int negative = 0;

    // Strip the signs, remembering whether the quotient must be negative.
    if (dividend < 0)
    {
        num      = 0U - num;
        negative = !negative;
    }
    if (divisor < 0)
    {
        den      = 0U - den;
        negative = !negative;
    }

    result = __udivsi3(num, den);

    if (negative)
    {
        result = 0U - result;
    }

    return (int)result;
}

/*
 * Signed 32-bit remainder, built on __umodsi3.
 *
 * The result takes the sign of the dividend (C semantics). Magnitudes are
 * formed in unsigned arithmetic so that INT_MIN is handled without the
 * signed-overflow undefined behaviour of "-dividend" / "-divisor".
 *
 * A zero divisor returns the dividend unchanged (inherited from __umodsi3).
 */
int __modsi3(int dividend, int divisor)
{
    unsigned int num = (unsigned int)dividend;
    unsigned int den = (unsigned int)divisor;
    unsigned int result;

    if (dividend < 0)
    {
        num = 0U - num;
    }
    if (divisor < 0)
    {
        den = 0U - den;
    }

    result = __umodsi3(num, den);

    // Re-apply the dividend's sign to the magnitude of the remainder.
    if (dividend < 0)
    {
        result = 0U - result;
    }

    return (int)result;
}

/*
 * Unsigned 64-bit division by shift-and-subtract (long division).
 *
 * Walks the 64 possible quotient bits from the most significant one down,
 * subtracting the shifted divisor whenever it fits into the remainder.
 *
 * Returns the quotient; a zero divisor yields 0 instead of trapping.
 */
unsigned long long __udivdi3(unsigned long long dividend, unsigned long long divisor)
{
    unsigned long long rest = dividend;
    unsigned long long q    = 0;
    int shift               = 64;

    if (divisor == 0)
    {
        return 0;  // division-by-zero guard
    }

    while (shift-- > 0)
    {
        unsigned long long candidate;

        // Skip positions where (divisor << shift) would lose high bits.
        if (divisor > (0xFFFFFFFFFFFFFFFFULL >> shift))
        {
            continue;
        }

        candidate = divisor << shift;
        if (candidate <= rest)
        {
            rest -= candidate;
            q    |= 1ULL << shift;
        }
    }

    return q;
}

/*
 * Unsigned 64-bit remainder by shift-and-subtract.
 *
 * Same walk as the unsigned 64-bit division routine, keeping only the
 * running remainder. A zero divisor returns the dividend unchanged.
 */
unsigned long long __umoddi3(unsigned long long dividend, unsigned long long divisor)
{
    unsigned long long rest = dividend;
    int shift               = 64;

    if (divisor == 0)
    {
        return dividend;  // division-by-zero guard
    }

    while (shift-- > 0)
    {
        unsigned long long candidate;

        // Skip positions where (divisor << shift) would lose high bits.
        if (divisor > (0xFFFFFFFFFFFFFFFFULL >> shift))
        {
            continue;
        }

        candidate = divisor << shift;
        if (candidate <= rest)
        {
            rest -= candidate;
        }
    }

    return rest;
}

/*
 * Signed 64-bit division (truncating toward zero), built on __udivdi3.
 *
 * Magnitudes are formed in unsigned arithmetic ("0ULL - x"), which is well
 * defined for every bit pattern; negating LLONG_MIN as a signed value would
 * overflow and is undefined behaviour.
 *
 * Returns dividend / divisor. A zero divisor yields 0 (inherited from
 * __udivdi3). LLONG_MIN / -1 wraps back to LLONG_MIN.
 */
long long __divdi3(long long dividend, long long divisor)
{
    unsigned long long num = (unsigned long long)dividend;
    unsigned long long den = (unsigned long long)divisor;
    unsigned long long result;
    int negative = 0;

    // Strip the signs, remembering whether the quotient must be negative.
    if (dividend < 0)
    {
        num      = 0ULL - num;
        negative = !negative;
    }
    if (divisor < 0)
    {
        den      = 0ULL - den;
        negative = !negative;
    }

    result = __udivdi3(num, den);

    if (negative)
    {
        result = 0ULL - result;
    }

    return (long long)result;
}

/*
 * Signed 64-bit remainder, built on __umoddi3.
 *
 * The result takes the sign of the dividend (C semantics). Magnitudes are
 * formed in unsigned arithmetic so that LLONG_MIN is handled without the
 * signed-overflow undefined behaviour of "-dividend" / "-divisor".
 *
 * A zero divisor returns the dividend unchanged (inherited from __umoddi3).
 */
long long __moddi3(long long dividend, long long divisor)
{
    unsigned long long num = (unsigned long long)dividend;
    unsigned long long den = (unsigned long long)divisor;
    unsigned long long result;

    if (dividend < 0)
    {
        num = 0ULL - num;
    }
    if (divisor < 0)
    {
        den = 0ULL - den;
    }

    result = __umoddi3(num, den);

    // Re-apply the dividend's sign to the magnitude of the remainder.
    if (dividend < 0)
    {
        result = 0ULL - result;
    }

    return (long long)result;
}

core_portme.h

这个文件是核心可移植层的头文件，定义了平台能力宏、基本数据类型别名、定时/计时类型等关键参数。

我们要在最开始加上#include <stddef.h>。

此外，因为我们是RV32I，不支持浮点数，要将HAS_FLOAT设置为0；同理，我们不使用time.h库，全靠裸机的寄存器读取来判断当前时间，因此设置HAS_TIME_H为0。

为了方便后续操作，再新定义一个数据类型ee_u64，对应无符号 64 位整数，即 unsigned long long（与 ee_u32 等无符号类型的命名保持一致，后面 64 位除法调用的也是无符号版本 __udivdi3）。

最后，我们是裸机直接运行，用iverilog进行仿真，因此无法传入参数，需要将MAIN_HAS_NOARGC设置为1。

core_portme.c

这个文件是 coremark 的平台移植层实现模板，需要修改定时接口等基本框架。

别忘了上面的软件除法：

/* Prototypes of the software 32-bit unsigned divide / modulo routines
 * (quotient and remainder of the first argument by the second). */
extern unsigned int __udivsi3(unsigned int, unsigned int);
extern unsigned int __umodsi3(unsigned int, unsigned int);
/* Prototypes of the 64-bit unsigned divide / modulo routines. */
extern unsigned long long __udivdi3(unsigned long long, unsigned long long);
extern unsigned long long __umoddi3(unsigned long long, unsigned long long);

然后，根据我们的时间计数器MCYCLE和MCYCLEH，自行定义获取时间计数的函数，并将其绑定到barebones_clock内：

/* Read mcycle CSR (cycle counter) - returns full 64 bits
 * Uses atomic read sequence to handle overflow between mcycleh and mcycle reads.
 * mcycle:  0xB00 (low 32 bits)
 * mcycleh: 0xB80 (high 32 bits)
 */
static inline ee_u64 read_mcycle64(void)
{
    ee_u32 hi1, lo, hi2;
    /* Atomic read sequence: read high, low, high again.
     * If high changed, low overflowed, so retry. */
    do
    {
        __asm__ volatile("csrr %0, mcycleh" : "=r"(hi1));
        __asm__ volatile("csrr %0, mcycle" : "=r"(lo));
        __asm__ volatile("csrr %0, mcycleh" : "=r"(hi2));
    } while (hi1 != hi2);
    return ((ee_u64)hi1 << 32) | (ee_u64)lo;
}

/* Platform clock hook: reports the current cycle count.
 * CORETIMETYPE is expected to be ee_u64 here, because the value is the two
 * 32-bit counter halves joined into 64 bits. */
CORETIMETYPE
barebones_clock()
{
    CORETIMETYPE now = read_mcycle64();

    return now;
}

接着，设置秒数计算的函数time_in_secs：

/* Convert a tick count into whole seconds.
 *
 * The division is carried out in 64 bits and the quotient is then narrowed
 * to secs_ret; the number of seconds is small, so the narrower type is
 * enough to hold it. */
secs_ret
time_in_secs(CORE_TICKS ticks)
{
    const ee_u64 whole_seconds = ticks / (ee_u64)EE_TICKS_PER_SEC;

    return (secs_ret)whole_seconds;
}

ee_printf.c

我们需要使用iverilog进行运行，并输出。怎么做呢？我们可以划分一个独特的内存地址，并进行监控，当该内存地址写入值之后，将值打印出来。这就是WRITE_TOHOST的监控方式。

在开头设定：#define TOHOST (*(volatile ee_u32*)0x0D000720)

然后内部的HAS_FLOAT相关内容都可以删去。

之后，我们重新定义字符发送函数，和ee_printf函数：

#include <stdint.h>

/* Emit one character by storing it to the TOHOST location. */
static inline void putch(char c)
{
    /* Narrow to uint8_t first so a negative char is not sign-extended
     * into the upper bits of the 32-bit store. */
    const uint8_t byte = (uint8_t)c;

    TOHOST = (uint32_t)byte;
}

/* Print v as lower-case hexadecimal, zero-padded to at least `width` digits.
 *
 *   v      value to print
 *   width  minimum number of digits; values <= 0 select 8
 *
 * A 32-bit value has at most 8 hex digits. Wider fields are filled with
 * extra leading '0' characters instead of shifting by 32 bits or more
 * (which is undefined behaviour). A field narrower than the value is widened
 * so that no significant digit is dropped, matching printf's "minimum width"
 * meaning.
 */
static void print_hex32(uint32_t v, int width)
{
    static const char digits[] = "0123456789abcdef";
    int needed                 = 1;

    if (width <= 0)
        width = 8;

    /* Count the significant nibbles of v (1..8). */
    while (needed < 8 && (v >> (needed * 4)) != 0)
        needed++;
    if (width < needed)
        width = needed;

    /* Padding beyond the 8 real digits is emitted directly. */
    for (; width > 8; width--)
        putch('0');

    for (int shift = (width - 1) * 4; shift >= 0; shift -= 4)
        putch(digits[(v >> shift) & 0xF]);
}

/* Print x in decimal without padding.
 *
 * Digits are produced least-significant first into a scratch buffer and
 * then emitted in reverse. The do/while form yields a single '0' for x == 0.
 */
static void print_udec32(uint32_t x)
{
    char digits[16];
    int count = 0;

    do
    {
        digits[count] = '0' + (x % 10);
        count++;
        x = x / 10;
    } while (x != 0);

    for (int pos = count - 1; pos >= 0; pos--)
    {
        putch(digits[pos]);
    }
}

/* Print v in signed decimal: a '-' for negative values, then the magnitude.
 *
 * The magnitude is computed in unsigned arithmetic; negating INT32_MIN as a
 * signed value would overflow (undefined behaviour), while the unsigned
 * subtraction gives the right magnitude 2147483648.
 */
static void print_dec32(int32_t v)
{
    uint32_t magnitude = (uint32_t)v;

    if (v < 0)
    {
        putch('-');
        magnitude = (uint32_t)0 - magnitude;
    }

    print_udec32(magnitude);
}

/* Minimal printf replacement that emits characters through putch().
 *
 * Supported conversions: %c %s %x %u %d and %%, each optionally preceded by
 * a decimal field width and a single 'l' length modifier (long is treated as
 * 32 bits). The width only affects %x, which is always zero-padded and
 * defaults to 8 digits; it is parsed but ignored for the other conversions.
 * An unknown conversion is echoed as '%', an optional 'l', and the
 * conversion character; its argument is not consumed.
 *
 * A format string that ends in the middle of a conversion (for example a
 * trailing '%') prints the '%' (and 'l' if present) and stops, instead of
 * stepping past the terminating NUL.
 *
 * Always returns 0.
 */
int ee_printf(const char* fmt, ...)
{
    va_list ap;
    va_start(ap, fmt);

    while (*fmt)
    {
        if (*fmt != '%')
        {
            putch(*fmt++);
            continue;
        }

        fmt++;  // skip '%'

        // "%%" prints a literal percent sign
        if (*fmt == '%')
        {
            putch('%');
            fmt++;
            continue;
        }

        // Field width. A leading '0' flag is swallowed by the same loop: it
        // contributes nothing to the value, and %x zero-pads regardless.
        int width = 0;
        while (*fmt >= '0' && *fmt <= '9')
        {
            width = width * 10 + (*fmt - '0');
            fmt++;
        }

        // Length modifier: only a single 'l' is recognised
        int is_long = 0;
        if (*fmt == 'l')
        {
            is_long = 1;
            fmt++;
        }

        char f = *fmt;
        if (f == '\0')
        {
            // Format ended inside a conversion: do not advance past the NUL.
            putch('%');
            if (is_long)
                putch('l');
            break;
        }
        fmt++;

        switch (f)
        {
        case 'c':
        {
            char c = (char)va_arg(ap, int);
            putch(c);
            break;
        }
        case 's':
        {
            const char* s = va_arg(ap, const char*);
            if (!s)
                s = "(null)";
            while (*s) putch(*s++);
            break;
        }
        case 'x':
        {
            uint32_t v;
            if (is_long)
                v = (uint32_t)va_arg(ap, unsigned long);  // RV32: 32-bit
            else
                v = (uint32_t)va_arg(ap, unsigned int);

            if (width == 0)
                width = 8;
            print_hex32(v, width);
            break;
        }
        case 'u':
        {
            uint32_t v;
            if (is_long)
                v = (uint32_t)va_arg(ap, unsigned long);
            else
                v = (uint32_t)va_arg(ap, unsigned int);

            print_udec32(v);
            break;
        }
        case 'd':
        {
            int32_t v;
            if (is_long)
                v = (int32_t)va_arg(ap, long);
            else
                v = (int32_t)va_arg(ap, int);

            print_dec32(v);
            break;
        }
        default:
            // unknown specifier: echo it back
            putch('%');
            if (is_long)
                putch('l');
            putch(f);
            break;
        }
    }

    va_end(ap);
    return 0;
}

至此，输出部分修改完毕。

core_main.c

这里就是关键的主入口了。我们需要修改一部分函数。

在开头加上自定义的运行完毕返回函数：

#define TOHOST (*(volatile ee_u32*)0x0D000720)
/* Report the final status by writing `code` to TOHOST, then spin forever.
 * A code of 1 means PASS. The endless loop keeps the core from running on
 * after the status has been reported. */
static void exit_sim(ee_u32 code)
{
    TOHOST = code;

    for (;;)
    {
    }
}

随后继续加上read_mcycle64：

static inline ee_u64 read_mcycle64(void)
{
    ee_u32 hi1, lo, hi2;
    /* Atomic read sequence: read high, low, high again.
     * If high changed, low overflowed, so retry. */
    do
    {
        __asm__ volatile("csrr %0, mcycleh" : "=r"(hi1));
        __asm__ volatile("csrr %0, mcycle" : "=r"(lo));
        __asm__ volatile("csrr %0, mcycleh" : "=r"(hi2));
    } while (hi1 != hi2);
    return ((ee_u64)hi1 << 32) | (ee_u64)lo;
}

关键的计算校验值部分在282行开始的地方：

stop_time();
total_time = get_time();
/* get a function of the input to report */
seedcrc    = crc16(results[0].seed1, seedcrc);
seedcrc    = crc16(results[0].seed2, seedcrc);
seedcrc    = crc16(results[0].seed3, seedcrc);
seedcrc    = crc16(results[0].size, seedcrc);

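这里，如果不在每个crc更新之间插入一个ee_printf，会导致最终的校验值不正常，非常奇怪。原因尚不明确。不过需要注意：空的 ee_printf("") 在逻辑上不会改变 seedcrc，它只是在两次 crc16 调用之间多插入了一次函数调用，改变了指令序列和时序；因此问题更可能出在 CPU 本身（例如流水线的数据冒险/转发或访存时序），而不是 CoreMark 的代码。这个改动只是临时绕过，之后应当回头定位根因。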

seedcrc    = crc16(results[0].seed1, seedcrc);
ee_printf(""); // 手动插入空的 ee_printf
seedcrc = crc16(results[0].seed2, seedcrc);
ee_printf("");
seedcrc = crc16(results[0].seed3, seedcrc);
ee_printf("");
seedcrc = crc16(results[0].size, seedcrc);

因为我们是iverilog仿真，不需要跑很多轮，因此可以将367行的total_errors++注释掉（这一处应该是“运行时间必须不少于 10 秒”检查失败时的计数）。注意：去掉这项检查后得到的结果只适合做功能验证，不符合 CoreMark 正式报告成绩的规则。

最后，需要在415行开始的程序运行结束部分加入自己的运行完毕返回函数：

if (total_errors > 0)
{
    ee_printf("Errors detected\n");
    exit_sim(2);
}
if (total_errors < 0)
{
    ee_printf(
        "Cannot validate operation for these seed values, please compare "
        "with results on a known platform.\n");
    exit_sim(2);
}

#if (MEM_METHOD == MEM_MALLOC)
for (i = 0; i < MULTITHREAD; i++)
    portable_free(results[i].memblock[0]);
#endif
/* And last call any target specific code for finalizing */
portable_fini(&(results[0].port));
exit_sim(1);
return MAIN_RETURN_VAL;

至此，CoreMark库修改完毕。

编译与链接相关

start.S

我们需要一个 start.S 来提供程序入口 _start（也就是 link.ld 里 ENTRY 指定的符号）：它先设置栈指针，然后调用 main。

    # Reset entry point. Lives in its own section, .text.init, so the linker
    # script can place it ahead of the rest of the code.
    .section .text.init
    .globl   _start
_start:
    # Point the stack pointer at _stack_top before any C code runs.
    la       sp, _stack_top

# Clear BSS (left disabled until __bss_start/__bss_end are defined)
# la t0, __bss_start
# la t1, __bss_end
# ...

    # call     print_test
    call     main
    # If main ever returns, park the core in an endless loop.
1:
    j        1b

link.ld

链接脚本link.ld用于告诉链接器最终生成的程序该怎么排布到内存里。

/* Linker script: lays the program out contiguously starting at address 0,
 * with _start as the entry symbol. */
OUTPUT_ARCH("riscv")
ENTRY(_start)

SECTIONS
{
  /* The CPU fetches its first instruction from address 0 after reset */
  . = 0x00000000;

  /* Startup entry code, placed first */
  .text.init : {
    *(.text.init)
  }

  /* Do not add ALIGN(0x1000) here, or .text gets pushed out to 0x1000 */
  .text : {
    *(.text .text.*)
    *(.rodata .rodata.*)
  }

  /* Initialised data follows the code and read-only data */
  .data : {
    *(.data .data.*)
  }

  /* Zero-initialised data and common symbols */
  .bss : {
    *(.bss .bss.*)
    *(COMMON)
  }

  /* Fixed stack top address; _end marks the end of the laid-out sections */
  _stack_top = 0x0000FF00;
  _end = .;
}

/* Fail the link unless _end lies more than 0x1000 bytes below _stack_top */
ASSERT(_end < _stack_top - 0x1000, "ERROR: program too large / stack overlap!");

core_portme.mak

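自己写一个。下面的清单基本还是官方 barebones 模板的内容：实际为 RISC-V 交叉编译时，还需要把 CC/LD/AS 换成对应的交叉工具链（例如 riscv64-unknown-elf-gcc），并补上目标架构选项、链接脚本和 start.S（例如 -march=rv32i_zmmul -mabi=ilp32 -nostdlib -T link.ld）。另外，清单中的 PORT_CFLAGS 是 -O0 -g，而后文仿真结果里显示的编译选项是 -O2 -g，两者并不一致，以实际运行时使用的选项为准：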

# Copyright 2018 Embedded Microprocessor Benchmark Consortium (EEMBC)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# 
# Original Author: Shay Gal-on

#File : core_portme.mak

# Flag : OUTFLAG
#	Use this flag to define how to to get an executable (e.g -o)
OUTFLAG= -o
# Flag : CC
#	Use this flag to define compiler to use
# NOTE(review): "gcc" is normally the host compiler; a RISC-V cross compiler
# is presumably substituted for the actual bare-metal build - confirm.
CC 		= gcc
# Flag : LD
#	Use this flag to define the linker to use
LD		= gld
# Flag : AS
#	Use this flag to define the assembler to use
AS		= gas
# Flag : CFLAGS
#	Use this flag to define compiler options. Note, you can add compiler options from the command line using XCFLAGS="other flags"
# NOTE(review): -O0 here does not match the "-O2 -g" flags reported by the
# benchmark run in this document - confirm which setting was really used.
PORT_CFLAGS = -O0 -g
# FLAGS_STR is passed to the sources as a string macro (see CFLAGS below).
FLAGS_STR = "$(PORT_CFLAGS) $(XCFLAGS) $(XLFLAGS) $(LFLAGS_END)"
CFLAGS = $(PORT_CFLAGS) -I$(PORT_DIR) -I. -DFLAGS_STR=\"$(FLAGS_STR)\" 
#Flag : LFLAGS_END
#	Define any libraries needed for linking or other flags that should come at the end of the link line (e.g. linker scripts).
#	Note : On certain platforms, the default clock_gettime implementation is supported but requires linking of librt.
SEPARATE_COMPILE=1
# Flag : SEPARATE_COMPILE
# You must also define below how to create an object file, and how to link.
OBJOUT 	= -o
LFLAGS 	= 
ASFLAGS =
OFLAG 	= -o
COUT 	= -c

LFLAGS_END = 
# Flag : PORT_SRCS
# 	Port specific source files can be added here
#	You may also need cvt.c if the fcvt functions are not provided as intrinsics by your compiler!
PORT_SRCS = $(PORT_DIR)/core_portme.c $(PORT_DIR)/ee_printf.c
# Let make find .c and .s prerequisites inside the port directory.
vpath %.c $(PORT_DIR)
vpath %.s $(PORT_DIR)

# Flag : LOAD
#	For a simple port, we assume self hosted compile and run, no load needed.

# Flag : RUN
#	For a simple port, we assume self hosted compile and run, simple invocation of the executable

# Both commands are placeholders that only print a reminder.
# NOTE(review): the RUN reminder text says "set LOAD"; it presumably means RUN.
LOAD = echo "Please set LOAD to the process of loading the executable to the flash"
RUN = echo "Please set LOAD to the process of running the executable (e.g. via jtag, or board reset)"

# File name suffixes for object files and for the final image.
OEXT = .o
EXE = .bin

# Compile rules: port-directory sources and top-level sources.
# NOTE(review): make expands rule targets when the rule is read, and OPATH
# is only assigned further down in this file - confirm that it is already
# set by the including Makefile at this point.
$(OPATH)$(PORT_DIR)/%$(OEXT) : %.c
	$(CC) $(CFLAGS) $(XCFLAGS) $(COUT) $< $(OBJOUT) $@

$(OPATH)%$(OEXT) : %.c
	$(CC) $(CFLAGS) $(XCFLAGS) $(COUT) $< $(OBJOUT) $@

# Assemble rule for .s sources in the port directory.
$(OPATH)$(PORT_DIR)/%$(OEXT) : %.s
	$(AS) $(ASFLAGS) $< $(OBJOUT) $@

# Target : port_pre% and port_post%
# For the purpose of this simple port, no pre or post steps needed.

.PHONY : port_prebuild port_postbuild port_prerun port_postrun port_preload port_postload
port_pre% port_post% : 

# FLAG : OPATH
# Path to the output folder. Default - current folder.
OPATH = ./
MKDIR = mkdir -p

顶层模块

我们需要写一个针对CoreMark测试的顶层模块。

`timescale 1ns / 1ps

`include "../src/CPU_TOP.sv"
`define DEBUG 

`define REG_FILE u_CPU_TOP.u_registerf
// verilog_format: off
// Simulation top level for the CoreMark run.
//
// Instantiates the instruction ROM and the CPU, mirrors the register file
// into named signals for waveform viewing, and watches stores to the TOHOST
// address: the value 1 ends the simulation as PASS, 2 as FAIL, and any other
// value is printed as one character (so characters 0x01 and 0x02 cannot be
// printed through this channel).
module coremark;

// Clock and reset
    logic        clk;
    logic        rst_n;

    // IROM signals
    logic [13:0] irom_addr;
    logic [31:0] irom_data;

// Instruction ROM
    IROM #(
        .ADDR_WIDTH(14)
    ) u_IROM (
        .a  (irom_addr),
        .spo(irom_data)
    );

// CPU under test
    CPU_TOP u_CPU_TOP (
        .clk  (clk),
        .rst_n(rst_n),
        .instr(irom_data),
        .pc   (irom_addr)
    );

// Register-file mirror signals (for waveform inspection)
    logic [31:0] x0,  x1,  x2,  x3,  x4,  x5,  x6,  x7,
                 x8,  x9,  x10, x11, x12, x13, x14, x15,
                 x16, x17, x18, x19, x20, x21, x22, x23,
                 x24, x25, x26, x27, x28, x29, x30, x31;

    always_comb begin
        x0  = `REG_FILE.rf_in[0];
        x1  = `REG_FILE.rf_in[1];
        x2  = `REG_FILE.rf_in[2];
        x3  = `REG_FILE.rf_in[3];
        x4  = `REG_FILE.rf_in[4];
        x5  = `REG_FILE.rf_in[5];
        x6  = `REG_FILE.rf_in[6];
        x7  = `REG_FILE.rf_in[7];
        x8  = `REG_FILE.rf_in[8];
        x9  = `REG_FILE.rf_in[9];
        x10 = `REG_FILE.rf_in[10];
        x11 = `REG_FILE.rf_in[11];
        x12 = `REG_FILE.rf_in[12];
        x13 = `REG_FILE.rf_in[13];
        x14 = `REG_FILE.rf_in[14];
        x15 = `REG_FILE.rf_in[15];
        x16 = `REG_FILE.rf_in[16];
        x17 = `REG_FILE.rf_in[17];
        x18 = `REG_FILE.rf_in[18];
        x19 = `REG_FILE.rf_in[19];
        x20 = `REG_FILE.rf_in[20];
        x21 = `REG_FILE.rf_in[21];
        x22 = `REG_FILE.rf_in[22];
        x23 = `REG_FILE.rf_in[23];
        x24 = `REG_FILE.rf_in[24];
        x25 = `REG_FILE.rf_in[25];
        x26 = `REG_FILE.rf_in[26];
        x27 = `REG_FILE.rf_in[27];
        x28 = `REG_FILE.rf_in[28];
        x29 = `REG_FILE.rf_in[29];
        x30 = `REG_FILE.rf_in[30];
        x31 = `REG_FILE.rf_in[31];
    end

// Clock generation (100 MHz, 10 ns period)
    // verilog_format: on
    initial begin
        clk = 0;
        forever #5 clk = ~clk;
    end


    // Must match the TOHOST address used by the software.
    localparam integer TOHOST_ADDR = 32'h0d000720;

    // TOHOST monitor: reacts to a data-memory write whose address is TOHOST_ADDR.
    always_ff @(posedge clk) begin
        if (u_CPU_TOP.dram_we_MEM && u_CPU_TOP.alu_result_MEM == TOHOST_ADDR) begin
            // Value being stored to tohost
            logic [31:0] tohost_data;
            tohost_data = u_CPU_TOP.rf_rd2_MEM;

            // Exit codes
            if (tohost_data == 32'd1) begin
                $display("%10t| [PASS] |  Finished  ", $time);
                $finish;
            end else if (tohost_data == 32'd2) begin
                $display("%10t| [FAIL] |  Finished  ", $time);
                $finish;
            end else begin
                // Ordinary character output
                $write("%c", tohost_data[7:0]);
                $fflush();
            end
        end
    end

    // Reset and test control
    initial begin
        // Optional waveform dump, enabled with +DUMPWAVE=1
        integer dumpwave;
        if ($value$plusargs("DUMPWAVE=%d", dumpwave)) begin
            if (dumpwave == 1) begin
`ifdef VCD_FILEPATH
                $dumpfile({"../../", `VCD_FILEPATH});
`else
                $dumpfile("wave.vcd");
`endif
                $dumpvars;
            end
        end

        // Assert reset from time 0
        rst_n = 0;
        // Hold reset for 25 ns. Releasing it after only 5 ns would coincide
        // with the first rising clock edge (also at 5 ns) and race with it.
        #25;
        rst_n = 1;
    end

    // Optional +TESTCASE=<name> argument; read but not used further here.
    string testcase;
    initial begin
        if ($value$plusargs("TESTCASE=%s", testcase)) begin
        end
    end

    // Exception monitor
    always_ff @(posedge clk) begin
        if (u_CPU_TOP.exception_valid) begin
            $display("%10t| [EXCEPTION] PC=0x%08h, cause=%d, tval=0x%08h", $time,
                     u_CPU_TOP.exception_pc, u_CPU_TOP.exception_cause, u_CPU_TOP.exception_tval);
        end
    end

    // Timeout guard
    initial begin
        #100000000;  // 100 ms at the 1 ns time unit
        $display("%10t| [EROR] |  TimeOut!  ", $time);
        $finish;
    end

    // Half-rate clock (twice the period of clk) and a cycle counter, for
    // easier viewing. slow_clk is initialised in its declaration because a
    // variable written in always_ff must not be written by another process.
    logic        slow_clk = 1'b0;
    int unsigned count;

    always_ff @(posedge clk) begin
        slow_clk <= ~slow_clk;
        count    <= count + 1;
    end


endmodule

仿真测试

_end=0x00004404 stack_top=0x0000ff00 sp=0x0000fe70

2K performance run parameters for coremark.
CoreMark Size    : 666
Total ticks      : 4598265
Total time (secs): 229
Iterations/Sec   : 0
Iterations       : 10
Compiler version : GCC15.1.0
Compiler flags   : -O2 -g -DPERFORMANCE_RUN=1  
Memory location  : STATIC
seedcrc          : 0xe9f5
[0]crclist       : 0xe714
[0]crcmatrix     : 0x1fd7
[0]crcstate      : 0x8e3a
[0]crcfinal      : 0xfcaf
Correct operation validated. See README.md for run and reporting rules.
46305505000| [PASS] |  Finished

和Linux主机运行结果一致。完美。