1. 背景

有的板子kernel启动的时候有ecc错, kernel里面有个polling任务会打印

EDAC MC0: 1 CE DIMM 1 rank 1 bank 4 row 65056 col 1020 on any memory ( page:0x0 offset:0x0 grain:0 syndrome:0x0)

一个workaround思路是, 检测到ecc错的时候重启板子. 因为这个ecc错和重启相关.

2. 尝试1

在init脚本里面检测

# Poll the kernel log once per second and reboot the board as soon as an
# EDAC correctable (CE) or uncorrectable (UE) DIMM error line appears.
# do_reboot is expected to be defined elsewhere in the init script.
while true; do
    for kind in CE UE; do
        key="$kind DIMM"
        # Test the pipeline's exit status directly. Wrapping it in backticks
        # would make the shell try to execute whatever the pipeline printed.
        if dmesg | grep -q "$key"; then
            echo "i find:  $key"
            do_reboot "$kind reset"
        fi
    done

    sleep 1
done

这样做的问题是太晚了: 有的时候这个脚本根本还没来得及运行, kernel就已经打calltrace挂住了.

3. 尝试2

在那个polling任务里监测, 还是太晚了. 这个任务可能都还没来得及运行, kernel就挂住了.

4. 终极方案

挂中断
这里先把代码放上来: 在 ~/repo/hg/xrepo/OCTEON-SDK-3.1/linux/kernel/linux/drivers/edac/octeon_edac-lmc.c 文件的最后加上下面这段

#if 1
#include <linux/interrupt.h>
/*
 * Interrupt handler for the LMC0 memory-controller error interrupt.
 *
 * Reads the CIU CIB raw register for LMC0 and writes the same value back to
 * clear the source, then reads LMC0_INT and logs any single-bit ECC
 * (sec_err), double-bit ECC (ded_err) or write-to-non-existent-memory
 * (nxm_wr_err) condition it reports. The handled fields are set to all ones
 * and LMC0_INT is written back to clear / re-arm them.
 *
 * @irq:    Linux irq number this handler was invoked for (only logged).
 * @dev_id: cookie that was passed to request_irq(); unused here.
 *
 * Always returns IRQ_HANDLED.
 */
static irqreturn_t interrupt_octeon_lmc(int irq, void *dev_id)
{
    cvmx_ciu_cib_lmcx_rawx_t cib_lmc_raw;
    union cvmx_lmcx_int int_reg;
    /* Start empty so msg can never be printed uninitialised. */
    char msg[64] = "";

    /*
     * Every message below ends in '\n' so that it is a complete log line on
     * its own rather than an unterminated fragment.
     */
    printk("%s:%d irq=%d\n", __func__, __LINE__, irq);

    cib_lmc_raw.u64 = cvmx_read_csr(CVMX_CIU_CIB_LMCX_RAWX(0,0));
    /* Clear the source: write back exactly the bits that were read. */
    cvmx_write_csr(CVMX_CIU_CIB_LMCX_RAWX(0,0), cib_lmc_raw.u64);

    int_reg.u64 = cvmx_read_csr(CVMX_LMCX_INT(0));
    printk("DDR ecc error! LMC0_INT=0x%llx\n", int_reg.u64);

    if (int_reg.s.sec_err || int_reg.s.ded_err) {
        union cvmx_lmcx_fadr fadr;

        /*
         * Decode the failing address once; both ECC messages share it.
         * NOTE(review): this uses the cn61xx view of LMC_FADR although the
         * handler is only installed on CN70XX -- confirm the field layout
         * matches on that chip.
         */
        fadr.u64 = cvmx_read_csr(CVMX_LMCX_FADR(0));
        snprintf(msg, sizeof(msg),
             "DIMM %d rank %d bank %d row %d col %d",
             fadr.cn61xx.fdimm, fadr.cn61xx.fbunk,
             fadr.cn61xx.fbank, fadr.cn61xx.frow, fadr.cn61xx.fcol);
    }
    if (int_reg.s.sec_err) {
        printk("single error detected: %s\n", msg);
        int_reg.s.sec_err = -1; /* Done, re-arm */
    }
    if (int_reg.s.ded_err) {
        printk("double error detected: %s\n", msg);
        int_reg.s.ded_err = -1; /* Done, re-arm */
    }
    if (int_reg.s.nxm_wr_err) {
        snprintf(msg, sizeof(msg), "NXM_WR_ERR: Write to non-existent memory");
        printk("nxm error detected: %s\n", msg);
        int_reg.s.nxm_wr_err = -1;  /* Done, re-arm */
    }

    /* Clear INT: write the (possibly updated) value back to LMC0_INT. */
    cvmx_write_csr(CVMX_LMCX_INT(0), int_reg.u64);
    return IRQ_HANDLED;
}
/*
 * Install interrupt_octeon_lmc() as the handler for the LMC0 error
 * interrupt and enable the ded/sec/nxm_wr error sources in the CIU CIB
 * enable register.
 *
 * Only acts on OCTEON CN70XX; on any other model it returns 0 without
 * touching anything.
 *
 * Returns 0 on success (or when skipped), -EINVAL when the hardware irq
 * cannot be mapped to a Linux irq. Panics if request_irq() fails.
 */
static int octeon_setup_lmc_interrupt(void)
{
    unsigned int hwirq;
    int irq;
    cvmx_ciu_cib_lmcx_enx_t cib_lmc_en;

    if (!OCTEON_IS_MODEL(OCTEON_CN70XX))
        return 0;

    /* Each message ends in '\n' so it is a complete log line on its own. */
    printk("ECC workaround for cn70xx!\n");

    /*
     * NOTE(review): presumably (line << 6) + bit in the OCTEON CIU irq
     * numbering, i.e. line 1, bit 52 -- confirm against the irq chip code.
     */
    hwirq = (1 << 6) + 52;

    /* Map the hardware irq to a Linux irq; 0 means the mapping failed. */
    irq = irq_create_mapping(NULL, hwirq);
    printk("map irq=%d for ecc error\n", irq);
    if (!irq)
        return -EINVAL;

    /* The handler's own address doubles as the dev_id cookie. */
    if (request_irq(irq, interrupt_octeon_lmc,
            0, "lmc-ecc-error", interrupt_octeon_lmc)) {
        panic("request_irq(%d) failed.\n", irq);
    }

    /* Enable the sources only after the handler is in place. */
    cib_lmc_en.u64 = 0;
    cib_lmc_en.s.int_ded_errx = -1;
    cib_lmc_en.s.int_sec_errx = -1;
    cib_lmc_en.s.int_nxm_wr_err = -1;
    cvmx_write_csr(CVMX_CIU_CIB_LMCX_ENX(0,0), cib_lmc_en.u64);

    printk("attach ecc handler done!\n");
    return 0;
}
/* Run during the core initcall stage of kernel boot. */
core_initcall(octeon_setup_lmc_interrupt);
#endif

这里面有好几个点

hwirq是硬件中断号, 和具体的中断控制器(可以认为一个中断控制器是一个domain)有关, 这个domain里面的map函数控制hwirq号到irq号的映射
irq号是linux的一个概念, 对应这个irq的软件控制实例(也就是结构体)
要挂中断, 先要把hwirq map到irq, 这通过irq_create_mapping()实现, 这个函数会申请irq相关的结构体并返回irq号
然后用request_irq()来挂中断处理函数, 最后一个参数会被传到handler里面
handler里面一般要先清中断; 在handler里可以调printk打印(为什么? 因为printk不会睡眠, 可以在包括中断上下文在内的任意上下文里调用)
core_initcall()是initcall系列宏中的一个, 用来在kernel启动的时候注册初始化函数; 在各级initcall里, 它的执行顺序比较靠前.

5. 启动打印

03-19 06:56:05  [    0.000000] NR_IRQS:598
03-19 06:56:05  [   34.196761] Calibrating delay loop (skipped) preset value.. 2400.00 BogoMIPS (lpj=12000000)
03-19 06:56:05  [   34.204941] pid_max: default: 32768 minimum: 501
03-19 06:56:05  [   34.209775] Mount-cache hash table entries: 256
03-19 06:56:05  [   34.217025] Checking for the daddi bug... no.
03-19 06:56:05  [   34.221223] ftrace: allocating 14160 entries in 56 pages
03-19 06:56:05  [   34.248215] Performance counters: octeon PMU enabled, 4 64-bit counters available to each CPU, irq 7
03-19 06:56:05  [   34.308272] SMP: Booting CPU01 (CoreId  1)...
03-19 06:56:05  [   34.312471] CPU revision is: 000d9602 (Cavium Octeon III)
03-19 06:56:05  [   34.312475] FPU revision is: 00739600
03-19 06:56:05  [   34.312743] Brought up 2 CPUs
03-19 06:56:05  [   34.325172] devtmpfs: initialized
03-19 06:56:05  [   34.330228] ECC workaround for cn70xx!
03-19 06:56:05  [   34.333820] map irq=122 for ecc error
03-19 06:56:05  [   34.337512] attach ecc handler done!
03-19 06:56:05  [   34.341020] interrupt_octeon_lmc:229 irq=122
03-19 06:56:05  [   34.345272] DDR ecc error! LMC0_INT=0x0
03-19 06:56:05  [   34.349093] LCM0_INT_EN=0x0 LCM0_INT=0x0
03-19 06:56:05  [   34.353002] LCM0_FADR=0x2000000 LCM0_ECC_SYND=0xc500
03-19 06:56:05  [   34.358113] NET: Registered protocol family 16
03-19 06:56:05  [   34.363522] Installing handlers for error tree at: ffffffff80727130

6. 后记

这个挂中断的方案后来也被证明还是太晚了; 最后查明, 这个ecc错是ddr的参数不对导致的.
但是在linux下面挂一个中断处理函数, 大体如此.

octeon 增加ECC中断