reboot有一定概率卡死, 怀疑和cpld拉住复位有关.

和pcie有关

CIU_CIB_RST_RAW(0):0x107000000E400
CIU_CIB_RST_EN(0):0x107000000E500
/user # devmem 0x107000000E400 64
0x0000000000000000
/user # devmem 0x107000000E500 64
0x0000000000000009
0x0001180006001628:
0x0001180006001640:
/user # devmem 0x0001180006001640 64
0x0000000000000148
HOST_MODE=1
RST_LINK=0

0x0001180006001680:
0x00011800060016C0:








1. 流程

kernel/reboot.c
sys_reboot:
kernel_restart()
    kernel_restart_prepare()
        blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
        usermodehelper_disable();
        /*对每个device调用shutdown*/
        device_shutdown();
        对每个device:
            if (dev->bus && dev->bus->shutdown) {
                if (initcall_debug)
                    dev_info(dev, "shutdown\n");
                dev->bus->shutdown(dev);
            } else if (dev->driver && dev->driver->shutdown) {
                if (initcall_debug)
                    dev_info(dev, "shutdown\n");
                dev->driver->shutdown(dev);
            }
    migrate_to_reboot_cpu()
    /*调用注册过的所有sys core回调函数*/
    syscore_shutdown()
    pr_emerg("Restarting system\n")
    kmsg_dump()
    machine_restart()
        octeon_restart()
            for_each_online_cpu(cpu)
                cvmx_write_csr(CVMX_CIU_WDOGX(cpu_logical_map(cpu)), 0);
            cvmx_write_csr(CVMX_RST_SOFT_RST, 1)

reboot会调用device_shutdown(), 进而调用static void cvm_oct_shutdown(struct platform_device *pdev). 在这个函数里面:
会关闭所有interface, 关闭GMX, rx_disable, 注销并free掉net device, 清理hw pools, 清理pko queue.

2. 调试

/user # reboot
/user # 
 WDT=a
application_watchdog: /scripts/application_watchdog: collecting system information...
/scripts/collect_system_information: line 26: can't create /logs/info_02_uptime: Read-only file system
/scripts/collect_system_information: line 29: can't create /logs/info_ps: Read-only file system
/scripts/collect_system_information: line 34: can't create /logs/info_sp: Read-only file system
/scripts/collect_system_information: line 47: can't create /logs/info_interrupts: Read-only file system
/scripts/collect_system_information: line 50: can't create /logs/info_memory: Read-only file system
/scripts/collect_system_information: line 51: can't create /logs/info_memory: Read-only file system
/scripts/collect_system_information: line 54: can't create /logs/info_stat: Read-only file system
The system is going down NOW!
Sent SIGTERM to all processes
Sent SIGKILL to all processes
[   48.179078] reboot_helper: stored panic_counter = 0
[   48.197076] reboot_helper: isam_reboot_type='warm'
[   48.214954] reboot-helper: Enabling preserved ram
[   48.232745] flush l2 cache.
[   48.248725] reboot_helper: continuing standard linux reboot
[   48.267860] device eth0 left promiscuous mode
[   48.306015] bonding: bond0: releasing active interface eth-nta
[   48.324985] bonding: bond0: Warning: clearing HW address of bond0 while it still has VLANs.
[   48.346430] bonding: bond0: When re-adding slaves, make sure the bond's HW address matches its VLANs'.
[   48.368930] device bond0 entered promiscuous mode
[   48.456398] reboot: Restarting system
[   48.473173] reboot: Restarting system2
[   48.490019] cvmx_write_csr CVMX_RST_SOFT_RST
[   48.507409] Error: CIU_CIB_RST_RAWX(0)[INT_LINKX]
=====================BOOT=======================

现象1:

/proc # cat interrupts 
           CPU0       CPU1       CPU2       CPU3       
  8:     134755      47296      14313       8189      Core  timer
  9:          0          0          0          0       CIU  cpld0_nmi
 10:          0          0          0          0       CIU  cpld0
 11:          0          0          0          0       CIU  pioneer0
 24:       8567          0          0          0       CIU  eth0
 35:       1007          0          0          0       CIU  serial
 45:          0          0          0          0       CIU  i2c-octeon
 57:          0          0          0          0       CIU  octeon-hw-status
 59:          0          0          0          0       CIU  i2c-octeon
 60:          0          0          0          0       CIU  octeon-hw-status
 73:        973          0          0          0     CIU-W  octeon_wdt
 74:          0        973          0          0     CIU-W  octeon_wdt
 75:          0          0        973          0     CIU-W  octeon_wdt
 76:          0          0          0        973     CIU-W  octeon_wdt
 89:          0          0          0          0       CIU  octeon-hw-status
 90:          0          0          0          0       CIU  octeon-hw-status
 91:          0          0          0          0       CIU  octeon-hw-status
 92:          0          0          0          0       CIU  octeon-hw-status
 93:          0          0          0          0       CIU  octeon-hw-status
 94:          0          0          0          0       CIU  octeon-hw-status
 95:          0          0          0          0       CIU  octeon-hw-status
 96:          0          0          0          0       CIU  octeon-hw-status
 97:          0          0          0          0       CIU  octeon-hw-status
 98:          0          0          0          0       CIU  octeon-hw-status
 99:          0          0          0          0       CIU  octeon-hw-status
105:      23036      28147      20942      19105     CIU-M  SMP-IPI
109:          1          0          0          0       CIU  linux-kernel-bde
113:          0          0          0          0       CIU  MSI[0:63]
114:          0          0          0          0       CIU  MSI[64:127]
115:          0          0          0          0       CIU  MSI[127:191]
116:          0          0          0          0       CIU  MSI[192:255]
127:          0          0          0          0       CIU  octeon-hw-status
ERR:          0
/proc # devmem 0x1a000010 8 0x8b
/proc # [ 5203.567484] Data bus error, epc == 0000000010cc490c, ra == 0000000010cc4874
[ 5203.588339] Error: CIU_CIB_RST_RAWX(0)[INT_LINKX]
[ 5203.590919] [sched_delayed] sched: RT throttling activated
/proc # cat interrupts 
           CPU0       CPU1       CPU2       CPU3       
  8:     143908      50283      15649       9957      Core  timer
  9:          0          0          0          0       CIU  cpld0_nmi
 10:          0          0          0          0       CIU  cpld0
 11:          0          0          0          0       CIU  pioneer0
 24:      10078          0          0          0       CIU  eth0
 35:       1065          0          0          0       CIU  serial
 45:          0          0          0          0       CIU  i2c-octeon
 57:          0          0          0          0       CIU  octeon-hw-status
 59:          0          0          0          0       CIU  i2c-octeon
 60:          0          0          0          0       CIU  octeon-hw-status
 73:       1045          0          0          0     CIU-W  octeon_wdt
 74:          0       1045          0          0     CIU-W  octeon_wdt
 75:          0          0       1045          0     CIU-W  octeon_wdt
 76:          0          0          0       1045     CIU-W  octeon_wdt
 89:          0          0          0          0       CIU  octeon-hw-status
 90:          0          0          0          0       CIU  octeon-hw-status
 91:          0          0          0          0       CIU  octeon-hw-status
 92:          0          0          0          0       CIU  octeon-hw-status
 93:          0          0          0          0       CIU  octeon-hw-status
 94:          0          0          0          0       CIU  octeon-hw-status
 95:          0          0          0          0       CIU  octeon-hw-status
 96:          0          0          0          0       CIU  octeon-hw-status
 97:          0          0          0          0       CIU  octeon-hw-status
 98:          0          0          0          0       CIU  octeon-hw-status
 99:          0          0          0          0       CIU  octeon-hw-status
105:      24342      29600      22330      20376     CIU-M  SMP-IPI
109:          1          0          0          0       CIU  linux-kernel-bde
113:          0          0          0          0       CIU  MSI[0:63]
114:          0          0          0          0       CIU  MSI[64:127]
115:          0          0          0          0       CIU  MSI[127:191]
116:          0          0          0          0       CIU  MSI[192:255]
127:          1          0          0          0       CIU  octeon-hw-status
ERR:          0

3. Error: CIU_CIB_RST_RAWX(0)[INT_LINKX]

arch/mips/cavium-octeon/executive/cvmx-error-trees.c
error_tree_cn70xx:

在这里引用

struct cvmx_error_childbit {
    u8 valid;
    u8 bit;
    struct cvmx_error_muxchild *children;
};
struct cvmx_error_muxchild {
    u64 reg;
    u64 mask_reg;
    struct cvmx_error_regbit *bits;
    struct cvmx_error_childbit *children;
};

lmc的error项:

{1, 52 /* lmc0 */, (struct cvmx_error_muxchild[]){
    {CVMX_ADD_IO_SEG(0x00011800880001F0ull) + ((0) & 0) * 0x1000000ull /* CVMX_LMCX_INT(0) */, CVMX_ADD_IO_SEG(0x00011800880001E8ull) + ((0) & 0) * 0x1000000ull /* CVMX_LMCX_INT_EN(0) */, (struct cvmx_error_regbit[]){
            {1, 1, CVMX_ERROR_GROUP_LMC, 1, 0, "LMCX_INT(0)[SEC_ERR]"},
            {1, 1, CVMX_ERROR_GROUP_LMC, 0, 0, "LMCX_INT(0)[NXM_WR_ERR]"},
            {1, 1, CVMX_ERROR_GROUP_LMC, 5, 0, "LMCX_INT(0)[DED_ERR]"},
            {0}},
        NULL /*cvmx_error_childbit*/
    },
    {0}}},
{1, 52 /* lmc0 */, (struct cvmx_error_muxchild[]){
    {CVMX_ADD_IO_SEG(0x000107000000E200ull) /* CVMX_CIU_CIB_LMCX_RAWX(0,0) */, CVMX_ADD_IO_SEG(0x000107000000E300ull) /* CVMX_CIU_CIB_LMCX_ENX(0,0) */, (struct cvmx_error_regbit[]){
            {1, 1, CVMX_ERROR_GROUP_LMC, 1, 0, "CIU_CIB_LMCX_RAWX(0,0)[INT_SEC_ERRX]"},
            {1, 1, CVMX_ERROR_GROUP_LMC, 5, 0, "CIU_CIB_LMCX_RAWX(0,0)[INT_DED_ERRX]"},
            {1, 1, CVMX_ERROR_GROUP_LMC, 0, 0, "CIU_CIB_LMCX_RAWX(0,0)[INT_NXM_WR_ERR]"},
            {0}},
        NULL /*cvmx_error_childbit*/
    },
    {0}}},

4. 注册

struct octeon_hw_status_data {
    u64 reg;
    u32 bit;
    u8 reg_is_hwint:1;
};
arch/mips/cavium-octeon/octeon-error-tree.c //这个文件负责解析错误, 表示层
arch_initcall(octeon_error_tree_init)
    static int __init octeon_error_tree_init(void)
        struct cvmx_error_tree *tree = octeon_error_trees
        //根据芯片id, 找到cn70xx的tree
        octeon_error_tree_current = tree->tree
        //注册notifier_call, 但什么时候调用呢?见下一条
        octeon_error_tree_notifier.notifier_call = octeon_error_tree_hw_status //这个函数会遍历那个error tree的子节点, 并打印描述
        octeon_hw_status_notifier_register(&octeon_error_tree_notifier)
            raw_notifier_chain_register(&octeon_hw_status_notifiers, nb)
        octeon_error_tree_enable(CVMX_ERROR_GROUP_INTERNAL, -1)
        //??
        octeon_error_tree_enable(CVMX_ERROR_GROUP_LMC, -1)
        octeon_error_tree_enable(CVMX_ERROR_GROUP_L2C, -1)
        //是不是还要加上CVMX_ERROR_GROUP_LMC,CVMX_ERROR_GROUP_L2C ?

4.1. 上面注册的notifier_call什么时候调用?

octeon-hw-status.c //这个文件提供一个硬件状态寄存器的中断处理和发生中断时的回调机制的抽象层
//RGMII SGMII SPI和octeon-error-tree.c都会调用这个函数
sr[idx].bit = bit->bit;
sr[idx].ack_w1c = bit->w1c;
sr[idx].has_child = 0;
octeon_hw_status_add_source(sr)
    root->hwint == chain->reg//遍历root, 寻找和入参一样的chain->reg
    root->irq = irq_create_mapping(NULL, root->hwint)
    rv = request_threaded_irq(root->irq, NULL,octeon_hw_status_irq, IRQF_ONESHOT,"octeon-hw-status", root)
//这个应该就是中断处理函数了
octeon_hw_status_irq()    
    visit_leaves(root, false, irq_cb, &d);
        irq_cb()
            raw_notifier_call_chain(&octeon_hw_status_notifiers, OCTEON_HW_STATUS_SOURCE_ASSERTED, &ohsd)

4.2. 猜想, 调用int octeon_hw_status_disable(u64 reg, u64 bit_mask)关闭中断位就可以了.

5. octeon_error_tree_enable()函数

    while(对每个项和子项)
        irq_reg = octeon_error_tree_map_irq_reg(base->reg);
        sr[0].reg = irq_reg << 6 | line->bit;
        sr[0].reg_is_hwint = 1;
        sr[0].has_child = 1;
        octeon_error_tree_add(sr, 1, ARRAY_SIZE(sr) - 1, child, group, unit);
            octeon_hw_status_add_source()
                root->irq = irq_create_mapping(NULL, root->hwint);
                //注册线程化中断, octeon_hw_status_irq()作为thread_fn, 是在线程(任务)上下文中调用的
                rv = request_threaded_irq(root->irq, NULL,octeon_hw_status_irq, IRQF_ONESHOT,"octeon-hw-status", root);
                    visit_leaves(root, false, irq_cb, &d);
                        raw_notifier_call_chain(&octeon_hw_status_notifiers,OCTEON_HW_STATUS_SOURCE_ASSERTED, &ohsd); //这就是前面注册的
            octeon_hw_status_enable()
            //对子节点递归调用

注: request_threaded_irq()的第二个入参是中断handler(primary handler), 在中断上下文调用. 为NULL时使用下面默认的irq_default_primary_handler, 它只返回IRQ_WAKE_THREAD以唤醒中断线程.

/*
 * Default primary interrupt handler for threaded interrupts. Is
 * assigned as primary handler when request_threaded_irq is called
 * with handler == NULL. Useful for oneshot interrupts.
 */
static irqreturn_t irq_default_primary_handler(int irq, void *dev_id)
{
    return IRQ_WAKE_THREAD;
}

6. ddr ecc中断寄存器


# devmem 0x00011800880001E8 64
0x0000000000000000
# devmem 0x00011800880001F0 64
0x0000000000000000
# devmem 0x00011800880001D8 64
0x000000000AFFB398
# devmem 0x107000000E000 64
0x0000000000000000
# devmem 0x107000000E100 64
0x00000000007FFFFF
CIU_CIB_LMC(0)_RAW(0)
# devmem 0x107000000E200 64
0x0000000000000008
CIU_CIB_LMC(0)_EN(0)
# devmem 0x107000000E300 64
0x0000000000000023
# devmem 0x107000000E200 64
0x0000000000000008
# devmem 0x107000000E300 64 0xffff
[ 1026.792931] ERROR: CIB bit 3@800107000000e200 IRQ unhandled, disabling
# cat /proc/interrupts 
           CPU0       CPU1       CPU2       CPU3       
  8:     111448     111454     111460     111423      Core  timer
 25:        146          0          0          0       CIB  ahci
 26:          0          0          0          0       CIB  xhci-hcd:usb1
 27:          0          0          0          0       CIB  xhci-hcd:usb3
 34:        696          0          0          0       CIU  serial
 45:          8          0          0          0       CIU  i2c-octeon
 57:          0          0          0          0       CIU  octeon-hw-status
 59:          0          0          0          0       CIU  i2c-octeon
 60:          0          0          0          0       CIU  octeon-hw-status
 73:        221          0          0          0     CIU-W  octeon_wdt
 74:          0        221          0          0     CIU-W  octeon_wdt
 75:          0          0        221          0     CIU-W  octeon_wdt
 76:          0          0          0        221     CIU-W  octeon_wdt
 89:          0          0          0          0       CIU  cib
 90:          0          0          0          0       CIU  cib
 91:          0          0          0          0       CIU  octeon-hw-status
 92:          0          0          0          0       CIU  octeon-hw-status
 93:          0          0          0          0       CIU  octeon-hw-status
 94:          0          0          0          0       CIU  octeon-hw-status
 95:          0          0          0          0       CIU  octeon-hw-status
 96:          0          0          0          0       CIU  octeon-hw-status
 97:          0          0          0          0       CIU  cib
 98:          0          0          0          0       CIU  octeon-hw-status
 99:          0          0          0          0       CIU  octeon-hw-status
100:         13          0          0          0       CIU  octeon_mmc
105:       2087       2308        739       1575     CIU-M  SMP-IPI
113:          0          0          0          0       CIU  MSI[0:63]
114:          0          0          0          0       CIU  MSI[64:127]
115:          0          0          0          0       CIU  MSI[127:191]
116:          0          0          0          0       CIU  MSI[192:255]
121:    1108312          0          0          0       CIU  oct_ilm
122:          1          0          0          0       CIU  cib
127:          0          0          0          0       CIU  cib
144:        146          0          0          0       CIU  cib
145:          0          0          0          0       CIU  cib
ERR:          0
static irqreturn_t octeon_irq_cib_handler(int my_irq, void *data)
# [  366.867126] INFO: rcu_sched self-detected stall on CPU { 0}  (t=6000 jiffies g=18446744073709551430 c=18446744073709551429 q=15)                                [5/3476]
[  366.937276] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 3.10.20-rt14-Cavium-Octeon #6
[  366.956650] Stack : ffffffff80980000 ffffffff812a0000 0000000000000057 ffffffff80980000
          000000000000000f ffffffff81290000 ffffffff80980000 0000000000000001
          000000000000000f ffffffff81290000 ffffffff80980000 0000000000000001
          0000000000000000 ffffffff8016e9e8 0000000000000000 0000000000000000
          ffffffff812a0000 ffffffff81290000 ffffffff80862c80 ffffffff809403f7
          ffffffff81289d00 ffffffff80940900 0000000000000000 0000000000000000
          8000000002270c58 ffffffff80940000 ffffffff80940000 ffffffff806dfbc8
          ffffffff8090afa8 ffffffff8090aea0 ffffffff8093a4e8 ffffffff801e3ec8
          ffffffff809404f0 ffffffff80862c80 0000000000000000 0000000000000000
          0000000000000000 ffffffff80149390 0000000000000000 0000000000000000
          ...
[  367.642775] Call Trace:
[  367.656940] [<ffffffff80149390>] show_stack+0xc0/0xe0
[  367.673711] [<ffffffff801e3ec8>] rcu_check_callbacks+0x3a8/0x838
[  367.691434] [<ffffffff801801ec>] update_process_times+0x54/0x88
[  367.709070] [<ffffffff801b9f90>] tick_sched_timer+0x70/0x180
[  367.726447] [<ffffffff80199cd8>] __run_hrtimer+0xa0/0x228
[  367.743561] [<ffffffff8019aaa8>] hrtimer_interrupt+0x2c8/0x3c0
[  367.761110] [<ffffffff8014c058>] c0_compare_interrupt+0x68/0x98
[  367.778749] [<ffffffff801d93b0>] handle_irq_event_percpu+0x80/0x2c0
[  367.796732] [<ffffffff801dd6e8>] handle_percpu_irq+0x98/0xc8
[  367.814106] [<ffffffff801d8924>] generic_handle_irq+0x44/0x58
[  367.831570] [<ffffffff806e7fac>] do_IRQ+0x24/0x30
[  367.847990] [<ffffffff80106a70>] plat_irq_dispatch+0xa0/0xc0
[  367.865364] [<ffffffff801440cc>] ret_from_irq+0x0/0x4
[  367.882130] [<ffffffff8019c5dc>] notifier_call_chain+0x8c/0xc0
[  367.899678] [<ffffffff8019c62c>] __atomic_notifier_call_chain+0x1c/0x28
[  367.918008] [<ffffffff8019cfc4>] notify_die+0x34/0x40
[  367.934774] [<ffffffff80149f90>] do_ri+0x60/0x280
[  367.951192] [<ffffffff801440c0>] ret_from_exception+0x0/0xc
[  367.968479] [<ffffffff8014b4f8>] do_ade+0x40/0x798
[  367.984983] [<ffffffff801440c0>] ret_from_exception+0x0/0xc
[  368.002269] [<ffffffff801d93a0>] handle_irq_event_percpu+0x70/0x2c0
[  368.020252] [<ffffffff801dd6e8>] handle_percpu_irq+0x98/0xc8
[  368.037626] [<ffffffff80103be0>] octeon_irq_cib_handler+0xf0/0x1c8
[  368.055522] [<ffffffff801d93b0>] handle_irq_event_percpu+0x80/0x2c0
[  368.073505] [<ffffffff801d9650>] handle_irq_event+0x60/0x90
[  368.090791] [<ffffffff801dce10>] handle_level_irq+0xd0/0x150
[  368.108165] [<ffffffff801d8924>] generic_handle_irq+0x44/0x58 很多打印
[  368.125626] [<ffffffff806e7fac>] do_IRQ+0x24/0x30
[  368.142044] [<ffffffff80106a3c>] plat_irq_dispatch+0x6c/0xc0
[  368.159417] [<ffffffff801440cc>] ret_from_irq+0x0/0x4
[  368.176181] [<ffffffff801443e0>] __r4k_wait+0x20/0x40
[  368.192950] [<ffffffff801b16c4>] cpu_startup_entry+0xe4/0x2b0
[  368.210415] [<ffffffff809de9c8>] start_kernel+0x4b0/0x4d0
[  368.227527]

results matching ""

    No results matching ""