在3.2.0内核上,将AM33X平台的CPSW配置成工作在双网口模式,两个网口都外接百兆PHY,在linux系统中可以看到eth0和eth1两个网卡,
利用brctl工具将这两个网卡配置为网桥模式,然后使用网络测试仪做双向90M流量数据包转发测试时可长时间稳定运行,无任何问题。
但是如果将双向数据包流量增加到100M,运行十几分钟后linux系统会打印如下错误:
root@SDT-route:~# [166259.673790] ------------[ cut here ]------------
[166269.877097] WARNING: at net/sched/sch_generic.c:255 dev_watchdog+0x140/0x228()
[166285.572713] NETDEV WATCHDOG: eth1 (cpsw): transmit queue 0 timed out
[166299.355497] Modules linked in: rkey(O) option usb_wwan
[166310.590825] Backtrace:
[166316.017001] [<c0016298>] (c0016298) from [<c029d4c0>] (c029d4c0)
[166329.071635] r6:c0386195 r5:000000ff r4:c09dbe38 r3:60000113
[166341.399657] [<c029d4a8>] (c029d4a8) from [<c003738c>] (c003738c)
[166354.453267] [<c0037338>] (c0037338) from [<c0037448>] (c0037448)
[166367.502526] r8:c02128cc r7:00000100 r6:00000000 r5:c0a62a50 r4:c66f7000
[166381.688083] r3:00000009
[166387.495443] [<c0037410>] (c0037410) from [<c0212a0c>] (c0212a0c)
[166400.532926] r3:c66f7000 r2:c03861ad
[166408.391017] [<c02128cc>] (c02128cc) from [<c00413ac>] (c00413ac)
[166421.428158] r6:00000100 r5:c09da000 r4:c0a37980
[166431.527358] [<c0041278>] (c0041278) from [<c003c438>] (c003c438)
[166444.580627] [<c003c3b8>] (c003c3b8) from [<c003c82c>] (c003c82c)
[166457.621694] [<c003c7e4>] (c003c7e4) from [<c0013b90>] (c0013b90)
[166470.674707] [<c0013b24>] (c0013b24) from [<c00085b0>] (c00085b0)
[166483.712105] r5:00000004 r4:0000005d
[166491.569939] [<c000854c>] (c000854c) from [<c00128c0>] (c00128c0)
[166504.611091] Exception stack(0xc09dbf40 to 0xc09dbf88)
[166515.629161] bf40: 40000013 c09fe188 c09dbf88 00000000 c09da000 c09e2b08 c0a24644 c09e2afc
[166533.342398] bf60: 80004059 413fc082 00000000 c09dbf94 c09dbf98 c09dbf88 c0013ce0 c0013ce4
[166551.047187] bf80: 60000013 ffffffff
[166558.689470] r6:ffffffff r5:60000013 r4:c0013ce4 r3:c0013ce0
[166571.029523] [<c0013cb8>] (c0013cb8) from [<c0013e3c>] (c0013e3c)
[166584.070505] [<c0013dec>] (c0013dec) from [<c029738c>] (c029738c)
[166597.120361] r6:c0b65500 r5:c03d3aac r4:c09dc0bc r3:00000000
[166609.461609] [<c029732c>] (c029732c) from [<c03b0778>] (c03b0778)
[166622.502505] [<c03b04f8>] (c03b04f8) from [<80008040>] (80008040)
[166635.554409] ---[ end trace fc938efffc022624 ]---
我在cpsw.c文件的cpsw_ndo_tx_timeout()函数中增加打印信息,代码如下:
static void cpsw_ndo_tx_timeout(struct net_device *ndev){ struct cpsw_priv *priv = netdev_priv(ndev); pr_err("%s: ==============================\n", __func__); { int i; u32 *p; pr_err("show cpsw_ss_stats:\n"); p = &priv->regs->id_ver; for (i = 0; i < sizeof(struct cpsw_ss_regs)/4; i++) { pr_err("ss_stats%d = %02x\n", i, __raw_readl( p + i)); } pr_err("show wr_regs:\n"); p = &priv->wr_regs->id_ver; for (i = 0; i < sizeof(struct cpsw_wr_regs)/4; i++) { pr_err("wr_reg%d = %02x\n", i, __raw_readl( p + i)); } pr_err("show cpsw_hw_stats:\n"); p = &priv->hw_stats->rxgoodframes; for (i = 0; i < sizeof(struct cpsw_hw_stats)/4; i++) { pr_err("hw_stats%d = %02x\n", i, __raw_readl(p + i)); } pr_err("show cpsw_host_regs:\n"); p = &priv->host_port_regs->max_blks; for (i = 0; i < sizeof(struct cpsw_host_regs)/4; i++) { pr_err("host_port_regs%d = %02x\n", i, __raw_readl(p + i)); } } pr_err("%s: ==============================\n", __func__); cpdma_ctlr_dump(priv->dma); msg(err, tx_err, "transmit timeout, restarting dma"); priv->stats.tx_errors++; cpsw_intr_disable(priv); cpdma_ctlr_int_ctrl(priv->dma, false); cpdma_chan_stop(priv->txch); cpdma_chan_start(priv->txch); cpdma_ctlr_int_ctrl(priv->dma, true); cpsw_intr_enable(priv); cpdma_ctlr_eoi(priv->dma);}
打印出各寄存器的数值如下:
cpsw_ndo_tx_timeout: ==============================
[166658.482025] show cpsw_ss_stats: 寄存器名称:
[166665.381395] ss_stats0 = 19010c ID_VER
[166672.104297] ss_stats1 = 02 CONTROL
[166678.071486] ss_stats2 = 00 SOFT_RESET
[166684.037651] ss_stats3 = 07 STAT_PORT_EN
[166690.016531] ss_stats4 = 00 PTYPE
[166695.983038] ss_stats5 = 00 SOFT_IDLE
[166701.950142] ss_stats6 = 3003 THRU_RATE
[166708.288702] ss_stats7 = 0b GAP_THRESH
[166714.267497] ss_stats8 = 20 TX_START_WDS
[166720.234601] ss_stats9 = 01 FLOW_CONTROL
[166726.201363] ss_stats10 = 81008100 VLAN_LTYPE
[166733.482430] ss_stats11 = 00 TS_LTYPE
[166739.635561] ss_stats12 = 80e1 DLR_LTYPE
[166746.158441] show wr_regs: 寄存器名称
[166751.951635] wr_reg0 = 4edb0100 IDVER
[166758.662931] wr_reg1 = 00 SOFT_RESET: 未发生复位
[166764.257299] wr_reg2 = 05 CONTROL: Local initiator is unconditionally placed out of idle state.
[166769.850985] wr_reg3 = 00 INT_CONTROL
[166775.457470] wr_reg4 = 00 C0_RX_THRESH_EN
[166781.052009] wr_reg5 = 00 C0_RX_EN 这里为何不是0xFF呢?
[166786.646291] wr_reg6 = 00 C0_TX_EN 这里为何不是0xFF呢?
[166792.251753] wr_reg7 = 00 C0_MISC_EN
[166797.846377] wr_reg8 = 00
[166803.440318] wr_reg9 = 00
[166809.034686] wr_reg10 = 00
[166814.826601] wr_reg11 = 00
[166820.607593] wr_reg12 = 00
[166826.387390] wr_reg13 = 00
[166832.167785] wr_reg14 = 00
[166837.959529] wr_reg15 = 00
[166843.740521] wr_reg16 = 00
[166849.520318] wr_reg17 = 00
[166855.312489] wr_reg18 = 00
[166861.092969] wr_reg19 = 00
[166866.873363] wr_reg20 = 00
[166872.653075] wr_reg21 = 00
[166878.445246] wr_reg22 = 00
[166884.225726] wr_reg23 = 00
[166890.006121] wr_reg24 = 00
[166895.797779] wr_reg25 = 00
[166901.578942] wr_reg26 = 00
[166907.358739] wr_reg27 = 00
[166913.139049] wr_reg28 = 00
[166918.931049] wr_reg29 = 00
[166924.710505] show cpsw_hw_stats:
[166931.608254] hw_stats0 = 119fa1f Good Rx Frames
[166938.517182] hw_stats1 = 4c3 Broadcast Rx Frames
[166944.670569] hw_stats2 = 50 Multicast Rx Frames
[166950.636478] hw_stats3 = 00
[166956.614931] hw_stats4 = 00
[166962.581523] hw_stats5 = 00
[166968.547945] hw_stats6 = 00
[166974.513854] hw_stats7 = 00
[166980.492478] hw_stats8 = 00
[166986.458985] hw_stats9 = 00
[166992.425406] hw_stats10 = 00
[166998.589545] hw_stats11 = 00
[167004.743273] hw_stats12 = 6440c428 Good Tx Frames
[167012.012905] hw_stats13 = 116f267 Broadcast Tx Frames
[167019.107262] hw_stats14 = 03 Multicast Tx Frames
[167025.260990] hw_stats15 = 0e Pause Tx Frames
[167031.413011] hw_stats16 = 00
[167037.565630] hw_stats17 = 00
[167043.729598] hw_stats18 = 00
[167049.882729] hw_stats19 = 00
[167056.035177] hw_stats20 = 00
[167062.199230] hw_stats21 = 00
[167068.352361] hw_stats22 = 00
[167074.504382] hw_stats23 = 00 Tx Underrun
[167080.668777] hw_stats24 = 00 Carrier Sense Errors
[167086.822078] hw_stats25 = 5aff37ad Tx Octets
[167094.091369] hw_stats26 = 298
[167100.441790] hw_stats27 = c8176
[167107.153427] hw_stats28 = 328611
[167114.050579] hw_stats29 = 67dd49
[167120.959166] hw_stats30 = d44ecb
[167127.856915] hw_stats31 = b5c175
[167134.753726] hw_stats32 = bf5798db
[167142.034878] hw_stats33 = 334a Rx Start of Frame Overruns
[167148.560318] hw_stats34 = 00 Rx Middle of Frame Overruns
[167154.712766] hw_stats35 = 3409 Rx DMA Overruns
[167161.247251] show cpsw_host_regs:
[167168.332051] host_port_regs0 = 104 P0_MAX_BLKS
[167175.601598] host_port_regs1 = 52 P0_BLK_CNT
[167182.697150] host_port_regs2 = 140c0 P0_TX_IN_CTL
[167190.338409] host_port_regs3 = 00 P0_PORT_VLAN
[167197.422185] host_port_regs4 = 33221001 P0_TX_PRI_MAP
[167205.634067] host_port_regs5 = 76543210 P0_CPDMA_TX_PRI_MAP
[167213.833662] host_port_regs6 = 00 P0_CPDMA_RX_CH_MAP
[167220.916243] cpsw_ndo_tx_timeout: ==============================
[167233.790995] cpsw cpsw: CPDMA: state: active
[167242.552083] cpsw cpsw: CPDMA: txidver: 180108
[167252.060179] cpsw cpsw: CPDMA: txcontrol: 1
[167261.010622] cpsw cpsw: CPDMA: txteardown: 0
[167270.146067] cpsw cpsw: CPDMA: rxidver: 180108
[167279.653907] cpsw cpsw: CPDMA: rxcontrol: 1
[167288.604521] cpsw cpsw: CPDMA: softreset: 0 CPDMA_SOFT_RESET: If a zero is read then reset has occurred.
[167297.554622] cpsw cpsw: CPDMA: rxteardown: 0
[167306.690238] cpsw cpsw: CPDMA: txintstatraw: 1
[167316.197993] cpsw cpsw: CPDMA: txintstatmasked: 0
[167326.264339] cpsw cpsw: CPDMA: txintmaskset: 0
[167335.772606] cpsw cpsw: CPDMA: txintmaskclear: 0
[167345.652414] cpsw cpsw: CPDMA: macinvector: 0
[167354.974569] cpsw cpsw: CPDMA: maceoivector: 2
[167364.482835] cpsw cpsw: CPDMA: rxintstatraw: 1
[167373.990078] cpsw cpsw: CPDMA: rxintstatmasked: 0
[167384.056254] cpsw cpsw: CPDMA: rxintmaskset: 0
[167393.564435] cpsw cpsw: CPDMA: rxintmaskclear: 0
[167403.444243] cpsw cpsw: CPDMA: dmaintstatraw: 1
[167413.138963] cpsw cpsw: CPDMA: dmaintstatmasked: 0
[167423.390825] cpsw cpsw: CPDMA: dmaintmaskset: 0
[167433.084435] cpsw cpsw: CPDMA: dmaintmaskclear: 0
[167443.156499] cpsw cpsw: channel 0 (tx 0) state active
[167453.966611] cpsw cpsw: hdp: 0
[167461.056446] cpsw cpsw: cp: 4a103c60
[167468.886633] cpsw cpsw: stats head_enqueue: 1585846
[167479.507134] cpsw cpsw: stats tail_enqueue: 16695217
[167490.313747] cpsw cpsw: stats pad_enqueue: 0
[167499.631038] cpsw cpsw: stats misqueued: 353650
[167509.506835] cpsw cpsw: stats desc_alloc_fail: 101605
[167520.499134] cpsw cpsw: stats pad_alloc_fail: 0
[167530.375529] cpsw cpsw: stats runt_receive_buff: 0
[167540.809406] cpsw cpsw: stats runt_transmit_buff: 17
[167551.616105] cpsw cpsw: stats empty_dequeue: 2761329
[167562.422547] cpsw cpsw: stats busy_dequeue: 18292963
[167573.229929] cpsw cpsw: stats good_dequeue: 18280935
[167584.035945] cpsw cpsw: stats requeue: 243271
[167593.538579] cpsw cpsw: stats teardown_dequeue: 0
[167603.789587] cpsw cpsw: channel 32 (rx 0) state active
[167614.409833] cpsw cpsw: hdp: 0
[167621.499241] cpsw cpsw: cp: 4a102660
[167629.328147] cpsw cpsw: rxfree: 0
[167636.598291] cpsw cpsw: stats head_enqueue: 1
[167646.102121] cpsw cpsw: stats tail_enqueue: 18469683
[167656.907795] cpsw cpsw: stats pad_enqueue: 0
[167666.224915] cpsw cpsw: stats misqueued: 0
[167675.170067] cpsw cpsw: stats desc_alloc_fail: 0
[167685.230867] cpsw cpsw: stats pad_alloc_fail: 0
[167695.106323] cpsw cpsw: stats runt_receive_buff: 0
[167705.540115] cpsw cpsw: stats runt_transmit_buff: 0
[167716.159934] cpsw cpsw: stats empty_dequeue: 0
[167725.849790] cpsw cpsw: stats busy_dequeue: 21054292
[167736.655806] cpsw cpsw: stats good_dequeue: 18469556
[167747.461225] cpsw cpsw: stats requeue: 0
[167756.034323] cpsw cpsw: stats teardown_dequeue: 0
根据上述寄存器分析,感觉是cpsw_wr_regs组中的c0_rx_en和c0_tx_en两个中断控制寄存器没有被使能导致的。反复分析cpsw.c源码,发现cpsw_poll()在退出NAPI轮询处理时应该已经打开了这两个寄存器的中断使能,怎么还会出现这两个寄存器为零的状况呢?此外,如果软件流程有问题,那么为何90M流量时就能稳定运行呢?所以感觉不像是纯软件的问题,莫非网卡硬件异常了吗?请帮忙分析下,谢谢!
Jian Zhou:
请先参考一下下面的guide:
http://processors.wiki.ti.com/index.php/AM335x_CPSW_(Ethernet)_Driver%27s_Guide#Interrupt_Pacing
如果你用的是百兆PHY,100M的数据量应该已经是极限了,为什么不让AM335x工作在千兆模式?
xisheng liu:
回复 Jian Zhou:
我们的产品只需要百兆接口,所以不用千兆口。我的问题是,为何在百兆情况下网卡会异常,这不是正常现象啊!我们其他的百兆设备在百兆流量压力下,即使发生丢包,也不会产生停止收发包的问题,这个才是需要分析的点。
Room Hr:
回复 Jian Zhou:
遇到了同样的问题,请问这个问题有没有解决方案。