本文章是结合韦东山老师视频以及网上博客所写,若有一样的地方,请见谅,来源JasonTian
驱动调试方法
法一:内核打印函数printk
一: 打印原理
UBOOT传入console=ttySAC0 console=tty1
1. 内核处理UBOOT传入的参数
console_setup
add_preferred_console // 我想用名为"ttySAC0"的控制台,先记录下来
2. 硬件驱动的入口函数里:
drivers/serial/s3c2410.c
register_console(&s3c24xx_serial_console);
3. printk
vprintk
/* Emit the output into the temporary buffer */
// 先把输出信息放入临时BUFFER
vscnprintf
// Copy the output into log_buf.
// 把临时BUFFER里的数据稍作处理,再写入log_buf
// 比如printk("abc")会得到"<4>abc", 再写入log_buf
// 可以用dmesg命令把log_buf里的数据打印出来重现内核的输出信息
// 调用硬件的write函数输出
release_console_sem();
call_console_drivers(_con_start, _log_end);
// 从log_buf得到数据,算出打印级别
_call_console_drivers(start_print, cur_index, msg_level);
// 如果打印级别够格(数值小于console_loglevel),才可以打印
if ((msg_log_level < console_loglevel
__call_console_drivers
con->write(con, &LOG_BUF(start), end - start);
二:printk函数的打印级别
printk的打印级别为0~7,在Linux内核代码中:include/linux/kernel.h中有以下参数:
#define console_loglevel (console_printk[0])
#define default_message_loglevel (console_printk[1])
#define minimum_console_loglevel (console_printk[2])
#define default_console_loglevel (console_printk[3])
具体的打印级别:
#define KERN_EMERG "<0>" /* system is unusable */
#define KERN_ALERT "<1>" /* action must be taken immediately */
#define KERN_CRIT "<2>" /* critical conditions */
#define KERN_ERR "<3>" /* error conditions */
#define KERN_WARNING "<4>" /* warning conditions */
#define KERN_NOTICE "<5>" /* normal but significant condition */
#define KERN_INFO "<6>" /* informational */
#define KERN_DEBUG "<7>" /* debug-level messages */
// 如果打印级别够格(数值小于console_loglevel),才可以打印
if ((msg_log_level < console_loglevel
__call_console_drivers
con->write(con, &LOG_BUF(start), end - start);
- 对于printk("<n>...."),只有当n小于console_loglevel时,这个信息才会被打印;
- 如果default_message_loglevel的值等于4,且printk的参数开头没有“<n>”样式的字符,则在printk函数中会自动加上“<4>”;
- minimum_console_loglevel是一个预设值,平时不起作用,当通过其他方式来设置console_loglevel的值时,这个值不能小于minimum_console_loglevel;
- default_console_loglevel是一个预设值,平时不起作用,它表示设置console_loglevel时的默认值,通过其他某种方式来设置console_loglevel的值时会用到这个值;
三:在用户空间修改printk函数的打印级别
- 当挂接proc虚拟文件系统之后,读取/proc/sys/kernel/printk文件就可以得到console_loglevel、default_message_loglevel、minimum_console_loglevel以及default_console_loglevel各自的值; 如:
cat /proc/sys/kernel/printk
7 4 1 7
//console_loglevel=7,default_message_loglevel=4
//minimum_console_loglevel=1,default_console_loglevel=7
/* 可以使用下面的命令修改console_loglevel=2,这样只有级别小于2(即KERN_EMERG、KERN_ALERT)的printk信息才会被打印到控制台,其余信息都不会被打印 */
echo "2 4 1 7" > /proc/sys/kernel/printk
使用dmesg命令将log_buf[]信息全部打印出来。
四:uboot传递参数给内核修改打印级别
法二:查看Oops信息及栈回溯
一:使能栈回溯
Unable to handle kernel paging request at virtual address 56000050
pgd = c3ca0000
[56000050] *pgd=00000000
Internal error: Oops: 5 [#1]
Modules linked in: jz2440_leds
CPU: 0 Not tainted (2.6.22.6 #4)
PC is at jz2440_led_drv_open+0x3c/0xd0 [jz2440_leds]
LR is at chrdev_open+0x14c/0x164
pc : [<bf00003c>] lr : [<c008d888>] psr: 80000013
sp : c073be88 ip : c073be98 fp : c073be94
r10: 00000000 r9 : c073a000 r8 : c04debe0
r7 : 00000000 r6 : 00000000 r5 : c3ea30c0 r4 : c06f06c0
r3 : 00000000 r2 : 56000050 r1 : bf000bc4 r0 : c3ea30c0
Flags: Nzcv IRQs on FIQs on Mode SVC_32 Segment user
Control: c000717f Table: 33ca0000 DAC: 00000015
Process ledtest (pid: 801, stack limit = 0xc073a258)
Stack: (0xc073be88 to 0xc073c000)
be80: c073bebc c073be98 c008d888 bf000010 00000000 c04debe0
bea0: c3ea30c0 c008d73c c0474da0 c3ee1dac c073bee4 c073bec0 c0089e48 c008d74c
bec0: c04debe0 c073bf04 00000003 ffffff9c c002c044 c3d06000 c073befc c073bee8
bee0: c0089f64 c0089d58 00000000 00000002 c073bf68 c073bf00 c0089fb8 c0089f40
bf00: c073bf04 c3ee1dac c0474da0 00000000 00000000 c3ca1000 00000101 00000001
bf20: 00000000 c073a000 c046d508 c046d500 ffffffe8 c3d06000 c073bf68 c073bf48
bf40: c008a16c c009fc70 00000003 00000000 c04debe0 00000002 00000004 c073bf94
bf60: c073bf6c c008a2f4 c0089f88 000085a0 becbced4 000086e8 0000874c 00000005
bf80: c002c044 4013365c c073bfa4 c073bf98 c008a3a8 c008a2b0 00000000 c073bfa8
bfa0: c002bea0 c008a394 becbced4 000086e8 becbcf92 00000002 00000004 becbcf92
bfc0: becbced4 000086e8 0000874c 00000003 000085a0 00000000 4013365c becbcea8
bfe0: 00000000 becbce80 0000266c 400c98e0 60000010 becbcf92 00000000 00000000
Backtrace:
[<bf000000>] (jz2440_led_drv_open+0x0/0xd0 [jz2440_leds]) from [<c008d888>] (chrdev_open+0x14c/0x164)
[<c008d73c>] (chrdev_open+0x0/0x164) from [<c0089e48>] (__dentry_open+0x100/0x1e8)
r8:c3ee1dac r7:c0474da0 r6:c008d73c r5:c3ea30c0 r4:c04debe0
[<c0089d48>] (__dentry_open+0x0/0x1e8) from [<c0089f64>] (nameidata_to_filp+0x34/0x48)
[<c0089f30>] (nameidata_to_filp+0x0/0x48) from [<c0089fb8>] (do_filp_open+0x40/0x48)
r4:00000002
[<c0089f78>] (do_filp_open+0x0/0x48) from [<c008a2f4>] (do_sys_open+0x54/0xe4)
r5:00000004 r4:00000002
[<c008a2a0>] (do_sys_open+0x0/0xe4) from [<c008a3a8>] (sys_open+0x24/0x28)
[<c008a384>] (sys_open+0x0/0x28) from [<c002bea0>] (ret_fast_syscall+0x0/0x2c)
Code: bf00007c bf0000a0 e59f1090 e5912000 (e5923000)
Segmentation fault
通过上述Oops信息我们可以得到:
1.“Unable to handle kernel paging request at virtual address 56000050”:内核访问56000050地址时发生错误
2.“Modules linked in: jz2440_leds”:表明发生错误的模块名称
3.发生错误时各寄存器的值,如下:
pc : [<bf00003c>] lr : [<c008d888>] psr: 80000013
sp : c073be88 ip : c073be98 fp : c073be94
r10: 00000000 r9 : c073a000 r8 : c04debe0
r7 : 00000000 r6 : 00000000 r5 : c3ea30c0 r4 : c06f06c0
r3 : 00000000 r2 : 56000050 r1 : bf000bc4 r0 : c3ea30c0
4.stack信息:
Stack: (0xc073be88 to 0xc073c000)
be80: c073bebc c073be98 c008d888 bf000010 00000000 c04debe0
bea0: c3ea30c0 c008d73c c0474da0 c3ee1dac c073bee4 c073bec0 c0089e48 c008d74c
bec0: c04debe0 c073bf04 00000003 ffffff9c c002c044 c3d06000 c073befc c073bee8
bee0: c0089f64 c0089d58 00000000 00000002 c073bf68 c073bf00 c0089fb8 c0089f40
bf00: c073bf04 c3ee1dac c0474da0 00000000 00000000 c3ca1000 00000101 00000001
bf20: 00000000 c073a000 c046d508 c046d500 ffffffe8 c3d06000 c073bf68 c073bf48
bf40: c008a16c c009fc70 00000003 00000000 c04debe0 00000002 00000004 c073bf94
bf60: c073bf6c c008a2f4 c0089f88 000085a0 becbced4 000086e8 0000874c 00000005
bf80: c002c044 4013365c c073bfa4 c073bf98 c008a3a8 c008a2b0 00000000 c073bfa8
bfa0: c002bea0 c008a394 becbced4 000086e8 becbcf92 00000002 00000004 becbcf92
bfc0: becbced4 000086e8 0000874c 00000003 000085a0 00000000 4013365c becbcea8
bfe0: 00000000 becbce80 0000266c 400c98e0 60000010 becbcf92 00000000 00000000
5.栈回溯信息,可以看到函数调用,可以看到是jz2440_led_drv_open函数里面发生错误
Backtrace:
[<bf000000>] (jz2440_led_drv_open+0x0/0xd0 [jz2440_leds]) from [<c008d888>] (chrdev_open+0x14c/0x164)
[<c008d73c>] (chrdev_open+0x0/0x164) from [<c0089e48>] (__dentry_open+0x100/0x1e8)
r8:c3ee1dac r7:c0474da0 r6:c008d73c r5:c3ea30c0 r4:c04debe0
[<c0089d48>] (__dentry_open+0x0/0x1e8) from [<c0089f64>] (nameidata_to_filp+0x34/0x48)
[<c0089f30>] (nameidata_to_filp+0x0/0x48) from [<c0089fb8>] (do_filp_open+0x40/0x48)
r4:00000002
[<c0089f78>] (do_filp_open+0x0/0x48) from [<c008a2f4>] (do_sys_open+0x54/0xe4)
r5:00000004 r4:00000002
[<c008a2a0>] (do_sys_open+0x0/0xe4) from [<c008a3a8>] (sys_open+0x24/0x28)
[<c008a384>] (sys_open+0x0/0x28) from [<c002bea0>] (ret_fast_syscall+0x0/0x2c)
6.出错指令附近的指令机器码,比如(出错指令在小括号内)
Code: bf00007c bf0000a0 e59f1090 e5912000 (e5923000)
一:分析Oops信息
- 若内核没有配置CONFIG_FRAME_POINTER,那么将不会有栈回溯信息显示;这里内核已经配置,以下是打印出的部分栈回溯信息:
[<bf000000>] (jz2440_led_drv_open+0x0/0xd0
[jz2440_leds]) from [<c008d888>]
(chrdev_open+0x14c/0x164)
这里包含以下信息:
- 表示后面的chrdev_open函数调用前面的jz2440_led_drv_open函数,前面“bf000000”是jz2440_led_drv_open函数首地址偏移0的地址,这个函数大小为0xd0。后面部分:“c008d888”是chrdev_open函数首地址偏移0x14c的地址,且为jz2440_led_drv_open执行后的返回地址,这个函数大小为0x164;
- 根据前面栈回溯信息,我们可以得到函数调用过程
ret_fast_syscall ->
sys_open ->
do_sys_open ->
do_filp_open ->
nameidata_to_filp ->
__dentry_open ->
chrdev_open ->
jz2440_led_drv_open
- 根据PC寄存器确定出错位置
PC is at jz2440_led_drv_open+0x3c/0xd0 [jz2440_leds]
LR is at chrdev_open+0x14c/0x164
pc : [<bf00003c>] lr : [<c008d888>] psr: 80000013
"PC is at jz2440_led_drv_open+0x3c/0xd0"表示出错指令为jz2440_led_drv_open函数中偏移为0x3c的指令; “pc : [<bf00003c>]”表示出错指令的地址为0xbf00003c;
二:未使能栈回溯
- 确定出错地址属于内核还是外加模块(这里属于外加模块)
- 查看内核编译之后源码目录下System.map文件来确定内核的函数的地址范围为c0004000~c03cecb4,所以0xbf00003c不属于内核而是外加模块的
- 查看cat /proc/kallsyms(所有内核函数,加载函数的地址),找到一个相近的地址来确定属于哪个模块,由于文件较大,可以使用cat /proc/kallsyms > kallsyms.txt,导出到kallsyms.txt查看;可以找到“bf000000 t jz2440_led_drv_open [jz2440_leds]”,可以确定是在jz2440_leds.ko这个模块中;
- 找到对应的jz2440_leds.ko文件,反汇编"arm-linux-objdump -D jz2440_leds.ko > jz2440_leds.dis",查看反汇编文件找到对应的函数,再根据目前的PC指代的地址和找到的相近的地址算出偏移值,最终可定位到哪条语句发生了错误;
- 编入内核
- 查看出错地址属于内核还是外部模块
- 根据栈信息给出的pc值确定出错位置
- 反汇编原始的内核文件(vmlinux),在反汇编文件中查找pc值对应的指令
例子:
Modules linked in:
CPU: 0 Not tainted (2.6.22.6 #2)
PC is at first_drv_open+0x18/0x3c
LR is at chrdev_open+0x14c/0x164
pc : [<c014e6c0>] lr : [<c008638c>] psr: a0000013
sp : c3a03e88 ip : c3a03e98 fp : c3a03e94
r10: 00000000 r9 : c3a02000 r8 : c03f3c60
r7 : 00000000 r6 : 00000000 r5 : c38a0c50 r4 : c3c1e780
r3 : c014e6a8 r2 : 56000050 r1 : c031a47c r0 : 00000000
Flags: NzCv IRQs on FIQs on Mode SVC_32 Segment user
Control: c000717f Table: 339f0000 DAC: 00000015
Process firstdrvtest (pid: 750, stack limit = 0xc3a02258)
1. 根据pc值确定该指令属于内核还是外加的模块
pc=c014e6c0 属于内核(看System.map)
2. 反汇编内核: arm-linux-objdump -D vmlinux > vmlinux.dis
在dis文件里搜c014e6c0
c014e6a8 <first_drv_open>:
c014e6a8: e1a0c00d mov ip, sp
c014e6ac: e92dd800 stmdb sp!, {fp, ip, lr, pc}
c014e6b0: e24cb004 sub fp, ip, #4 ; 0x4
c014e6b4: e59f1024 ldr r1, [pc, #36] ; c014e6e0 <.text+0x1276e0>
c014e6b8: e3a00000 mov r0, #0 ; 0x0
c014e6bc: e5912000 ldr r2, [r1]
c014e6c0: e5923000 ldr r3, [r2] // 在此出错 r2=56000050
3. 根据栈信息分析函数调用过程
# ./firstdrvtest on
Unable to handle kernel paging request at virtual address 56000050
pgd = c3e78000
[56000050] *pgd=00000000
Internal error: Oops: 5 [#1]
Modules linked in: first_drv
CPU: 0 Not tainted (2.6.22.6 #48)
PC is at first_drv_open+0x18/0x3c [first_drv]
LR is at chrdev_open+0x14c/0x164
pc : [<bf000018>] lr : [<c008c888>] psr: a0000013
3.1 根据PC确定出错位置
bf000018 属于 insmod的模块
bf000000 t first_drv_open [first_drv]
3.2 确定它属于哪个函数
反汇编first_drv.ko
法三:修改内核来定位系统僵死问题
原理:因为系统时钟中断是不断产生的,可以在代码中搜索Timer_Tick找到系统时钟中断处理函数;这里找到asm_do_IRQ函数,在其中添加如下代码:
/*如果10s之内都是同一个进程运行,就打印*/
static pid_t pre_pid;
static int cnt = 0;
if(pre_pid == current->pid)
{
cnt++;
}
else
{
cnt=0;
pre_pid = current->pid;
}
if(cnt == 10 * HZ)
{
cnt = 0;
printk("pc=%08x\n",regs->ARM_pc);
}
./firstdrvtest on
asm_do_IRQ => s3c2410_timer_interrupt : pid = 752, task name = firstdrvtest
pc = bf000084
asm_do_IRQ => s3c2410_timer_interrupt : pid = 752, task name = firstdrvtest
pc = bf000084 // 对于中断, pc-4才是发生中断瞬间的地址
看System.map确定地址属于内核还是外部模块,查看/proc/kallsyms找到与pc相近的地址,再反汇编模块得到first_drv.dis进行查看
看/proc/kallsyms
first_drv.dis
00000000 <first_drv_open>: bf000000 t first_drv_open [first_drv]
00000000 <first_drv_open>: bf000000 t first_drv_open [first_drv]
0000003c <first_drv_write>:
3c: e1a0c00d mov ip, sp
40: e92dd800 stmdb sp!, {fp, ip, lr, pc}
44: e24cb004 sub fp, ip, #4 ; 0x4
48: e24dd004 sub sp, sp, #4 ; 0x4
4c: e3cd3d7f bic r3, sp, #8128 ; 0x1fc0
50: e3c3303f bic r3, r3, #63 ; 0x3f
54: e5933008 ldr r3, [r3, #8]
58: e0910002 adds r0, r1, r2
5c: 30d00003 sbcccs r0, r0, r3
60: 33a03000 movcc r3, #0 ; 0x0
64: e3530000 cmp r3, #0 ; 0x0
68: e24b0010 sub r0, fp, #16 ; 0x10
6c: 1a00001c bne e4 <init_module+0x5c>
70: ebfffffe bl 70 <first_drv_write+0x34>
74: ea00001f b f8 <init_module+0x70>
78: e3520000 cmp r2, #0 ; 0x0
7c: 11a01002 movne r1, r2
80: 1bfffffe blne 80 <first_drv_write+0x44> // 卡死的地方
84: ea00001f b 108 <init_module+0x80>
本文使用 mdnice 排版