驱动调试方法

681 阅读15分钟

本文章是结合韦东山老师视频以及网上博客所写,若有一样的地方,请见谅,来源JasonTian

驱动调试方法

法一:内核打印函数printk

一: 打印原理

UBOOT传入console=ttySAC0 console=tty1
1. 内核处理UBOOT传入的参数
console_setup
 add_preferred_console // 我想用名为"ttySAC0"的控制台,先记录下来

2. 硬件驱动的入口函数里:
 drivers/serial/s3c2410.c
  register_console(&s3c24xx_serial_console);  

3. printk
  vprintk
   /* Emit the output into the temporary buffer */
   // 先把输出信息放入临时BUFFER
   vscnprintf
   
   // Copy the output into log_buf.
   // 把临时BUFFER里的数据稍作处理,再写入log_buf
   // 比如printk("abc")会得到"<4>abc", 再写入log_buf
   // 可以用dmesg命令把log_buf里的数据打印出来重现内核的输出信息
   
   
   // 调用硬件的write函数输出
   release_console_sem();
   call_console_drivers(_con_start, _log_end);
   // 从log_buf得到数据,算出打印级别
   _call_console_drivers(start_print, cur_index, msg_level);   
   // 如果打印级别够格,则打印
   if ((msg_log_level < console_loglevel
   __call_console_drivers
   con->write(con, &LOG_BUF(start), end - start);

二:printk函数的打印级别

printk的打印级别为0~7,在Linux内核代码中:include/linux/kernel.h中有以下参数:

#define console_loglevel (console_printk[0])
#define default_message_loglevel (console_printk[1])
#define minimum_console_loglevel (console_printk[2])
#define default_console_loglevel (console_printk[3])

具体的打印级别:

#define KERN_EMERG  "<0>" /* system is unusable   */
#define KERN_ALERT  "<1>" /* action must be taken immediately */
#define KERN_CRIT  "<2>" /* critical conditions   */
#define KERN_ERR  "<3>" /* error conditions   */
#define KERN_WARNING "<4>" /* warning conditions   */
#define KERN_NOTICE  "<5>" /* normal but significant condition */
#define KERN_INFO  "<6>" /* informational   */
#define KERN_DEBUG  "<7>" /* debug-level messages   */
// 如果打印级别够格,则打印
if ((msg_log_level < console_loglevel
  __call_console_drivers
 con->write(con, &LOG_BUF(start), end - start);
  1. 对于printk("<n>...."),只有当n小于console_loglevel时,这个信息才会被打印
  2. 假设default_message_loglevel的值等于4,如果printk的参数开头没有“<n>”样式的字符,则在printk函数中会自动加上“<4>”
  3. minimum_console_loglevel是一个预设值,平时不起作用,当通过其他方式来设置console_loglevel的值时,console_loglevel的值不能小于minimum_console_loglevel;
  4. default_console_loglevel是一个预设值,平时不起作用,它表示设置console_loglevel时的默认值,通过其他某种方式来设置console_loglevel的值时会用到这个值;

三:在用户空间修改printk函数的打印级别

  • 当挂接proc虚拟文件系统之后,读取/proc/sys/kernel/printk文件就可以得到console_loglevel、default_message_loglevel、minimum_console_loglevel以及default_console_loglevel各自的值; 如:
 cat /proc/sys/kernel/printk
7       4       1       7   
//console_loglevel=7,default_message_loglevel=4
//minimum_console_loglevel=1,default_console_loglevel=7
/* 可以使用下面的命令修改console_loglevel=2,这样只有级别小于2(即KERN_EMERG、KERN_ALERT)的printk信息才会在控制台上打印,其余信息都不会被打印 */
echo "2 4 1 7" > /proc/sys/kernel/printk 

使用dmesg命令可以将log_buf[]中的信息全部打印出来。

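四:uboot传递参数给内核修改打印级别

在uboot的bootargs中加入“loglevel=<级别>”,内核启动时就会用它来设置console_loglevel,例如:
set bootargs console=ttySAC0 root=/dev/mtdblock3 loglevel=0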

法二:查看Oops信息及栈回溯

一:使能栈回溯

Unable to handle kernel paging request at virtual address 56000050
pgd = c3ca0000
[56000050] *pgd=00000000
Internal error: Oops: 5 [#1]
Modules linked in: jz2440_leds
CPU: 0    Not tainted  (2.6.22.6 #4)
PC is at jz2440_led_drv_open+0x3c/0xd0 [jz2440_leds]
LR is at chrdev_open+0x14c/0x164
pc : [<bf00003c>]    lr : [<c008d888>]    psr: 80000013
sp : c073be88  ip : c073be98  fp : c073be94
r10: 00000000  r9 : c073a000  r8 : c04debe0
r7 : 00000000  r6 : 00000000  r5 : c3ea30c0  r4 : c06f06c0
r3 : 00000000  r2 : 56000050  r1 : bf000bc4  r0 : c3ea30c0
Flags: Nzcv  IRQs on  FIQs on  Mode SVC_32  Segment user
Control: c000717f  Table: 33ca0000  DAC: 00000015
Process ledtest (pid: 801, stack limit = 0xc073a258)
Stack: (0xc073be88 to 0xc073c000)
be80:  c073bebc c073be98 c008d888 bf000010 00000000 c04debe0
bea0: c3ea30c0 c008d73c c0474da0 c3ee1dac c073bee4 c073bec0 c0089e48 c008d74c
bec0: c04debe0 c073bf04 00000003 ffffff9c c002c044 c3d06000 c073befc c073bee8
bee0: c0089f64 c0089d58 00000000 00000002 c073bf68 c073bf00 c0089fb8 c0089f40
bf00: c073bf04 c3ee1dac c0474da0 00000000 00000000 c3ca1000 00000101 00000001
bf20: 00000000 c073a000 c046d508 c046d500 ffffffe8 c3d06000 c073bf68 c073bf48
bf40: c008a16c c009fc70 00000003 00000000 c04debe0 00000002 00000004 c073bf94
bf60: c073bf6c c008a2f4 c0089f88 000085a0 becbced4 000086e8 0000874c 00000005
bf80: c002c044 4013365c c073bfa4 c073bf98 c008a3a8 c008a2b0 00000000 c073bfa8
bfa0: c002bea0 c008a394 becbced4 000086e8 becbcf92 00000002 00000004 becbcf92
bfc0: becbced4 000086e8 0000874c 00000003 000085a0 00000000 4013365c becbcea8
bfe0: 00000000 becbce80 0000266c 400c98e0 60000010 becbcf92 00000000 00000000
Backtrace:
[<bf000000>] (jz2440_led_drv_open+0x0/0xd0 [jz2440_leds]) from [<c008d888>] (chrdev_open+0x14c/0x164)
[<c008d73c>] (chrdev_open+0x0/0x164) from [<c0089e48>] (__dentry_open+0x100/0x1e8)
r8:c3ee1dac r7:c0474da0 r6:c008d73c r5:c3ea30c0 r4:c04debe0
[<c0089d48>] (__dentry_open+0x0/0x1e8) from [<c0089f64>] (nameidata_to_filp+0x34/0x48)
[<c0089f30>] (nameidata_to_filp+0x0/0x48) from [<c0089fb8>] (do_filp_open+0x40/0x48)
r4:00000002
[<c0089f78>] (do_filp_open+0x0/0x48) from [<c008a2f4>] (do_sys_open+0x54/0xe4)
r5:00000004 r4:00000002
[<c008a2a0>] (do_sys_open+0x0/0xe4) from [<c008a3a8>] (sys_open+0x24/0x28)
[<c008a384>] (sys_open+0x0/0x28) from [<c002bea0>] (ret_fast_syscall+0x0/0x2c)
Code: bf00007c bf0000a0 e59f1090 e5912000 (e5923000)
Segmentation fault 

通过上述Oops信息我们可以得到:

  1. Unable to handle kernel paging request at virtual address 56000050内核访问56000050地址时发生错误
  2. Modules linked in: jz2440_leds 列出了当前已加载的模块,这里表明发生错误的模块名称为jz2440_leds
  3. 发生错误各寄存器的值,如下:
pc : [<bf00003c>]    lr : [<c008d888>]    psr: 80000013
sp : c073be88  ip : c073be98  fp : c073be94
r10: 00000000  r9 : c073a000  r8 : c04debe0
r7 : 00000000  r6 : 00000000  r5 : c3ea30c0  r4 : c06f06c0
r3 : 00000000  r2 : 56000050  r1 : bf000bc4  r0 : c3ea30c0

4.stack信息:

Stack: (0xc073be88 to 0xc073c000)
be80:                   c073bebc c073be98 c008d888 bf000010 00000000 c04debe0
bea0: c3ea30c0 c008d73c c0474da0 c3ee1dac c073bee4 c073bec0 c0089e48 c008d74c
bec0: c04debe0 c073bf04 00000003 ffffff9c c002c044 c3d06000 c073befc c073bee8
bee0: c0089f64 c0089d58 00000000 00000002 c073bf68 c073bf00 c0089fb8 c0089f40
bf00: c073bf04 c3ee1dac c0474da0 00000000 00000000 c3ca1000 00000101 00000001
bf20: 00000000 c073a000 c046d508 c046d500 ffffffe8 c3d06000 c073bf68 c073bf48
bf40: c008a16c c009fc70 00000003 00000000 c04debe0 00000002 00000004 c073bf94
bf60: c073bf6c c008a2f4 c0089f88 000085a0 becbced4 000086e8 0000874c 00000005
bf80: c002c044 4013365c c073bfa4 c073bf98 c008a3a8 c008a2b0 00000000 c073bfa8
bfa0: c002bea0 c008a394 becbced4 000086e8 becbcf92 00000002 00000004 becbcf92
bfc0: becbced4 000086e8 0000874c 00000003 000085a0 00000000 4013365c becbcea8
bfe0: 00000000 becbce80 0000266c 400c98e0 60000010 becbcf92 00000000 00000000

5.栈回溯信息,可以看到函数调用,可以看到是jz2440_led_drv_open函数里面发生错误

Backtrace:
[<bf000000>] (jz2440_led_drv_open+0x0/0xd0 [jz2440_leds]) from [<c008d888>] (chrdev_open+0x14c/0x164)
[<c008d73c>] (chrdev_open+0x0/0x164) from [<c0089e48>] (__dentry_open+0x100/0x1e8)
 r8:c3ee1dac r7:c0474da0 r6:c008d73c r5:c3ea30c0 r4:c04debe0
[<c0089d48>] (__dentry_open+0x0/0x1e8) from [<c0089f64>] (nameidata_to_filp+0x34/0x48)
[<c0089f30>] (nameidata_to_filp+0x0/0x48) from [<c0089fb8>] (do_filp_open+0x40/0x48)
 r4:00000002
[<c0089f78>] (do_filp_open+0x0/0x48) from [<c008a2f4>] (do_sys_open+0x54/0xe4)
 r5:00000004 r4:00000002
[<c008a2a0>] (do_sys_open+0x0/0xe4) from [<c008a3a8>] (sys_open+0x24/0x28)
[<c008a384>] (sys_open+0x0/0x28) from [<c002bea0>] (ret_fast_syscall+0x0/0x2c)

6.出错指令附近的指令机器码,比如(出错指令在小括号内)

Code: bf00007c bf0000a0 e59f1090 e5912000 (e5923000)
一:分析Oops信息
  1. 若内核没有配置CONFIG_FRAME_POINTER,就不会显示栈回溯信息;这里内核已经配置,下面是打印出的部分栈回溯信息:
[<bf000000>] (jz2440_led_drv_open+0x0/0xd0 
[jz2440_leds]) from [<c008d888>]
(chrdev_open+0x14c/0x164) 

这里包含以下信息:

  1. 表示后面的chrdev_open函数调用前面的jz2440_led_drv_open函数,前面“bf000000”是jz2440_led_drv_open函数首地址偏移0的地址,这个函数大小为0xd0。后面部分:“c008d888”是chrdev_open函数首地址偏移0x14c的地址,且为jz2440_led_drv_open执行后的返回地址,这个函数大小为0x164;
  2. 根据前面栈回溯信息,我们可以得到函数调用过程
ret_fast_syscall ->
sys_open ->
do_sys_open ->
do_filp_open ->
nameidata_to_filp ->
__dentry_open ->
chrdev_open ->
jz2440_led_drv_open
  1. 根据PC寄存器确定出错位置
PC is at jz2440_led_drv_open+0x3c/0xd0 [jz2440_leds]
LR is at chrdev_open+0x14c/0x164
pc : [<bf00003c>]    lr : [<c008d888>]    psr: 80000013

"PC is at jz2440_led_drv_open+0x3c/0xd0"表示出错指令为jz2440_led_drv_open函数中偏移为0x3c的指令; “pc : [<bf00003c>]”表示出错指令的地址为0xbf00003c;

二:未使能栈回溯

  1. 确定出错地址属于内核还是外加模块(本例属于外加模块)
  • 查看内核编译之后源码目录下System.map文件来确定内核的函数的地址范围为c0004000~c03cecb4,所以0xbf00003c不属于内核而是外加模块的
  • 查看cat /proc/kallsyms(所有内核函数,加载函数的地址),找到一个相近的地址来确定属于哪个模块,由于文件较大,可以使用cat /proc/kallsyms > kallsyms.txt,导出到kallsyms.txt查看;可以找到“bf000000 t jz2440_led_drv_open [jz2440_leds]”,可以确定是在jz2440_leds.ko这个模块中;
  • 找到对应的jz2440_leds.ko文件,反汇编"arm-linux-objdump -D jz2440_leds.ko > jz2440_leds.dis",查看反汇编文件找到对应的函数,再根据目前的PC指代的地址和找到的相近的地址算出偏移值,最终可定位到哪条语句发生了错误;
  1. 编入内核
  • 查看出错地址属于内核还是外部模块
  • 根据栈信息给出的pc值确定出错位置
  • 反汇编原始的内核文件(vmlinux),在反汇编文件中搜索pc值,定位出错的指令。

例子:

Modules linked in:
CPU: 0    Not tainted  (2.6.22.6 #2)
PC is at first_drv_open+0x18/0x3c
LR is at chrdev_open+0x14c/0x164
pc : [<c014e6c0>]    lr : [<c008638c>]    psr: a0000013
sp : c3a03e88  ip : c3a03e98  fp : c3a03e94
r10: 00000000  r9 : c3a02000  r8 : c03f3c60
r7 : 00000000  r6 : 00000000  r5 : c38a0c50  r4 : c3c1e780
r3 : c014e6a8  r2 : 56000050  r1 : c031a47c  r0 : 00000000
Flags: NzCv  IRQs on  FIQs on  Mode SVC_32  Segment user
Control: c000717f  Table: 339f0000  DAC: 00000015
Process firstdrvtest (pid: 750, stack limit = 0xc3a02258)

1. 根据pc值确定该指令属于内核还是外加的模块
pc=c014e6c0 属于内核(看System.map)

2. 反汇编内核: arm-linux-objdump -D vmlinux > vmlinux.dis
在dis文件里搜c014e6c0
c014e6a8 <first_drv_open>:
c014e6a8:       e1a0c00d        mov     ip, sp
c014e6ac:       e92dd800        stmdb   sp!, {fp, ip, lr, pc}
c014e6b0:       e24cb004        sub     fp, ip, #4      ; 0x4
c014e6b4:       e59f1024        ldr     r1, [pc, #36]   ; c014e6e0 <.text+0x1276e0>
c014e6b8:       e3a00000        mov     r0, #0  ; 0x0
c014e6bc:       e5912000        ldr     r2, [r1]
c014e6c0:       e5923000        ldr     r3, [r2] // 在此出错 r2=56000050

3. 根据栈信息分析函数调用过程
# ./firstdrvtest on
Unable to handle kernel paging request at virtual address 56000050
pgd = c3e78000
[56000050] *pgd=00000000
Internal error: Oops: 5 [#1]
Modules linked in: first_drv
CPU: 0    Not tainted  (2.6.22.6 #48)
PC is at first_drv_open+0x18/0x3c [first_drv]
LR is at chrdev_open+0x14c/0x164
pc : [<bf000018>]    lr : [<c008c888>]    psr: a0000013

3.1 根据PC确定出错位置
bf000018 属于 insmod的模块
bf000000 t first_drv_open       [first_drv]

3.2 确定它属于哪个函数
反汇编first_drv.ko

法三:修改内核来定位系统僵死问题

原理:即使系统僵死,系统时钟中断仍然在不断产生,因此可以在中断处理过程中打印被中断进程的信息。在代码中搜索Timer_Tick可以找到系统时钟中断处理函数;这里选择在asm_do_IRQ函数中添加如下代码:

/*如果10s之内都是同一个进程运行,就打印*/
static pid_t pre_pid;
static int cnt = 0;
if(pre_pid == current->pid)
{
  cnt++;
}
else
{
  cnt=0;
  pre_pid = current->pid;
}
if(cnt == 10 * HZ)
{
 cnt = 0;
printk("pc=%08x\n",regs->ARM_pc);
}
./firstdrvtest on 
asm_do_IRQ => s3c2410_timer_interrupt : pid = 752, task name = firstdrvtest
pc = bf000084
asm_do_IRQ => s3c2410_timer_interrupt : pid = 752, task name = firstdrvtest
pc = bf000084   // 对于中断, pc-4才是发生中断瞬间的地址

先查看System.map确定该地址属于内核还是外部模块,再查看/proc/kallsyms找到与pc相近的地址,最后反汇编模块得到first_drv.dis来定位卡死的位置

看/proc/kallsyms    
first_drv.dis
00000000 <first_drv_open>:                     bf000000 t first_drv_open [first_drv]      
00000000 <first_drv_open>:                     bf000000 t first_drv_open [first_drv]         
0000003c <first_drv_write>:
  3c: e1a0c00d  mov ip, sp
  40: e92dd800  stmdb sp!, {fp, ip, lr, pc}
  44: e24cb004  sub fp, ip, #4 ; 0x4
  48: e24dd004  sub sp, sp, #4 ; 0x4
  4c: e3cd3d7f  bic r3, sp, #8128 ; 0x1fc0
  50: e3c3303f  bic r3, r3, #63 ; 0x3f
  54: e5933008  ldr r3, [r3, #8]
  58: e0910002  adds r0, r1, r2
  5c: 30d00003  sbcccs r0, r0, r3
  60: 33a03000  movcc r3, #0 ; 0x0
  64: e3530000  cmp r3, #0 ; 0x0
  68: e24b0010  sub r0, fp, #16 ; 0x10
  6c: 1a00001c  bne e4 <init_module+0x5c>
  70: ebfffffe  bl 70 <first_drv_write+0x34>
  74: ea00001f  b f8 <init_module+0x70>
  78: e3520000  cmp r2, #0 ; 0x0
  7c: 11a01002  movne r1, r2
  80: 1bfffffe  blne 80 <first_drv_write+0x44>       // 卡死的地方
  84: ea00001f  b 108 <init_module+0x80>

本文使用 mdnice 排版