socket()函数概述
为了建立socket,程序可以调用socket()函数,该函数返回一个文件描述符,原型为:
int socket(int domain, int type, int protocol);
domain:指定协议簇,常见的有AF_INET, AF_INET6, AF_UNIX/AF_LOCAL, AF_NETLINK等。
type:指定类型,常见的有SOCK_STREAM、SOCK_DGRAM、SOCK_RAW等。
protocol:指定传输协议,如TCP/UDP等,不同协议簇都对应了默认的协议,可以填充 0 使用默认值。
glibc/sysdeps/unix/sysv/linux/i386/socket.S(i386平台上socket()的汇编实现)
#include <sysdep-cancel.h>
#include <socketcall.h>
#include <tls.h>
/* Two-level token paste: P hands its arguments to P2, which joins them
   with ##.  See the note next to the SOCKOP_ use below for why the
   extra level is wanted. */
#define P(a, b) P2(a, b)
#define P2(a, b) a##b
.text
/* Choose the name of the entry point unless the includer already set it:
   __socket (built with P) when weak aliases are available, plain socket
   when NO_WEAK_ALIAS is defined. */
#ifndef __socket
# ifndef NO_WEAK_ALIAS
# define __socket P(__,socket)
# else
# define __socket socket
# endif
#endif
.globl __socket
cfi_startproc
ENTRY (__socket)
/* When cancellation support is compiled in, test SINGLE_THREAD_P and
   branch to the cancellation-aware path at local label 1 if the test
   comes out non-zero; otherwise fall through to the plain path.
   NOTE(review): presumably non-zero means "more than one thread" --
   confirm against the SINGLE_THREAD_P definition. */
#if defined NEED_CANCELLATION && defined CENABLE
SINGLE_THREAD_P
jne 1f
#endif
/* Save registers. */
/* %ebx is about to be overwritten with the subcode, so park its value
   in %edx; it is copied back right after the trap. */
movl %ebx, %edx
/* Unwind info for the move above (register 3 now held in register 2). */
cfi_register (3, 2)
movl $SYS_ify(socketcall), %eax /* System call number in %eax. */
/* Use ## so `socket' is a separate token that might be #define'd. */
movl $P(SOCKOP_,socket), %ebx /* Subcode is first arg to syscall. */
/* Nothing has been pushed on this path, so the address passed is
   4(%esp). */
lea 4(%esp), %ecx /* Address of args is 2nd arg. */
/* Do the system call trap. */
ENTER_KERNEL
/* Restore registers. */
movl %edx, %ebx
cfi_restore (3)
/* %eax is < 0 if there was an error. */
/* jae is an unsigned comparison, so only results in the range
   -125 .. -1 take the error branch. */
cmpl $-125, %eax
jae SYSCALL_ERROR_LABEL
/* Successful; return the syscall's value. */
L(pseudo_end):
ret
/* Cancellation-aware variant of the same sequence, reached through the
   jne above. */
#if defined NEED_CANCELLATION && defined CENABLE
/* We need one more register. */
1: pushl %esi
cfi_adjust_cfa_offset(4)
/* Enable asynchronous cancellation. */
CENABLE
/* Keep the value CENABLE left in %eax; it is swapped back into %eax
   just before CDISABLE below. */
movl %eax, %esi
cfi_offset(6, -8) /* %esi */
/* Save registers. */
movl %ebx, %edx
cfi_register (3, 2)
movl $SYS_ify(socketcall), %eax /* System call number in %eax. */
/* Use ## so `socket' is a separate token that might be #define'd. */
movl $P(SOCKOP_,socket), %ebx /* Subcode is first arg to syscall. */
/* 8 instead of 4: the pushl %esi above moved %esp down by four bytes. */
lea 8(%esp), %ecx /* Address of args is 2nd arg. */
/* Do the system call trap. */
ENTER_KERNEL
/* Restore registers. */
movl %edx, %ebx
cfi_restore (3)
/* Restore the cancellation. */
/* After the swap %esi holds the syscall result and %eax holds the value
   saved from CENABLE.
   NOTE(review): presumably CDISABLE takes that value in %eax -- confirm
   against the macro definition. */
xchgl %esi, %eax
CDISABLE
/* Restore registers. */
/* Move the syscall result back into %eax, then undo the push of %esi. */
movl %esi, %eax
popl %esi
cfi_restore (6)
cfi_adjust_cfa_offset(-4)
/* %eax is < 0 if there was an error. */
/* Same unsigned range test as on the plain path. */
cmpl $-125, %eax
jae SYSCALL_ERROR_LABEL
/* Successful; return the syscall's value. */
ret
#endif
cfi_endproc
PSEUDO_END (__socket)
/* Export the public name socket as a weak alias of __socket. */
#ifndef NO_WEAK_ALIAS
weak_alias (__socket, socket)
#endif
对于i386系统内核来说,ENTER_KERNEL宏定义为
#define ENTER_KERNEL int $0x80
#define SYS_ify(syscall_name) __NR_##syscall_name
movl $SYS_ify(socketcall), %eax /* System call number in %eax. */
这一句会被替换为
movl $__NR_socketcall, %eax
综上所述,程序将系统调用号__NR_socketcall放入eax寄存器,将SOCKOP_socket(socket()在socketcall中对应的子调用号)放入ebx寄存器;代码lea 4(%esp), %ecx则是将调用socket()时栈上参数的起始地址保存到ecx寄存器中。
接下来执行int $0x80指令陷入内核,进入系统调用的处理流程。
Linux/arch/x86/kernel/entry_32.S
# system call handler stub
/* Excerpt of the entry path: save state, divert to the tracing path if
   any of the tested thread flags is set, range-check the syscall number
   in %eax, then call the handler through sys_call_table. */
ENTRY(system_call)
RING0_INT_FRAME # can't unwind into user space anyway
pushl %eax # save orig_eax
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
GET_THREAD_INFO(%ebp)
# system call tracing in operation / emulation
/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
jnz syscall_trace_entry
/* Unsigned compare: any %eax at or above nr_syscalls goes to
   syscall_badsys instead of indexing the table. */
cmpl $(nr_syscalls), %eax
jae syscall_badsys
syscall_call:
/* Indirect call through the table entry at index %eax; the scale of 4
   matches the .long entries of the table. */
call *sys_call_table(,%eax,4)
movl %eax,PT_EAX(%esp) # store the return value
system_call()最终使用汇编call指令,调用系统调用表sys_call_table中以%eax为下标的函数指针;此处%eax为__NR_socketcall,即表中的第102项。
系统调用表sys_call_table见下。
linux/arch/x86/kernel/syscall_table_32.S
/* System call table: one .long entry per system call, in syscall-number
   order (the numbers in the trailing comments are the indices).  The
   listing below is abridged. */
ENTRY(sys_call_table)
.long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */
.long sys_exit
.long sys_fork
.long sys_read
.long sys_write
.long sys_open /* 5 */
.....
.long sys_socketcall /* 102 */
也就是说,调用的是sys_socketcall()这个函数。
函数sys_socketcall()是内核提供给socket通信的总入口。
linux/net/socket.c中的sys_socketcall()
/*
 * sys_socketcall - common entry point that fans out to the socket calls.
 * @call: selector of the requested operation (SYS_SOCKET .. SYS_RECVMSG)
 * @args: user-space pointer to the argument block of that operation
 *
 * Copies the argument block for @call (its size comes from nargs[call])
 * into a local array, hands it to audit_socketcall(), and then forwards
 * the values to the matching sys_*() routine.
 *
 * Returns -EINVAL when @call is out of range, -EFAULT when the copy from
 * user space fails, the audit result when it is non-zero, and otherwise
 * whatever the selected routine returned.
 */
asmlinkage long sys_socketcall(int call, unsigned long __user *args)
{
	unsigned long argv[6];
	int ret;

	if (call < 1 || call > SYS_RECVMSG)
		return -EINVAL;

	/* copy_from_user should be SMP safe. */
	if (copy_from_user(argv, args, nargs[call]))
		return -EFAULT;

	ret = audit_socketcall(nargs[call] / sizeof(unsigned long), argv);
	if (ret)
		return ret;

	/* Forward the copied values, casting to user pointers where the
	 * target routine expects one. */
	switch (call) {
	case SYS_SOCKET:
		ret = sys_socket(argv[0], argv[1], argv[2]);
		break;
	case SYS_BIND:
		ret = sys_bind(argv[0], (struct sockaddr __user *)argv[1],
			       argv[2]);
		break;
	case SYS_CONNECT:
		ret = sys_connect(argv[0], (struct sockaddr __user *)argv[1],
				  argv[2]);
		break;
	case SYS_LISTEN:
		ret = sys_listen(argv[0], argv[1]);
		break;
	case SYS_ACCEPT:
		ret = sys_accept(argv[0], (struct sockaddr __user *)argv[1],
				 (int __user *)argv[2]);
		break;
	case SYS_GETSOCKNAME:
		ret = sys_getsockname(argv[0],
				      (struct sockaddr __user *)argv[1],
				      (int __user *)argv[2]);
		break;
	case SYS_GETPEERNAME:
		ret = sys_getpeername(argv[0],
				      (struct sockaddr __user *)argv[1],
				      (int __user *)argv[2]);
		break;
	case SYS_SOCKETPAIR:
		ret = sys_socketpair(argv[0], argv[1], argv[2],
				     (int __user *)argv[3]);
		break;
	case SYS_SEND:
		ret = sys_send(argv[0], (void __user *)argv[1], argv[2],
			       argv[3]);
		break;
	case SYS_SENDTO:
		ret = sys_sendto(argv[0], (void __user *)argv[1], argv[2],
				 argv[3], (struct sockaddr __user *)argv[4],
				 argv[5]);
		break;
	case SYS_RECV:
		ret = sys_recv(argv[0], (void __user *)argv[1], argv[2],
			       argv[3]);
		break;
	case SYS_RECVFROM:
		ret = sys_recvfrom(argv[0], (void __user *)argv[1], argv[2],
				   argv[3], (struct sockaddr __user *)argv[4],
				   (int __user *)argv[5]);
		break;
	case SYS_SHUTDOWN:
		ret = sys_shutdown(argv[0], argv[1]);
		break;
	case SYS_SETSOCKOPT:
		ret = sys_setsockopt(argv[0], argv[1], argv[2],
				     (char __user *)argv[3], argv[4]);
		break;
	case SYS_GETSOCKOPT:
		ret = sys_getsockopt(argv[0], argv[1], argv[2],
				     (char __user *)argv[3],
				     (int __user *)argv[4]);
		break;
	case SYS_SENDMSG:
		ret = sys_sendmsg(argv[0], (struct msghdr __user *)argv[1],
				  argv[2]);
		break;
	case SYS_RECVMSG:
		ret = sys_recvmsg(argv[0], (struct msghdr __user *)argv[1],
				  argv[2]);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
sys_socketcall()函数的参数都是由ebx和ecx来传递的。
参数call是具体的socket调用号(ebx,值为1)
参数args是指针,是ecx寄存器传递的参数数组。
由于服务器程序在用户空间,而系统调用函数在内核空间,需要将这些参数从服务器程序复制到内核中,即从用户空间复制到内核空间,复制函数为copy_from_user()
至此,流程由sys_socketcall()进入sys_socket()。
linux/net/socket.c中的sys_socket()
/*
 * sys_socket - create a socket and map it to a descriptor.
 * @family:   forwarded unchanged to sock_create()
 * @type:     forwarded unchanged to sock_create()
 * @protocol: forwarded unchanged to sock_create()
 *
 * Returns the negative value from sock_create() if creation fails.
 * Otherwise returns the result of sock_map_fd(); when that result is
 * negative the freshly created socket is released first.
 */
asmlinkage long sys_socket(int family, int type, int protocol)
{
	struct socket *sock;
	int err;
	int fd;

	err = sock_create(family, type, protocol, &sock);
	if (err < 0)
		return err;

	fd = sock_map_fd(sock);
	if (fd < 0)
		sock_release(sock);	/* mapping failed: drop the socket */

	return fd;
}
sock_create()的作用就是创建一个服务器的socket。
sock_map_fd()函数为新建的socket在网络文件系统中申请文件号和文件描述符结构。
网络文件系统
在linux/net/socket.c中定义了sock_fs_type这个file_system_type类型的对象。
/*
 * File-system type object for the socket file system: registered under
 * the name "sockfs", with sockfs_get_sb as its get_sb hook and
 * kill_anon_super as its kill_sb hook.
 */
static struct file_system_type sock_fs_type = {
.name = "sockfs",
.get_sb = sockfs_get_sb,
.kill_sb = kill_anon_super,
};
顾名思义,file_system_type结构体代表Linux内核中的各种文件系统,每一种文件系统都必须有自己的file_system_type结构。
由此,sock_fs_type结构定义代表sockfs的网络文件系统,但它并没有真实的物理介质,因此称为虚拟文件系统。