socket的创建(一)

162 阅读2分钟

socket()函数概述

为了建立Socket,程序可以调用Socket函数,函数返回一个文件描述符,原型为: 

int socket(int domain, int type, int protocol)

domain:指定协议簇,常见的有AF_IENT, AF_INET6, AF_UNIX/AF_LOCAL, AF_NETLINK等。
type:指定类型,常见的有SOCK_STREAM、SOCK_DGRAM、SOCK_RAW等。
protocol:指定传输协议,如TCP/UDP等,不同协议簇都对应了默认的协议,可以填充 0 使用默认值。

glibc/sysdeps/generic/socket.c


#include <sysdep-cancel.h>
#include <socketcall.h>
#include <tls.h>

#define P(a, b) P2(a, b)
#define P2(a, b) a##b

	.text

#ifndef __socket
# ifndef NO_WEAK_ALIAS
#  define __socket P(__,socket)
# else
#  define __socket socket
# endif
#endif

.globl __socket
	cfi_startproc
ENTRY (__socket)
#if defined NEED_CANCELLATION && defined CENABLE
	SINGLE_THREAD_P
	jne 1f
#endif

	/* Save registers.  */
	movl %ebx, %edx
	cfi_register (3, 2)

	movl $SYS_ify(socketcall), %eax	/* System call number in %eax.  */

	/* Use ## so `socket' is a separate token that might be #define'd.  */
	movl $P(SOCKOP_,socket), %ebx	/* Subcode is first arg to syscall.  */
	lea 4(%esp), %ecx		/* Address of args is 2nd arg.  */

        /* Do the system call trap.  */
	ENTER_KERNEL

	/* Restore registers.  */
	movl %edx, %ebx
	cfi_restore (3)

	/* %eax is < 0 if there was an error.  */
	cmpl $-125, %eax
	jae SYSCALL_ERROR_LABEL

	/* Successful; return the syscall's value.  */
L(pseudo_end):
	ret


#if defined NEED_CANCELLATION && defined CENABLE
	/* We need one more register.  */
1:	pushl %esi
	cfi_adjust_cfa_offset(4)

	/* Enable asynchronous cancellation.  */
	CENABLE
	movl %eax, %esi
	cfi_offset(6, -8)		/* %esi */

	/* Save registers.  */
	movl %ebx, %edx
	cfi_register (3, 2)

	movl $SYS_ify(socketcall), %eax	/* System call number in %eax.  */

	/* Use ## so `socket' is a separate token that might be #define'd.  */
	movl $P(SOCKOP_,socket), %ebx	/* Subcode is first arg to syscall.  */
	lea 8(%esp), %ecx		/* Address of args is 2nd arg.  */

        /* Do the system call trap.  */
	ENTER_KERNEL

	/* Restore registers.  */
	movl %edx, %ebx
	cfi_restore (3)

	/* Restore the cancellation.  */
	xchgl %esi, %eax
	CDISABLE

	/* Restore registers.  */
	movl %esi, %eax
	popl %esi
	cfi_restore (6)
	cfi_adjust_cfa_offset(-4)

	/* %eax is < 0 if there was an error.  */
	cmpl $-125, %eax
	jae SYSCALL_ERROR_LABEL

	/* Successful; return the syscall's value.  */
	ret
#endif
	cfi_endproc
PSEUDO_END (__socket)

#ifndef NO_WEAK_ALIAS
weak_alias (__socket, socket)
#endif

对于i386系统内核来说,ENTER_KERNEL宏定义为

#define ENTER_KERNEL int $0x80
#define SYS_ify(syscall_name) __NR_##syscall_name
movl $SYS_ify(socketcall), %eax	/* System call number in %eax.  */
这一句会被替换为
movl $__NR_socketcall, %eax

综上所述,我们将系统调用号放入eax寄存器中,将SOCKOP_socket(socket的调用号)放入ebx寄存器中。代码lea 4(% esp), %ecx是将调用socket()时的参数地址保存在寄存器ecx中。

接下来由于int 0x80,调用系统调用~

Linux/arch/x86/kernel/entry_32.S

	# system call handler stub
ENTRY(system_call)
	RING0_INT_FRAME			# can't unwind into user space anyway
	pushl %eax			# save orig_eax
	CFI_ADJUST_CFA_OFFSET 4
	SAVE_ALL
	GET_THREAD_INFO(%ebp)
					# system call tracing in operation / emulation
	/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
	testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
	jnz syscall_trace_entry
	cmpl $(nr_syscalls), %eax
	jae syscall_badsys
syscall_call:
	call *sys_call_table(,%eax,4)
	movl %eax,PT_EAX(%esp)		# store the return value


system_call()最终使用汇编call指令执行sys_call_table系统调用表102(%eax)处的函数指针。

系统调用表sys_call_table见下。

linux/arch/x86/kernel/syscall_table_32.S

ENTRY(sys_call_table)
	.long sys_restart_syscall	/* 0 - old "setup()" system call, used for restarting */
	.long sys_exit
	.long sys_fork
	.long sys_read
	.long sys_write
	.long sys_open		/* 5 */
        .....
        .long sys_socketcall    /* 102 */

也就是说,调用的是sys_socketcall()这个函数。

函数sys_socketcall()是内核提供给socket通信的总入口。

linux/net/socket.c中的sys_socketcall()

asmlinkage long sys_socketcall(int call, unsigned long __user *args)
{
	unsigned long a[6];
	unsigned long a0, a1;
	int err;

	if (call < 1 || call > SYS_RECVMSG)
		return -EINVAL;

	/* copy_from_user should be SMP safe. */
	if (copy_from_user(a, args, nargs[call]))
		return -EFAULT;

	err = audit_socketcall(nargs[call] / sizeof(unsigned long), a);
	if (err)
		return err;

	a0 = a[0];
	a1 = a[1];

	switch (call) {
	case SYS_SOCKET:
		err = sys_socket(a0, a1, a[2]);
		break;
	case SYS_BIND:
		err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]);
		break;
	case SYS_CONNECT:
		err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]);
		break;
	case SYS_LISTEN:
		err = sys_listen(a0, a1);
		break;
	case SYS_ACCEPT:
		err =
		    sys_accept(a0, (struct sockaddr __user *)a1,
			       (int __user *)a[2]);
		break;
	case SYS_GETSOCKNAME:
		err =
		    sys_getsockname(a0, (struct sockaddr __user *)a1,
				    (int __user *)a[2]);
		break;
	case SYS_GETPEERNAME:
		err =
		    sys_getpeername(a0, (struct sockaddr __user *)a1,
				    (int __user *)a[2]);
		break;
	case SYS_SOCKETPAIR:
		err = sys_socketpair(a0, a1, a[2], (int __user *)a[3]);
		break;
	case SYS_SEND:
		err = sys_send(a0, (void __user *)a1, a[2], a[3]);
		break;
	case SYS_SENDTO:
		err = sys_sendto(a0, (void __user *)a1, a[2], a[3],
				 (struct sockaddr __user *)a[4], a[5]);
		break;
	case SYS_RECV:
		err = sys_recv(a0, (void __user *)a1, a[2], a[3]);
		break;
	case SYS_RECVFROM:
		err = sys_recvfrom(a0, (void __user *)a1, a[2], a[3],
				   (struct sockaddr __user *)a[4],
				   (int __user *)a[5]);
		break;
	case SYS_SHUTDOWN:
		err = sys_shutdown(a0, a1);
		break;
	case SYS_SETSOCKOPT:
		err = sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]);
		break;
	case SYS_GETSOCKOPT:
		err =
		    sys_getsockopt(a0, a1, a[2], (char __user *)a[3],
				   (int __user *)a[4]);
		break;
	case SYS_SENDMSG:
		err = sys_sendmsg(a0, (struct msghdr __user *)a1, a[2]);
		break;
	case SYS_RECVMSG:
		err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]);
		break;
	default:
		err = -EINVAL;
		break;
	}
	return err;
}


sys_socketcall()函数的参数都是由ebx和ecx来传递的。

参数call是具体的socket调用号(ebx,值为1)

参数args是指针,是ecx寄存器传递的参数数组。

由于服务器程序在用户空间,而系统调用函数在内核空间,需要将这些参数从服务器程序复制到内核中,即从用户空间复制到内核空间,复制函数为copy_from_user()

从此,流程由sys_socketcall()-->sys_socket()。

linux/net/socket.c中的sys_socket()


asmlinkage long sys_socket(int family, int type, int protocol)
{
	int retval;
	struct socket *sock;

	retval = sock_create(family, type, protocol, &sock);
	if (retval < 0)
		goto out;

	retval = sock_map_fd(sock);
	if (retval < 0)
		goto out_release;

out:
	/* It may be already another descriptor 8) Not kernel problem. */
	return retval;

out_release:
	sock_release(sock);
	return retval;
}

sock_create()的作用就是创建一个服务器的socket。

sock_map_fd()函数为新建的socket在网络文件系统中申请文件号和文件描述符结构。

网络文件系统

在linux/net/socket.c中定义了sock_fs_type这种fife_system_type类型的对象。

static struct file_system_type sock_fs_type = {
	.name =		"sockfs",
	.get_sb =	sockfs_get_sb,
	.kill_sb =	kill_anon_super,
};

见面知意,这个结构体file_system_type代表linux内核的各种文件系统,每一种文件系统必须要有自己的file_system_type机构。

由此,sock_fs_type结构定义代表sockfs的网络文件系统,但它并没有真实的物理介质,因此称为虚拟文件系统。