本文使用源码版本: linux 2.6.26、glibc 2.3.6。


通过 glibc 提供的 API 接口 socket() 可以创建一个 socket。在服务器端执行如下代码:

int fd = socket(AF_INET, SOCK_STREAM, 0);

通过这个 API 进入 glibc/sysdeps/unix/sysv/linux/i386/socket.S 中的汇编入口 __socket:

ENTRY (__socket)
#if defined NEED_CANCELLATION && defined CENABLE
	SINGLE_THREAD_P
	jne 1f
#endif

	/* Save registers.  */
	movl %ebx, %edx
	cfi_register (3, 2)

	movl $SYS_ify(socketcall), %eax	/* System call number in %eax.  */

	/* Use ## so `socket' is a separate token that might be #define'd.  */
	movl $P(SOCKOP_,socket), %ebx	/* Subcode is first arg to syscall.  */
	lea 4(%esp), %ecx		/* Address of args is 2nd arg.  */

        /* Do the system call trap.  */
	ENTER_KERNEL

在进入内核前的这段代码只能用汇编来实现,它严格按照 linux 系统调用的约定对相应寄存器赋值:将系统调用号 SYS_ify(socketcall) = 102 放进 eax;将子调用号 P(SOCKOP_,socket) = 1 放进 ebx(后面会用到);执行 lea 4(%esp), %ecx 跳过栈顶的返回地址,将指向 socket() 参数的指针放进 ecx,见下图(遵循 GCC 的调用规约,参数从右向左压栈):

+-------------+
|             |
|             |
+-------------+
|    0        |
+-------------+
| SOCK_STREAM |
+-------------+
| AF_INET     |
+-------------+ <--+ esp+4/ecx
| ret address |
+-------------+ <--+ esp
|             |
|             |
+-------------+

随后执行 ENTER_KERNEL(int 0x80) 宏进入内核。此处略过系统调用的通用步骤,例如用户栈/内核栈切换、寄存器压栈、copy_from_user 参数复制等等。最终到达 socket 通信总入口:

asmlinkage long 
sys_socketcall(int call, unsigned long __user *args)
{
    ...	
	switch(call) 
	{
		case SYS_SOCKET:
			err = sys_socket(a0,a1,a[2]);
			break;
        ...
		case SYS_ACCEPT:
			err = sys_accept(a0,(struct sockaddr __user *)a1, (int __user *)a[2]);
			break;
        ...
	}
	return err;
}

这里 call = 1,即前面 ebx 寄存器中的内容,case 选择到 sys_socket():

asmlinkage long 
sys_socket(int family, int type, int protocol)
{
	int retval;
	struct socket *sock;
	retval = sock_create(family, type, protocol, &sock);
    ...
	retval = sock_map_fd(sock);
    ...
	return retval;
}

从这里开始执行两个重量级的函数入口: sock_create() 和 sock_map_fd()。下面分别介绍。

sock_create()

int sock_create(int family, 
                int type, 
                int protocol, 
                struct socket **res)
{
	return __sock_create(current->nsproxy->net_ns, 
                         family, 
                         type, 
                         protocol, 
                         res, 
                         0);
}

这里只是一个简单的函数封装,接着执行 __sock_create():

static int __sock_create(struct net *net, 
                         int family, 
                         int type, 
                         int protocol,
			             struct socket **res, 
                         int kern)
{
	int err;
	struct socket *sock;
	const struct net_proto_family *pf;
    ...
	sock = sock_alloc();
    ...
	sock->type = type;
    ...
	pf = rcu_dereference(net_families[family]);
    ...
	err = pf->create(net, sock, protocol);
    ...
	*res = sock;
    ...
	return err;
    ...
}

这里省略了很多的语句,只留下主干来介绍。首先是 sock_alloc() 函数,这个函数产生一个特殊的调用链:

+-----------------------------+
| new_inode(sock_mnt->mnt_sb) |
+--------------+--------------+
               |
               v
      +--------+--------+
      | alloc_inode(sb) |
      +--------+--------+
               |
               v
 +-------------+-------------+
 | sb->s_op->alloc_inode(sb) |
 +---------------------------+

调用链最后一步在 alloc_inode() 中对应的代码如下:

if (sb->s_op->alloc_inode)
  inode=sb->s_op->alloc_inode(sb);
else
  inode=(struct inode *)kmem_cache_alloc(inode_cachep,GFP_KERNEL);

linux 在初始化时会安装一个虚拟的网络文件系统(sockfs)。这里采用了面向对象的思想:通过传入网络文件系统的 superblock(超级块),调用该超级块操作表中指定的 alloc_inode 函数。网络文件系统超级块的函数操作表定义如下:

static struct super_operations sockfs_ops = {
	.alloc_inode =	sock_alloc_inode,
	.destroy_inode =sock_destroy_inode,
	.statfs =	simple_statfs,
};

所以这里执行 sb->s_op->alloc_inode(sb) 语句会执行 sock_alloc_inode() 函数:

static struct inode *sock_alloc_inode(struct super_block *sb)
{
	struct socket_alloc *ei;
	ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
	if (!ei)
		return NULL;
	init_waitqueue_head(&ei->socket.wait);
	ei->socket.fasync_list = NULL;
	ei->socket.state = SS_UNCONNECTED;
	ei->socket.flags = 0;
	ei->socket.ops = NULL;
	ei->socket.sk = NULL;
	ei->socket.file = NULL;
	return &ei->vfs_inode;
}

这个函数执行 kmem_cache_alloc() 从专用 slab 高速缓存(slab是linux中内存管理的一种机制)分配一个 struct socket_alloc 结构体:

struct socket_alloc {
	struct socket socket;
	struct inode vfs_inode;
};

这个结构体中成员 socket 的地址就是 sock_alloc() 函数的返回值,而成员 vfs_inode 的地址则是上面 sock_alloc_inode() 返回的 inode。接着再回到 __sock_create():

static int __sock_create(struct net *net, 
                         int family, 
                         int type, 
                         int protocol,
			             struct socket **res, 
                         int kern)
{
	int err;
	struct socket *sock;
	const struct net_proto_family *pf;
    ...
	sock = sock_alloc();
    ...
	sock->type = type;
    ...
	pf = rcu_dereference(net_families[family]);
    ...
	err = pf->create(net, sock, protocol);
    ...
	*res = sock;
    ...
	return err;
    ...
}

将用户态传来的 type 赋给刚分配的 socket,表明这个 socket 是一个 SOCK_STREAM(流式套接字)。接着下面两条语句,又采用了面向对象的思想。net_families 是一个全局的数组,这个数组在网络系统初始化的时候就已经赋值了,这里我们只关心 AF_INET(IPv4) 协议族,下面是这个协议族的操作表:

static struct net_proto_family inet_family_ops = {
	.family = PF_INET,
	.create = inet_create,
	.owner	= THIS_MODULE,
};

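所以 pf->create(net, sock, protocol) 语句会执行 inet_create() 函数。在这之前,先介绍一个全局数组 inetsw[](在内核源码中,这张静态表实际名为 inetsw_array[];inet_init() 初始化时会把其中每个元素按 type 挂到链表头数组 inetsw[SOCK_MAX] 上,后面 inet_create() 遍历的就是 inetsw[sock->type] 这条链表):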

static struct inet_protosw inetsw[] =
{
	{
		.type =       SOCK_STREAM,
		.protocol =   IPPROTO_TCP,
		.prot =       &tcp_prot,
		.ops =        &inet_stream_ops,
		.capability = -1,
		.no_check =   0,
		.flags =      INET_PROTOSW_PERMANENT |
			      INET_PROTOSW_ICSK,
	},
	{
		.type =       SOCK_DGRAM,
		.protocol =   IPPROTO_UDP,
		.prot =       &udp_prot,
		.ops =        &inet_dgram_ops,
		.capability = -1,
		.no_check =   UDP_CSUM_DEFAULT,
		.flags =      INET_PROTOSW_PERMANENT,
    },
    {
	       .type =       SOCK_RAW,
	       .protocol =   IPPROTO_IP,	/* wild card */
	       .prot =       &raw_prot,
	       .ops =        &inet_sockraw_ops,
	       .capability = CAP_NET_RAW,
	       .no_check =   UDP_CSUM_DEFAULT,
	       .flags =      INET_PROTOSW_REUSE,
    }
};

这个数组保存了传输层各协议的一些信息,我们这里(type 为 SOCK_STREAM)用到的是数组中第一个元素,这在下面的函数中也能得到验证:

static int inet_create(struct net *net, 
                       struct socket *sock, 
                       int protocol)
{
	struct sock *sk;
	struct list_head *p;
	struct inet_protosw *answer;
	struct inet_sock *inet;
	struct proto *answer_prot;
	unsigned char answer_flags;
	char answer_no_check;
	int try_loading_module = 0;
	int err;

	if (sock->type != SOCK_RAW &&
	    sock->type != SOCK_DGRAM &&
	    !inet_ehash_secret)
		build_ehash_secret(); // 创建一个 32 位的全局随机数
                              // 赋值给 inet_ehash_secret
	sock->state = SS_UNCONNECTED; // socket 的状态为未连接
    ...
	list_for_each_rcu(p, &inetsw[sock->type]) {
        // 这里 answer 指向 inetsw 数组中第一个元素
		answer = list_entry(p, struct inet_protosw, list);
    ...
		if (IPPROTO_IP == protocol) {
            // 如果用户态传过来的 protocol 为 0
            // 将其设置为 IPPROTO_TCP
			protocol = answer->protocol;
			break;
		}
    ...
	}
    ...
	sock->ops = answer->ops; 
	answer_prot = answer->prot;
	answer_no_check = answer->no_check;
	answer_flags = answer->flags;
    ...
	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
    ...
	sk->sk_no_check = answer_no_check;
    ...
	inet = inet_sk(sk);
    ...
	sock_init_data(sock, sk);

	sk->sk_destruct	   = inet_sock_destruct;
	sk->sk_family	   = PF_INET;
	sk->sk_protocol	   = protocol;
	sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

	inet->uc_ttl	= -1;
	inet->mc_loop	= 1;
	inet->mc_ttl	= 1;
	inet->mc_index	= 0;
	inet->mc_list	= NULL;

	sk_refcnt_debug_inc(sk);

	if (inet->num) {
		/* It assumes that any protocol which allows
		 * the user to assign a number at socket
		 * creation time automatically
		 * shares.
		 */
		inet->sport = htons(inet->num);
		/* Add to protocol hash chains. */
		sk->sk_prot->hash(sk);
	}

	if (sk->sk_prot->init) {
		err = sk->sk_prot->init(sk);
		if (err)
			sk_common_release(sk);
	}
	return err;
    ...
}

这个函数里面多了两个新面孔,一个是 struct sock,另一个是 struct inet_sock。它们是通过 sk = sk_alloc(net,PF_INET,GFP_KERNEL,answer_prot); 这条语句来分配的。和前面一样,也是从 slab 中分配。为了后面的理解,这里跟踪了这个 slab 的创建:

static int __init inet_init(void)
{
    ...
	rc = proto_register(&tcp_prot, 1);
    ...
	rc = proto_register(&udp_prot, 1);
    ...
	rc = proto_register(&raw_prot, 1);
    ...
}

int proto_register(struct proto *prot, int alloc_slab)
{
    ...
	prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
				       SLAB_HWCACHE_ALIGN, NULL);
    ...
}

在 proto_register() 这个函数里主要是创建一些 slab 高速缓存。其中与我们这里相关的就是 tcp_prot.slab 这个 slab。这个 slab 里的对象大小为 tcp_prot.obj_size,下面贴上 tcp_prot 看看这个结构体里到底是什么内容:

struct proto tcp_prot = {
	.name			= "TCP",
    ...
	.obj_size		= sizeof(struct tcp_sock),
    ...
};

可以看到这个 slab 里是 struct tcp_sock 对象。那为什么在 inet_create() 中能用 struct sock * 类型的变量来接收 tcp_sock 对象呢?这是因为下面这些结构体层层嵌套,并且被嵌套的结构体都是第一个成员,所以 tcp_sock 对象的起始地址同时也是其内部 struct sock 的起始地址:

struct tcp_sock {
	struct inet_connection_sock	inet_conn;
    ...
}
struct inet_connection_sock {
	struct inet_sock	  icsk_inet;
    ...
}
struct inet_sock {
	struct sock		sk;
    ...
}

分配 tcp_sock 后,后面就剩下一些初始化的工作了。主要是对 sock 和 inet_sock 的初始化。在 inet_create() 里还调用了两个函数: sock_init_data() 和 sk->sk_prot->init()。

受篇幅限制,这里就不展开介绍了,可以自行对照内核源码和 struct sock、struct inet_sock 这两个结构体的成员,看看分别做了哪些初始化。

sock_map_fd()

这个函数里主要的工作就是在这个 socket 对应的文件索引节点上,挂载上与 socket 相关的操作。在 sock_map_fd() 里面调用下面这个函数:

static int sock_attach_fd(struct socket *sock, struct file *file)
{
	struct dentry *dentry;
	struct qstr name = { .name = "" };

	dentry = d_alloc(sock_mnt->mnt_sb->s_root, &name);
	if (unlikely(!dentry))
		return -ENOMEM;

	dentry->d_op = &sockfs_dentry_operations;
	dentry->d_flags &= ~DCACHE_UNHASHED;
	d_instantiate(dentry, SOCK_INODE(sock));
	sock->file = file;
	init_file(file, sock_mnt, dentry, FMODE_READ | FMODE_WRITE,
		  &socket_file_ops);
	SOCK_INODE(sock)->i_fop = &socket_file_ops;
	file->f_flags = O_RDWR;
	file->f_pos = 0;
	file->private_data = sock;
	return 0;
}

关键操作在 SOCK_INODE(sock)->i_fop = &socket_file_ops,socket_file_ops 结构体如下:

static const struct file_operations socket_file_ops = {
	.owner =	THIS_MODULE,
	.llseek =	no_llseek,
	.aio_read =	sock_aio_read,
	.aio_write =	sock_aio_write,
	.poll =		sock_poll,
	.unlocked_ioctl = sock_ioctl,
    ...
	.mmap =		sock_mmap,
	.open =		sock_no_open,
	.release =	sock_close,
	.fasync =	sock_fasync,
	.sendpage =	sock_sendpage,
	.splice_write = generic_splice_sendpage,
	.splice_read =	sock_splice_read,
};

下面摘录一句源码注释:

Socket files have a set of ‘special’ operations as well as the generic file ones. These don’t appear in the operation structures but are done directly via the socketcall() multiplexor.

通过这个挂载,我们就能使用 read()、write() 等系统调用操作 socket 文件了。

最后将通过 sock_alloc_fd() 分配的 fd 返回给用户空间(存放在 eax 中返回)。