去年受@colyli指点,决定花些时间读一些linux kernel network部分的代码,准备把阅读代码的过程记录下来,也希望能有大牛前来指点,下面就先写一下创建socket对象的过程:
首先,创建socket需要执行socket系统调用:
int socket(int domain, int type, int protocol);该系统调用有3个参数,在内核中由SYSCALL_DEFINE3定义,具体代码如下:
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
int retval;
struct socket *sock;
int flags;
flags = type & ~SOCK_TYPE_MASK;
if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
return -EINVAL;
type &= SOCK_TYPE_MASK;
if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
retval = sock_create(family, type, protocol, &sock);
if (retval < 0)
goto out;
retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
if (retval < 0)
goto out_release;
out:
/* It may be already another descriptor 8) Not kernel problem. */
return retval;
out_release:
sock_release(sock);
return retval;
}上述代码中首先做了一个类型检查,type即我们所熟知的SOCK_STREAM,SOCK_DGRAM等sock_type枚举,其取值范围为1-10,均位于type字段的低八位,而SOCK_CLOEXEC 和SOCK_NONBLOCK为于高位,SOCK_TYPE_MASK宏的值为0xF,该检查的作用是检查除了基本的socket类型和SOCK_CLOEXEC和SOCK_NONBLOCK选项之外是否设置了其它选项,如果有,则返回INVLAID。
之后的检查我理解为是一个兼容性检查,如果设置了SOCK_NONBLOCK选项,则不管SOCK_NONBLOCK的值是否定义为与O_NONBLOCK相同,均将socket的O_NONBLOCK位置位,而将SOCK_NONBLOCK位复位。
随便调用sock_create创建一个新的socket对象,之后再调用sock_map_fd将该socket对象影射为文件描述符retval,随即将其返回,从而函数得到一个socket描述符。sock_create函数直接调用了__sock_create函数来创建socket对象,具体看__sock_create函数的实现:
int __sock_create(struct net *net, int family, int type, int protocol,
struct socket **res, int kern)
{
int err;
struct socket *sock;
const struct net_proto_family *pf;
if (family < 0 || family >= NPROTO)
return -EAFNOSUPPORT;
if (type < 0 || type >= SOCK_MAX)
return -EINVAL;
if (family == PF_INET && type == SOCK_PACKET) {
static int warned;
if (!warned) {
warned = 1;
printk(KERN_INFO \"%s uses obsolete (PF_INET,SOCK_PACKET)\\n\",
current->comm);
}
family = PF_PACKET;
}
err = security_socket_create(family, type, protocol, kern);
if (err)
return err;
sock = sock_alloc();
if (!sock) {
if (net_ratelimit())
printk(KERN_WARNING \"socket: no more sockets\\n\");
return -ENFILE; /* Not exactly a match, but its the
closest posix thing */
}
sock->type = type;
#ifdef CONFIG_MODULES
if (net_families[family] == NULL)
request_module(\"net-pf-%d\", family);
#endif
rcu_read_lock();
pf = rcu_dereference(net_families[family]);
err = -EAFNOSUPPORT;
if (!pf)
goto out_release;
/*
* We will call the ->create function, that possibly is in a loadable
* module, so we have to bump that loadable module refcnt first.
*/
if (!try_module_get(pf->owner))
goto out_release;
/* Now protected by module ref count */
rcu_read_unlock();
err = pf->create(net, sock, protocol, kern);
if (err < 0)
goto out_module_put;
/*
* Now to bump the refcnt of the [loadable] module that owns this
* socket at sock_release time we decrement its refcnt.
*/
if (!try_module_get(sock->ops->owner))
goto out_module_busy;
/*
* Now that we\'re done with the ->create function, the [loadable]
* module can have its refcnt decremented
*/
module_put(pf->owner);
err = security_socket_post_create(sock, family, type, protocol, kern);
if (err)
goto out_sock_release;
*res = sock;
return 0;
out_module_busy:
err = -EAFNOSUPPORT;
out_module_put:
sock->ops = NULL;
module_put(pf->owner);
out_sock_release:
sock_release(sock);
return err;
out_release:
rcu_read_unlock();
goto out_sock_release;
}开始先进行安全性检查和兼容性检查,security_socket_create()是个空函数,可以忽略。之后调用sock_alloc()函数在VFS上分配一个struct socket对象,所有的协议类型创建socket时创建的均为这个对象,可以理解为是所有网络层socket的模板或者说父类,上层协议栈在初始化socket时会根据这个已创建好的struct socket对象创建并初始化一个struct sock对象,这个对象包含更多上层协议栈的详细信息。
接下来的net_families数据是一个全局变量,在系统初始化时在inet_init()函数内进行初始化,其定义如下:
static const struct net_proto_family *net_families[NPROTO] __read_mostly;每个协议族都会在该数据中对应一个net_proto_family结构体,当然,未实现的协议族中对应位置的结构体指针为空,我们只关心最常用的协议族即AF_INET,其值为2,而NPROTO的值等于AF_MAX,在2.6.37内核中值为38。刚才提到该数组在inet_init()中被初始化,查看该函数相关代码可知它调用sock_register()函数注册INET协议族,代码如下:
(void)sock_register(&inet_family_ops);其中inet_family_ops是一个全局静态变量,其定义如下:
static const struct net_proto_family inet_family_ops = {
.family = PF_INET,
.create = inet_create, /* 该函数在我们创建socket的过程中起着很关键的作用 */
.owner = THIS_MODULE,
};再继续查看sock_register()函数,其代码如下:
int sock_register(const struct net_proto_family *ops)
{
int err;
if (ops->family >= NPROTO) {
printk(KERN_CRIT \"protocol %d >= NPROTO(%d)\\n\", ops->family,
NPROTO);
return -ENOBUFS;
}
spin_lock(&net_family_lock);
if (net_families[ops->family])
err = -EEXIST;
else {
net_families[ops->family] = ops;
err = 0;
}
spin_unlock(&net_family_lock);
printk(KERN_INFO \"NET: Registered protocol family %d\\n\", ops->family);
return err;
}代码很简单,即将net_proto_family对象插入到net_families数据对应的位置中,由此来完成对net_families的初始化。
OK,继续返回刚才的__sock_create()函数,看下面的代码:
#ifdef CONFIG_MODULES
if (net_families[family] == NULL)
request_module(\"net-pf-%d\", family);
#endif如果编译时支持可安装模块,则首先检测net_families数组中对应的family是否存在,假设此处为AF_INET,如果不存在,即在系统初始化时没有通过sock_register()函数对该元素进行注册,则调用request_module()动态地安装模块net-pf-2。
再往下通过RCU机制获取net_families中对应的net_proto_family对象,RCU机制主要用于网络层和VFS中,关于RCU机制的更多细节可以参考@gnawux师兄的译文:What is RCU, Fundamentally?
之后由注释可知增加该模块的引用计数,重要的函数为pf->create(),当协议族为AF_INET时,由以上可知,create函数为inet_create(),由这个函数进一步创建该socket,我们再看inet_create()函数的实现:
static int inet_create(struct net *net, struct socket *sock, int protocol,
int kern)
{
struct sock *sk;
struct inet_protosw *answer;
struct inet_sock *inet;
struct proto *answer_prot;
unsigned char answer_flags;
char answer_no_check;
int try_loading_module = 0;
int err;
/* 检查是否有加密字符串,如果没有则查检socket类型,
只有TCP需要加密字符串,如果协议类型不是SOCK_RAW和SOCK_DGRAM的话,
调用build_ehash_secret()函数创建一个加密字符串 */
if (unlikely(!inet_ehash_secret))
if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
build_ehash_secret();
/* 将socket状态设置为未连接 */
sock->state = SS_UNCONNECTED;
/* Look for the requested type/protocol pair. */
lookup_protocol:
err = -ESOCKTNOSUPPORT;
rcu_read_lock();
/* 遍历inetsw数组对应请求类型的链表元素,
如果protocol不是通配符类型,即IPPROTO_IP,
该宏的值为0,则在链表中搜索该协议类型,
如未找到则返回协议类型不支持,如果是通配符类型,
则选择一个适合的协议类型,至于inetsw数组,见下面分析 */
list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
err = 0;
/* Check the non-wild match. */
if (protocol == answer->protocol) {
if (protocol != IPPROTO_IP)
break;
} else {
/* Check for the two wild cases. */
if (IPPROTO_IP == protocol) {
protocol = answer->protocol;
break;
}
if (IPPROTO_IP == answer->protocol)
break;
}
err = -EPROTONOSUPPORT;
}
/* 动态加载相关协议模块 */
if (unlikely(err)) {
if (try_loading_module < 2) {
rcu_read_unlock();
/*
* Be more specific, e.g. net-pf-2-proto-132-type-1
* (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
*/
if (++try_loading_module == 1)
request_module(\"net-pf-%d-proto-%d-type-%d\",
PF_INET, protocol, sock->type);
/*
* Fall back to generic, e.g. net-pf-2-proto-132
* (net-pf-PF_INET-proto-IPPROTO_SCTP)
*/
else
request_module(\"net-pf-%d-proto-%d\",
PF_INET, protocol);
goto lookup_protocol;
} else
goto out_rcu_unlock;
}
err = -EPERM;
/* 众所周知,只有root权限用户才可以创始原始套接字,
此处检查是否有权限创建SOCK_RAW类型的套接字,
如果无权限,则返回-EPERM */
if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
goto out_rcu_unlock;
err = -EAFNOSUPPORT;
/* 检查协议类型是否支持 */
if (!inet_netns_ok(net, protocol))
goto out_rcu_unlock;
sock->ops = answer->ops;
answer_prot = answer->prot;
answer_no_check = answer->no_check;
answer_flags = answer->flags;
rcu_read_unlock();
WARN_ON(answer_prot->slab == NULL);
err = -ENOBUFS;
sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
if (sk == NULL)
goto out;
err = 0;
sk->sk_no_check = answer_no_check;
if (INET_PROTOSW_REUSE & answer_flags)
sk->sk_reuse = 1;
inet = inet_sk(sk);
/* 是否是基于连接的socket,目前只有SOCK_STREAM是基于连接的socket */
inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
inet->nodefrag = 0;
if (SOCK_RAW == sock->type) {
inet->inet_num = protocol;
/* 如果是原始套接字,则需要创建IP头部 */
if (IPPROTO_RAW == protocol)
inet->hdrincl = 1;
}
/* 是否启用路径MTU发现机制,可以在修改proc文件开启或关闭:
/proc/sys/net/ipv4/ip_no_pmtu_disc */
if (ipv4_config.no_pmtu_disc)
inet->pmtudisc = IP_PMTUDISC_DONT;
else
inet->pmtudisc = IP_PMTUDISC_WANT;
inet->inet_id = 0;
/* 该函数利用sock的内容进一步初始化sk */
sock_init_data(sock, sk);
sk->sk_destruct = inet_sock_destruct;
sk->sk_protocol = protocol;
sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
inet->uc_ttl = -1;
inet->mc_loop = 1;
inet->mc_ttl = 1;
inet->mc_all = 1;
inet->mc_index = 0;
inet->mc_list = NULL;
sk_refcnt_debug_inc(sk);
if (inet->inet_num) {
/* It assumes that any protocol which allows
* the user to assign a number at socket
* creation time automatically
* shares.
*/
inet->inet_sport = htons(inet->inet_num);
/* Add to protocol hash chains. */
sk->sk_prot->hash(sk);
}
if (sk->sk_prot->init) {
err = sk->sk_prot->init(sk);
if (err)
sk_common_release(sk);
}
out:
return err;
out_rcu_unlock:
rcu_read_unlock();
goto out;
}说一下inetsw这个数组,它也是针对于AF_INET协议族而言的,是一个全局静态变量,包含着创建一个新的socket所需要的所有信息,定义如下:
static struct list_head inetsw[SOCK_MAX];其中每个元素都是一个双向链表,该数据的定义也是在inet_init()函数中完成的,看其中相关代码:
for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
INIT_LIST_HEAD(r);
for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
inet_register_protosw(q);首先初始化每个链表元素的头,接下来将inetsw_array这个数组中的元素使用inet_register_protosw()函数注册到inetsw数组中,inetsw_array也是一个全局的静态变量,其定义如下:
static struct inet_protosw inetsw_array[] =
{
{
.type = SOCK_STREAM,
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
.no_check = 0,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
},
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_PERMANENT,
},
{
.type = SOCK_RAW,
.protocol = IPPROTO_IP, /* wild card */
.prot = &raw_prot,
.ops = &inet_sockraw_ops,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
}
};它包含了各种协议类型所需要的基本信息。
接下来很重要的一步,就是调用sk_alloc()函数创建一个新的struct sock对象,struct socket和struct sock的区别在于,struct socket是创建每个BSD所必须的,它描述BSD socket的一些基本信息,而struct sock则描述网络层的相关信息,它在struct socket的基础上构建。看一下sk_sock()的定义:
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
struct proto *prot)
{
struct sock *sk;
sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
if (sk) {
sk->sk_family = family;
/*
* See comment in struct sock definition to understand
* why we need sk_prot_creator -acme
*/
sk->sk_prot = sk->sk_prot_creator = prot;
sock_lock_init(sk);
sock_net_set(sk, get_net(net));
atomic_set(&sk->sk_wmem_alloc, 1);
sock_update_classid(sk);
}
return sk;
}最后一个参数为当前协议类型对应的struct proto对象,将刚创建的struct sock对象的sk_prot和sk_prot_creator成员初始化为prot。sk_prot_alloc()函数用于创建sock对象,可以在slab分配器上创建,也可以在普通缓存中创建。
接下来贝sock_init_data()函数进一步初始化sk:
void sock_init_data(struct socket *sock, struct sock *sk)
{
/* 初始化接收/发送/异常缓冲队列,这些队列均为双向链表,
节点数据内容为struct sk_buff对象, 各种数据包的信息都
存放在该结构体中。队列sk_error_queue很少使用 */
skb_queue_head_init(&sk->sk_receive_queue);
skb_queue_head_init(&sk->sk_write_queue);
skb_queue_head_init(&sk->sk_error_queue);
#ifdef CONFIG_NET_DMA
skb_queue_head_init(&sk->sk_async_wait_queue);
#endif
sk->sk_send_head = NULL;
init_timer(&sk->sk_timer);
sk->sk_allocation = GFP_KERNEL;
/* 接收缓冲区最大字节数 */
sk->sk_rcvbuf = sysctl_rmem_default;
/* 发送缓冲区最大字节数 */
sk->sk_sndbuf = sysctl_wmem_default;
/* 连接状态,SOCK_DGRAM和SOCK_RAW也会共用TCP的一些状态,
连接刚建立的时候都会使用TCP_CLOSE状态 */
sk->sk_state = TCP_CLOSE;
sk_set_socket(sk, sock);
sock_set_flag(sk, SOCK_ZAPPED);
if (sock) {
sk->sk_type = sock->type;
sk->sk_wq = sock->wq;
sock->sk = sk;
} else
sk->sk_wq = NULL;
spin_lock_init(&sk->sk_dst_lock);
rwlock_init(&sk->sk_callback_lock);
lockdep_set_class_and_name(&sk->sk_callback_lock,
af_callback_keys + sk->sk_family,
af_family_clock_key_strings[sk->sk_family]);
sk->sk_state_change = sock_def_wakeup;
sk->sk_data_ready = sock_def_readable;
sk->sk_write_space = sock_def_write_space;
sk->sk_error_report = sock_def_error_report;
sk->sk_destruct = sock_def_destruct;
sk->sk_sndmsg_page = NULL;
sk->sk_sndmsg_off = 0;
sk->sk_peer_pid = NULL;
sk->sk_peer_cred = NULL;
sk->sk_write_pending = 0;
sk->sk_rcvlowat = 1;
sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
sk->sk_stamp = ktime_set(-1L, 0);
/*
* Before updating sk_refcnt, we must commit prior changes to memory
* (Documentation/RCU/rculist_nulls.txt for details)
*/
smp_wmb();
atomic_set(&sk->sk_refcnt, 1);
atomic_set(&sk->sk_drops, 0);
}在inet_create()函数的最后还有一个很重要的初始化,下面这段代码:
if (sk->sk_prot->init) {
err = sk->sk_prot->init(sk);
if (err)
sk_common_release(sk);
}前面在sk_alloc()函数中为sk初始化时将协议类型对应的proto对象指针赋给了sk->sk_prot,这里的init()函数即对应的proto中的init函数,当协议类型为SOCK_STREAM时,prot即为tcp_proto(见inetsw_array), 对应的init()函数即为:
static int tcp_v4_init_sock(struct sock *sk)