Linux内核--网络栈实现分析（十）--网络层之IP协议（下）

thecloud

浏览: 880561 次

最近访客更多访客>>

Ani521smile

song0394

空空儿

aaron198

博主相关

博客

微博

相册

留言

关于我

文章分类

全部博客 (1953)

社区版块

存档分类

2013-10 ( 13)
2013-09 ( 38)
2013-08 ( 75)
更多存档...

本文分析基于Linux Kernel 1.2.13

原创作品，转载请标明http://blog.csdn.net/yming0221/article/details/7552455

更多请查看专栏，地址http://blog.csdn.net/column/details/linux-kernel-net.html

作者：闫明

注：标题中的“（上）”、“（下）”表示分析过程所依据的数据包传递方向：“（上）”表示从底层向上分析，“（下）”表示从上层向下分析。

上篇博文分析到，传输层最终会调用ip_queue_xmit()函数，将发送数据的任务交给网络层。下面就分析一下该函数：

该函数的主要函数调用关系图如下：

/*
 * Queues a packet to be sent, and starts the transmitter
 * if necessary.  if free = 1 then we free the block after
 * transmit, otherwise we don't. If free==2 we not only
 * free the block but also don't assign a new ip seq number.
 * This routine also needs to put in the total length,
 * and compute the checksum
 */

void ip_queue_xmit(struct sock *sk, //发送数据的队列所对应的sock结构
					  struct device *dev,//发送该数据包的网卡设备
	      			  struct sk_buff *skb,//封装好的sk_buff结构，要发送的数据在该结构中
	      			  int free)//主要配合TCP协议使用，用于数据包的重发，UDP等协议调用是free=1
{
	struct iphdr *iph;//IP数据报首部指针
	unsigned char *ptr;

	/* Sanity check */
	if (dev == NULL)
	{
		printk("IP: ip_queue_xmit dev = NULL\n");
		return;
	}

	IS_SKB(skb);

	/*
	 *	Do some book-keeping in the packet for later
	 */


	skb->dev = dev;//进一步完整sk_buff的相应字段
	skb->when = jiffies;//用于TCP协议的超时重传

	/*
	 *	Find the IP header and set the length. This is bad
	 *	but once we get the skb data handling code in the
	 *	hardware will push its header sensibly and we will
	 *	set skb->ip_hdr to avoid this mess and the fixed
	 *	header length problem
	 */

	ptr = skb->data;//指针指向sk_buff中的数据部分
	ptr += dev->hard_header_len;//hard_header_len为硬件首部长度，在net_init.c的函数eth_setup()函数中设置的，dev->hard_header_len = ETH_HLEN; 以太网首部长度为14
	iph = (struct iphdr *)ptr;//prt已经指向IP数据包的首部
	skb->ip_hdr = iph;
	iph->tot_len = ntohs(skb->len-dev->hard_header_len);//计算IP数据报的总长度

#ifdef CONFIG_IP_FIREWALL
	if(ip_fw_chk(iph, dev, ip_fw_blk_chain, ip_fw_blk_policy, 0) != 1)
		/* just don't send this packet */
		return;
#endif	

	/*
	 *	No reassigning numbers to fragments...
	 */

	if(free!=2)
		iph->id      = htons(ip_id_count++);
	else
		free=1;

	/* All buffers without an owner socket get freed */
	if (sk == NULL)
		free = 1;

	skb->free = free;//设置skb的free值，free=1，发送后立即释放；free=2，不但释放缓存，而且不分配新的序列号

	/*
	 *	Do we need to fragment. Again this is inefficient.
	 *	We need to somehow lock the original buffer and use
	 *	bits of it.
	 */
	//数据帧中的数据部分必须小于等于MTU
	if(skb->len > dev->mtu + dev->hard_header_len)//发送的数据长度大于数据帧的数据部分和帧首部之和，则需要分片
	{
		ip_fragment(sk,skb,dev,0);//对数据报分片后继续调用ip _queue_xmit()函数发送数据
		IS_SKB(skb);
		kfree_skb(skb,FREE_WRITE);
		return;
	}

	/*
	 *	Add an IP checksum
	 */

	ip_send_check(iph);//IP数据报首部检查

	/*
	 *	Print the frame when debugging
	 */

	/*
	 *	More debugging. You cannot queue a packet already on a list
	 *	Spot this and moan loudly.
	 */
	if (skb->next != NULL)//说明该数据包仍然存在于某个缓存队列
	{
		printk("ip_queue_xmit: next != NULL\n");
		skb_unlink(skb);//将其从缓存链表中删除，否则可能导致内核错误
	}

	/*
	 *	If a sender wishes the packet to remain unfreed
	 *	we add it to his send queue. This arguably belongs
	 *	in the TCP level since nobody else uses it. BUT
	 *	remember IPng might change all the rules.
	 */

	if (!free)//free=0
	{
		unsigned long flags;
		/* The socket now has more outstanding blocks */

		sk->packets_out++;

		/* Protect the list for a moment */
		save_flags(flags);
		cli();

		if (skb->link3 != NULL)//link3指向数据报道呃重发队列
		{
			printk("ip.c: link3 != NULL\n");
			skb->link3 = NULL;
		}
		//sk中send_tail和send_head是用户缓存的单向链表表尾和表头
		if (sk->send_head == NULL)
		{
			sk->send_tail = skb;
			sk->send_head = skb;
		}
		else
		{
			sk->send_tail->link3 = skb;//link3指针用于数据包的连接
			sk->send_tail = skb;
		}
		/* skb->link3 is NULL */

		/* Interrupt restore */
		restore_flags(flags);
	}
	else
		/* Remember who owns the buffer */
		skb->sk = sk;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	 
	ip_statistics.IpOutRequests++;
#ifdef CONFIG_IP_ACCT
	ip_acct_cnt(iph,dev, ip_acct_chain);
#endif	
	
#ifdef CONFIG_IP_MULTICAST	//这部分是IP数据报的多播处理

	/*
	 *	Multicasts are looped back for other local users
	 */
	 
	.......................................
#endif
	if((dev->flags&IFF_BROADCAST) && iph->daddr==dev->pa_brdaddr && !(dev->flags&IFF_LOOPBACK))//广播数据包的处理
		ip_loopback(dev,skb);
		
	if (dev->flags & IFF_UP)//设备状态正常
	{
		/*
		 *	If we have an owner use its priority setting,
		 *	otherwise use NORMAL
		 */
		//调用设备接口层函数发送数据: dev_queue_xmit()函数
		if (sk != NULL)
		{
			dev_queue_xmit(skb, dev, sk->priority);
		}
		else
		{
			dev_queue_xmit(skb, dev, SOPRI_NORMAL);
		}
	}
	else//设备状态不正常
	{
		ip_statistics.IpOutDiscards++;
		if (free)
			kfree_skb(skb, FREE_WRITE);
	}
}

这个函数中调用ip_fragment()函数对长度过长的数据包进行了分片，这里不对该函数做详细分析，其源码如下：

/*
 * Split an IP datagram that is too large for dev into fragments and
 * hand each fragment to ip_queue_xmit().
 *
 *	sk	owning socket, or NULL. When set, each fragment's memory
 *		is charged to sk->wmem_alloc.
 *	skb	buffer holding the original frame: link-level header
 *		followed by the IP datagram. Only skb->ip_hdr is updated;
 *		the buffer is not freed on any path in this routine.
 *	dev	output device; supplies hard_header_len and mtu.
 *	is_frag	bit 0 (value 1): set MF on every fragment, including the
 *		last one; bit 1 (value 2): start at the fragment offset
 *		already in the header instead of 0.
 *
 * If a fragment buffer cannot be allocated the routine returns at once;
 * fragments built before that point have already been passed to
 * ip_queue_xmit().
 */
void ip_fragment(struct sock *sk, struct sk_buff *skb, struct device *dev, int is_frag)
{
	struct iphdr *iph;
	unsigned char *raw;
	unsigned char *ptr;
	struct sk_buff *skb2;
	int left, mtu, hlen, len;
	int offset;
	unsigned long flags;

	/*
	 *	Point into the IP datagram header.
	 */

	raw = skb->data;
	iph = (struct iphdr *) (raw + dev->hard_header_len);

	skb->ip_hdr = iph;

	/*
	 *	Setup starting values.
	 */

	/*
	 *	ihl counts 32-bit words, so scale by 4. The previous
	 *	sizeof(unsigned long) yields 8 where long is 64 bits.
	 */
	hlen = iph->ihl * 4;
	left = ntohs(iph->tot_len) - hlen;	/* Payload bytes still to send */
	hlen += dev->hard_header_len;		/* Total header size */
	mtu = (dev->mtu - hlen);		/* Size of data space */
	ptr = (raw + hlen);			/* Where to start from */

	/*
	 *	Check for any "DF" flag. [DF means do not fragment]
	 */

	if (ntohs(iph->frag_off) & IP_DF)
	{
		/*
		 *	Reply giving the MTU of the failed hop.
		 */
		ip_statistics.IpFragFails++;
		icmp_send(skb,ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, dev->mtu, dev);
		return;
	}

	/*
	 *	The protocol doesn't seem to say what to do in the case that the
	 *	frame + options doesn't fit the mtu. As it used to fall down dead
	 *	in this case we were fortunate it didn't happen
	 */

	if(mtu<8)
	{
		/* It's wrong but it's better than nothing */
		icmp_send(skb,ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED,dev->mtu, dev);
		ip_statistics.IpFragFails++;
		return;
	}

	/*
	 *	Fragment the datagram.
	 */

	/*
	 *	The initial offset is 0 for a complete frame. When
	 *	fragmenting fragments it's wherever this one starts.
	 */

	if (is_frag & 2)
		offset = (ntohs(iph->frag_off) & 0x1fff) << 3;
	else
		offset = 0;


	/*
	 *	Keep copying data until we run out.
	 */

	while(left > 0)
	{
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending upto and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)
		{
			len/=8;
			len*=8;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len + hlen,GFP_ATOMIC)) == NULL)
		{
			printk("IP: frag: no memory for new fragment!\n");
			ip_statistics.IpFragFails++;
			return;
		}

		/*
		 *	Set up data on packet
		 */

		skb2->arp = skb->arp;
		if(skb->free==0)
			printk("IP fragmenter: BUG free!=1 in fragmenter\n");
		skb2->free = 1;
		skb2->len = len + hlen;
		skb2->h.raw=(char *) skb2->data;
		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		save_flags(flags);
		if (sk)
		{
			cli();
			sk->wmem_alloc += skb2->mem_len;
			skb2->sk=sk;
		}
		restore_flags(flags);
		skb2->raddr = skb->raddr;	/* For rebuild_header - must be here */

		/*
		 *	Copy the packet header into the new buffer.
		 */

		memcpy(skb2->h.raw, raw, hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		memcpy(skb2->h.raw + hlen, ptr, len);
		left -= len;

		/* Step over the link-level header so h.raw is the IP header */
		skb2->h.raw+=dev->hard_header_len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = (struct iphdr *)(skb2->h.raw);
		iph->frag_off = htons((offset >> 3));
		/*
		 *	Added AC : If we are fragmenting a fragment thats not the
		 *		   last fragment then keep MF on each bit
		 */
		if (left > 0 || (is_frag & 1))
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */

		ip_statistics.IpFragCreates++;

		/* free == 2: free after transmit and keep the IP id copied above */
		ip_queue_xmit(sk, dev, skb2, 2);
	}
	ip_statistics.IpFragOKs++;
}

网络层的发送函数ip_queue_xmit()最终调用了设备接口层（相当于网络模型中的链路层）的发送函数dev_queue_xmit()。

该函数的调用关系如下：

/*
 *	Send (or queue for sending) a packet.
 *
 *	skb	buffer to transmit.
 *	dev	device to transmit on; must not be NULL.
 *	pri	queue index into dev->buffs when >= 0. A negative value
 *		marks a retransmit attempt pulled from the queue front;
 *		the queue index is then -pri-1.
 *
 *	IMPORTANT: When this is called to resend frames. The caller MUST
 *	already have locked the sk_buff. Apart from that we do the
 *	rest of the magic.
 */

void dev_queue_xmit(struct sk_buff *skb, struct device *dev, int pri)
{
	unsigned long flags;
	int nitcount;
	struct packet_type *ptype;
	int where = 0;		/* used to say if the packet should go	*/
				/* at the front or the back of the	*/
				/* queue - front is a retransmit try	*/
				/* where=0: new packet handed down from the layer above; where=1: packet taken back off the device queue */

	if (dev == NULL) 
	{
		printk("dev.c: dev_queue_xmit: dev = NULL\n");
		return;
	}
	
	if(pri>=0 && !skb_device_locked(skb))/* new packet that is not yet locked: lock it before working on it */
		skb_device_lock(skb);	/* Shove a lock on the frame */
#ifdef CONFIG_SLAVE_BALANCING
	/* Switch to the slave device when it is up and has the shorter packet queue */
	save_flags(flags);
	cli();
	if(dev->slave!=NULL && dev->slave->pkt_queue < dev->pkt_queue &&
				(dev->slave->flags & IFF_UP))
		dev=dev->slave;
	restore_flags(flags);
#endif		
#ifdef CONFIG_SKB_CHECK 
	IS_SKB(skb);
#endif    
	skb->dev = dev;

	/*
	 *	This just eliminates some race conditions, but not all... 
	 */

	if (skb->next != NULL) /* not expected: the packet should already be off every queue before it is sent */
	{/* defensive check against a bug elsewhere in the kernel */
		/*
		 *	Make sure we haven't missed an interrupt. 
		 */
		printk("dev_queue_xmit: worked around a missed interrupt\n");
		start_bh_atomic();
		dev->hard_start_xmit(NULL, dev);
		end_bh_atomic();
		return;
  	}

	/*
	 *	Negative priority is used to flag a frame that is being pulled from the
	 *	queue front as a retransmit attempt. It therefore goes back on the queue
	 *	start on a failure.
	 */
	 
  	if (pri < 0) /* negative priority: packet was taken from the device queue */
  	{
		pri = -pri-1;
		where = 1;
  	}

	if (pri >= DEV_NUMBUFFS) 
	{
		printk("bad priority in dev_queue_xmit.\n");
		pri = 1;
	}

	/*
	 *	If the address has not been resolved. Call the device header rebuilder.
	 *	This can cover all protocols and technically not just ARP either.
	 */
	 
	if (!skb->arp && dev->rebuild_header(skb->data, dev, skb->raddr, skb)) {/* non-zero from rebuild_header: give up for now (presumably address resolution is still pending) */
		return;
	}

	save_flags(flags);
	cli();	
	if (!where) {/* new packet: it goes through the device queue first */
#ifdef CONFIG_SLAVE_BALANCING	
		skb->in_dev_queue=1;/* packet is on the device queue */
#endif		
		skb_queue_tail(dev->buffs + pri,skb);/* append to the device queue for this priority */
		skb_device_unlock(skb);		/* Buffer is on the device queue and can be freed safely */
		skb = skb_dequeue(dev->buffs + pri);/* take a packet off the same queue; from here on skb is that packet */
		skb_device_lock(skb);		/* New buffer needs locking down */
#ifdef CONFIG_SLAVE_BALANCING		
		skb->in_dev_queue=0;
#endif		
	}
	restore_flags(flags);

	/* copy outgoing packets to any sniffer packet handlers */
	if(!where)/* only for new packets: walk the registered packet types */
	{
		for (nitcount= dev_nit, ptype = ptype_base; nitcount > 0 && ptype != NULL; ptype = ptype->next) 
		{
			/* Never send packets back to the socket
			 * they originated from - MvS (miquels@drinkel.ow.org)
			 */
			if (ptype->type == htons(ETH_P_ALL) &&
			   (ptype->dev == dev || !ptype->dev) &&
			   ((struct sock *)ptype->data != skb->sk))
			{
				struct sk_buff *skb2;
				if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL)
					break;
				/*
				 *	The protocol knows this has (for other paths) been taken off
				 *	and adds it back.
				 */
				skb2->len-=skb->dev->hard_header_len;
				ptype->func(skb2, skb->dev, ptype);/* give the clone to this ETH_P_ALL handler's receive function */
				nitcount--;/* lets the loop stop once dev_nit handlers have been served */
			}
		}
	}
	start_bh_atomic();/* enter the bottom-half atomic section */
	if (dev->hard_start_xmit(skb, dev) == 0) {/* call the driver's transmit routine */
		end_bh_atomic();/* leave the bottom-half atomic section */
		/*
		 *	Packet is now solely the responsibility of the driver
		 */
		return;/* the driver accepted the packet */
	}
	/* the driver did not take the packet: put it back on the device queue */
	end_bh_atomic();

	/*
	 *	Transmission failed, put skb back into a list. Once on the list it's safe and
	 *	no longer device locked (it can be freed safely from the device queue)
	 */
	cli();
#ifdef CONFIG_SLAVE_BALANCING
	skb->in_dev_queue=1;
	dev->pkt_queue++;
#endif		
	skb_device_unlock(skb);/* unlock the buffer */
	skb_queue_head(dev->buffs + pri,skb);/* this time insert at the head of the device queue */
	restore_flags(flags);
}

具体的硬件发送函数dev->hard_start_xmit的实现将在下篇博文中分析。

分享到：