tcp: new list for sent but unacked skbs for RACK recovery

author Eric Dumazet <[email protected]>

Wed, 4 Oct 2017 19:59:58 +0000 (12:59 -0700)

committer David S. Miller <[email protected]>

Fri, 6 Oct 2017 04:24:47 +0000 (21:24 -0700)
author Eric Dumazet <[email protected]>
Wed, 4 Oct 2017 19:59:58 +0000 (12:59 -0700)
committer David S. Miller <[email protected]>
Fri, 6 Oct 2017 04:24:47 +0000 (21:24 -0700)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h

index ada821466e883b443c7ae7c0c698e1a29869d170..01a985937867935dd615d355f4a662a9f8674b83 100644 (file)
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -617,6 +617,7 @@ typedef unsigned char *sk_buff_data_t;
   *     @nf_trace: netfilter packet trace flag
   *     @protocol: Packet protocol from driver
   *     @destructor: Destruct function
+ *     @tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue)
   *     @_nfct: Associated connection, if any (with nfctinfo bits)
   *     @nf_bridge: Saved data about a bridged frame - see br_netfilter.c
   *     @skb_iif: ifindex of device we arrived on
@@ -686,8 +687,14 @@ struct sk_buff {
          */
         char                    cb[48] __aligned(8);
  
-       unsigned long           _skb_refdst;
-       void                    (*destructor)(struct sk_buff *skb);
+       union {
+               struct {
+                       unsigned long   _skb_refdst;
+                       void            (*destructor)(struct sk_buff *skb);
+               };
+               struct list_head        tcp_tsorted_anchor;
+       };
+
  #ifdef CONFIG_XFRM
         struct  sec_path        *sp;
  #endif
diff --git a/include/linux/tcp.h b/include/linux/tcp.h

index 4aa40ef02d32cf2719513ca77c38f45a5e646b82..1d2c44e09e31e5a951a2ef230f4149a959e8a2e9 100644 (file)
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -191,6 +191,7 @@ struct tcp_sock {
         u32     tsoffset;       /* timestamp offset */
  
         struct list_head tsq_node; /* anchor in tsq_tasklet.head list */
+       struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */
  
         u32     snd_wl1;        /* Sequence for window update           */
         u32     snd_wnd;        /* The window we expect to receive      */
diff --git a/include/net/tcp.h b/include/net/tcp.h

index 426c2e986016abe81563e855579c301a77315741..3b16f353b539a563dae0b37328a52d67e6476f31 100644 (file)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1589,14 +1589,34 @@ enum tcp_chrono {
  void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type);
  void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type);
  
+/* This helper resets skb->destructor/_skb_refdst, which share their
+ * storage (a union in struct sk_buff) with skb->tcp_tsorted_anchor.
+ */
+static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb)
+{
+       skb->destructor = NULL;
+       skb->_skb_refdst = 0UL;
+}
+
+#define tcp_skb_tsorted_save(skb) {            \
+       unsigned long _save = (skb)->_skb_refdst; \
+       (skb)->_skb_refdst = 0UL; /* scope left open: pair with tcp_skb_tsorted_restore() */
+/* Writes back the value stashed by tcp_skb_tsorted_save() and closes its scope. */
+#define tcp_skb_tsorted_restore(skb)           \
+       (skb)->_skb_refdst = _save;             \
+}
+
  /* write queue abstraction */
  static inline void tcp_write_queue_purge(struct sock *sk)
  {
         struct sk_buff *skb;
  
         tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
-       while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL)
+       while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
+               tcp_skb_tsorted_anchor_cleanup(skb);
                 sk_wmem_free_skb(sk, skb);
+       }
+       INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
         sk_mem_reclaim(sk);
         tcp_clear_all_retrans_hints(tcp_sk(sk));
  }
@@ -1711,6 +1731,8 @@ static inline void tcp_insert_write_queue_before(struct sk_buff *new,
  
  static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk)
  {
+       list_del(&skb->tcp_tsorted_anchor);
+       tcp_skb_tsorted_anchor_cleanup(skb);
         __skb_unlink(skb, &sk->sk_write_queue);
  }
  
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c

index c115e37ca6083db3a4db16e77d1059222dff3672..8cf742fd4f99d7eb1bd9632afcfb09c36ba1130e 100644 (file)
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -415,6 +415,7 @@ void tcp_init_sock(struct sock *sk)
         tp->out_of_order_queue = RB_ROOT;
         tcp_init_xmit_timers(sk);
         INIT_LIST_HEAD(&tp->tsq_node);
+       INIT_LIST_HEAD(&tp->tsorted_sent_queue);
  
         icsk->icsk_rto = TCP_TIMEOUT_INIT;
         tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
@@ -881,6 +882,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
                          * available to the caller, no more, no less.
                          */
                         skb->reserved_tailroom = skb->end - skb->tail - size;
+                       INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
                         return skb;
                 }
                 __kfree_skb(skb);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c

index c5b8d61846c2fe762039674d2dc9a9a5a25ebd7a..fb0d7ed84b94110ee95b66befcad143505665ed5 100644 (file)
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1593,6 +1593,8 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
                                                 tcp_skb_pcount(skb),
                                                 skb->skb_mstamp);
                         tcp_rate_skb_delivered(sk, skb, state->rate);
+                       if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+                               list_del_init(&skb->tcp_tsorted_anchor);
  
                         if (!before(TCP_SKB_CB(skb)->seq,
                                     tcp_highest_sack_seq(tp)))
@@ -3054,8 +3056,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
  
         shinfo = skb_shinfo(skb);
         if (!before(shinfo->tskey, prior_snd_una) &&
-           before(shinfo->tskey, tcp_sk(sk)->snd_una))
-               __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+           before(shinfo->tskey, tcp_sk(sk)->snd_una)) {
+               tcp_skb_tsorted_save(skb) {
+                       __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+               } tcp_skb_tsorted_restore(skb);
+       }
  }
  
  /* Remove acknowledged frames from the retransmission queue. If our packet
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c

index 188a6f31356db0d0b7825aa476888467f7c0dfd8..2341b9f857b60bf2dd7d107afcc61a68c79800c0 100644 (file)
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -446,6 +446,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
                 newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
  
                 INIT_LIST_HEAD(&newtp->tsq_node);
+               INIT_LIST_HEAD(&newtp->tsorted_sent_queue);
  
                 tcp_init_wl(newtp, treq->rcv_isn);
  
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c

index 0bc9e46a53696578eb6e911f2f75e6b34c80894f..8162e288017843fca2694c4e22b2e8981572256b 100644 (file)
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -971,6 +971,12 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
                       HRTIMER_MODE_ABS_PINNED);
  }
  
+static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
+{
+       skb->skb_mstamp = tp->tcp_mstamp; /* copy tp's current tcp_mstamp into the skb */
+       list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); /* relink at the list tail */
+}
+
  /* This routine actually transmits TCP packets queued in by
   * tcp_do_sendmsg().  This is used by both the initial
   * transmission and possible later retransmissions.
@@ -1003,10 +1009,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
                 TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
                         - tp->snd_una;
                 oskb = skb;
-               if (unlikely(skb_cloned(skb)))
-                       skb = pskb_copy(skb, gfp_mask);
-               else
-                       skb = skb_clone(skb, gfp_mask);
+
+               tcp_skb_tsorted_save(oskb) {
+                       if (unlikely(skb_cloned(oskb)))
+                               skb = pskb_copy(oskb, gfp_mask);
+                       else
+                               skb = skb_clone(oskb, gfp_mask);
+               } tcp_skb_tsorted_restore(oskb);
+
                 if (unlikely(!skb))
                         return -ENOBUFS;
         }
@@ -1127,7 +1137,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
                 err = net_xmit_eval(err);
         }
         if (!err && oskb) {
-               oskb->skb_mstamp = tp->tcp_mstamp;
+               tcp_update_skb_after_send(tp, oskb);
                 tcp_rate_skb_sent(sk, oskb);
         }
         return err;
@@ -1328,6 +1338,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
         /* Link BUFF into the send queue. */
         __skb_header_release(buff);
         tcp_insert_write_queue_after(skb, buff, sk);
+       list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);
  
         return 0;
  }
@@ -2260,7 +2271,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
  
                 if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
                         /* "skb_mstamp" is used as a start point for the retransmit timer */
-                       skb->skb_mstamp = tp->tcp_mstamp;
+                       tcp_update_skb_after_send(tp, skb);
                         goto repair; /* Skip network transmission */
                 }
  
@@ -2838,11 +2849,14 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
                      skb_headroom(skb) >= 0xFFFF)) {
                 struct sk_buff *nskb;
  
-               nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
-               err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
-                            -ENOBUFS;
+               tcp_skb_tsorted_save(skb) {
+                       nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
+                       err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
+                                    -ENOBUFS;
+               } tcp_skb_tsorted_restore(skb);
+
                 if (!err)
-                       skb->skb_mstamp = tp->tcp_mstamp;
+                       tcp_update_skb_after_send(tp, skb);
         } else {
                 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
         }
@@ -3023,6 +3037,7 @@ coalesce:
                                 goto coalesce;
                         return;
                 }
+               INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
                 skb_reserve(skb, MAX_TCP_HEADER);
                 sk_forced_mem_schedule(sk, skb->truesize);
                 /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
@@ -3078,9 +3093,14 @@ int tcp_send_synack(struct sock *sk)
         }
         if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
                 if (skb_cloned(skb)) {
-                       struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
+                       struct sk_buff *nskb;
+
+                       tcp_skb_tsorted_save(skb) {
+                               nskb = skb_copy(skb, GFP_ATOMIC);
+                       } tcp_skb_tsorted_restore(skb);
                         if (!nskb)
                                 return -ENOMEM;
+                       INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
                         tcp_unlink_write_queue(skb, sk);
                         __skb_header_release(nskb);
                         __tcp_add_write_queue_head(sk, nskb);
author	Eric Dumazet <[email protected]>
	Wed, 4 Oct 2017 19:59:58 +0000 (12:59 -0700)
committer	David S. Miller <[email protected]>
	Fri, 6 Oct 2017 04:24:47 +0000 (21:24 -0700)
include/linux/skbuff.h		patch \| blob \| history
include/linux/tcp.h		patch \| blob \| history
include/net/tcp.h		patch \| blob \| history
net/ipv4/tcp.c		patch \| blob \| history
net/ipv4/tcp_input.c		patch \| blob \| history
net/ipv4/tcp_minisocks.c		patch \| blob \| history
net/ipv4/tcp_output.c		patch \| blob \| history