0041-net-tcp_bbr-v3-update-TCP-bbr-congestion-control-mod.patch
From 0a9cc70776e594e183285ce455eadc1a0c9cabba Mon Sep 17 00:00:00 2001
From: Neal Cardwell <[email protected]>
Date: Tue, 11 Jun 2019 12:54:22 -0400
Subject: [PATCH 16/18] net-tcp_bbr: v3: update TCP "bbr" congestion control
module to BBRv3
BBR v3 is an enhancement to the BBR v1 algorithm. It's designed to aim for lower
queues, lower loss, and better Reno/CUBIC coexistence than BBR v1.
BBR v3 maintains the core of BBR v1: an explicit model of the network
path that is two-dimensional, adapting to estimate the (a) maximum
available bandwidth and (b) maximum safe volume of data a flow can
keep in-flight in the network. It maintains the estimated BDP as a
core guide for estimating an appropriate level of in-flight data.
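As a rough illustration of how that estimate guides inflight (a simplified,
user-space rendering of the bbr_bdp() logic in the diff below; the scale
constants mirror BBR_SCALE/BW_SCALE from tcp_bbr.c, and the function name is
ours, not the kernel's):

    #include <stdint.h>

    #define BBR_SCALE 8                 /* gains are scaled by 1/2^BBR_SCALE */
    #define BW_SCALE  24                /* bw is in pkts/uSec << BW_SCALE */
    #define BW_UNIT   (1 << BW_SCALE)

    /* Estimated BDP in packets: (bw * min_rtt), with a gain applied and the
     * result rounded up, in the spirit of the kernel's bbr_bdp().
     */
    static uint32_t bdp_sketch(uint32_t bw, uint32_t min_rtt_us, int gain)
    {
            uint64_t w = (uint64_t)bw * min_rtt_us;

            return (uint32_t)((((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT);
    }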
BBR v3 makes several key enhancements:
o Its bandwidth-probing time scale is adapted, within bounds, to allow improved
coexistence with Reno and CUBIC. The bandwidth-probing time scale is (a)
extended dynamically based on estimated BDP to improve coexistence with
Reno/CUBIC; (b) bounded by an interactive wall-clock time-scale to be more
scalable and responsive than Reno and CUBIC.
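Concretely, the wall-clock bound works out to a randomized wait of roughly a
couple of seconds between probes (the constants below mirror
bbr_bw_probe_base_us and bbr_bw_probe_rand_us from the diff; this is an
illustrative user-space sketch, not the kernel code):

    #include <stdint.h>
    #include <stdlib.h>

    /* Wall-clock wait before the next clock-driven bandwidth probe:
     * a 2-second base plus up to 1 second of random spread.
     */
    static uint32_t pick_probe_wait_us(void)
    {
            return 2u * 1000 * 1000 + (uint32_t)(rand() % (1000 * 1000));
    }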
o Rather than being largely agnostic to loss and ECN marks, it explicitly uses
loss and (DCTCP-style) ECN signals to maintain its model.
o It aims for lower losses than v1 by adjusting its model to attempt to stay
within loss rate and ECN mark rate bounds (loss_thresh and ecn_thresh,
respectively).
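For concreteness, the "too high" check during bandwidth probing can be sketched
roughly as follows (the thresholds correspond to bbr_loss_thresh = 2% and
bbr_ecn_thresh = 50% in the diff; the helper name and argument set are
simplifications of the kernel's per-rate-sample check):

    #include <stdbool.h>
    #include <stdint.h>

    /* Estimate whether a probing round exceeded the loss or ECN-mark bounds:
     * losses above ~2% of the data that was in flight, or CE marks on at
     * least half of the delivered packets.
     */
    static bool probe_too_high(uint64_t lost, uint64_t tx_in_flight,
                               uint64_t delivered_ce, uint64_t delivered)
    {
            return (lost > 0 && lost * 100 > tx_in_flight * 2) ||
                   (delivered > 0 && delivered_ce * 2 >= delivered);
    }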
o It adapts to loss/ECN signals even when the application is running out of
data ("application-limited"), in case the "application-limited" flow is also
"network-limited" (the bw and/or inflight available to this flow is lower than
previously estimated when the flow ran out of data).
o It has a three-part model: the model explicitly tracks three operating points,
where an operating point is a tuple: (bandwidth, inflight). The three operating
points are:
o latest: the latest measurement from the current round trip
o upper bound: robust, optimistic, long-term upper bound
o lower bound: robust, conservative, short-term lower bound
These are stored in the following state variables:
o latest: bw_latest, inflight_latest
o lo: bw_lo, inflight_lo
o hi: bw_hi[2], inflight_hi
To gain intuition about the meaning of the three operating points, it
may help to consider the analogs in CUBIC, which has a somewhat
analogous three-part model used by its probing state machine:
BBR param CUBIC param
----------- -------------
latest ~ cwnd
lo ~ ssthresh
hi ~ last_max_cwnd
The analogy is only a loose one, though, since the BBR operating
points are calculated differently, and are 2-dimensional (bw,inflight)
rather than CUBIC's one-dimensional notion of operating point
(inflight).
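In code terms, the bandwidth actually used for pacing is the long-term maximum
capped by the short-term lower bound, roughly as below (mirroring the
bbr_max_bw()/bbr_bw() helpers in the diff, in a stand-alone form for
illustration):

    #include <stdint.h>

    /* Effective bw estimate: max of the two bw_hi filter windows, capped by
     * the conservative short-term lower bound bw_lo.
     */
    static uint32_t effective_bw(const uint32_t bw_hi[2], uint32_t bw_lo)
    {
            uint32_t max_bw = bw_hi[0] > bw_hi[1] ? bw_hi[0] : bw_hi[1];

            return max_bw < bw_lo ? max_bw : bw_lo;
    }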
o It uses the three-part model to adapt the magnitude of its bandwidth probing
to match the estimated space available in the buffer, rather than (as
in BBR v1) assuming that it was always acceptable to place 0.25*BDP in
the bottleneck buffer when probing (commodity datacenter switches
commonly do not have that much buffer for WAN flows). When BBR v3
estimates it hit a buffer limit during probing, its bandwidth probing
then starts gently in case little space is still available in the
buffer, and then accelerates, slowly at first and then rapidly if it
can grow inflight without seeing congestion signals. In such cases,
probing is bounded by inflight_hi + inflight_probe, where
inflight_probe grows as: [0, 1, 2, 4, 8, 16,...]. This allows BBR to
keep losses low and bounded if a bottleneck remains congested, while
rapidly/scalably utilizing free bandwidth when it becomes available.
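A minimal sketch of that growth schedule (the helper name and round counter are
illustrative; the actual patch drives this via bw_probe_up_rounds and
bw_probe_up_cnt/acks):

    #include <stdint.h>

    /* Extra probing allowance beyond inflight_hi for a given round of
     * PROBE_UP: 0, 1, 2, 4, 8, 16, ... packets.
     */
    static uint32_t inflight_probe_for_round(uint32_t probe_up_rounds)
    {
            return probe_up_rounds == 0 ? 0 : 1u << (probe_up_rounds - 1);
    }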
o It has a slightly revised state machine, to achieve the goals above.
BBR_BW_PROBE_UP: pushes up inflight to probe for bw/vol
BBR_BW_PROBE_DOWN: drain excess inflight from the queue
BBR_BW_PROBE_CRUISE: use pipe, w/ headroom in queue/pipe
BBR_BW_PROBE_REFILL: try to refill the pipe again to 100%, leaving queue empty
o The estimated BDP: BBR v3 continues to maintain an estimate of the
path's two-way propagation delay, by tracking a windowed min_rtt, and
coordinating (on an as-needed basis) to try to expose the two-way
propagation delay by draining the bottleneck queue.
BBR v3 continues to use its min_rtt and (currently-applicable) bandwidth
estimate to estimate the current bandwidth-delay product. The estimated BDP
still provides one important guideline for bounding inflight data. However,
because any min-filtered RTT and max-filtered bw inherently tend to both
overestimate, the estimated BDP is often too high; in this case loss or ECN
marks can ensue, in which case BBR v3 adjusts inflight_hi and inflight_lo to
adapt its sending rate and inflight down to match the available capacity of the
path.
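As one concrete example of that adaptation, the lower bounds are scaled down by
a beta factor on rounds with loss, roughly as follows (beta corresponds to
bbr_beta, 30% by default, in the diff; the real code also floors the result
against the latest per-round measurements):

    #include <stdint.h>

    #define UNIT 256                    /* corresponds to BBR_UNIT (1 << 8) */

    /* Scale a lower bound (bw_lo or inflight_lo) down to (1 - beta) of its
     * previous value after a round trip that saw packet loss.
     */
    static uint32_t cut_lower_bound(uint32_t lo, uint32_t beta)
    {
            return (uint32_t)(((uint64_t)lo * (UNIT - beta)) / UNIT);
    }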
o Space: Note that ICSK_CA_PRIV_SIZE increased. This is because BBR v3
requires more space. Note that much of the space is due to support for
per-socket parameterization and debugging in this release, for research
purposes. With that state removed, the full "struct bbr" is 140
bytes, or 144 with padding. This is an increase of 40 bytes over the
existing ca_priv space.
o Code: BBR v3 reuses many pieces from BBR v1. But it omits the following
significant pieces:
o "packet conservation" (bbr_set_cwnd_to_recover_or_restore(),
bbr_can_grow_inflight())
o long-term bandwidth estimator ("policer mode")
The code layout tries to keep BBR v3 code near the bottom of the
file, so that v1-applicable code at the top does not accidentally
refer to v3 code.
o Docs:
See the following docs for more details and diagrams describing the BBR v3
algorithm:
https://datatracker.ietf.org/meeting/104/materials/slides-104-iccrg-an-update-on-bbr-00
https://datatracker.ietf.org/meeting/102/materials/slides-102-iccrg-an-update-on-bbr-work-at-google-00
o Internal notes:
For this upstream rebase, Neal started from:
git show fed518041ac6:net/ipv4/tcp_bbr.c > net/ipv4/tcp_bbr.c
then removed dev instrumentation (dynamic get/set for parameters)
and code that was only used by BBRv1
Effort: net-tcp_bbr
Origin-9xx-SHA1: 2c84098e60bed6d67dde23cd7538c51dee273102
Change-Id: I125cf26ba2a7a686f2fa5e87f4c2afceb65f7a05
Signed-off-by: Alexandre Frade <[email protected]>
---
include/net/inet_connection_sock.h | 4 +-
include/net/tcp.h | 2 +-
include/uapi/linux/inet_diag.h | 23 +
net/ipv4/Kconfig | 21 +-
net/ipv4/tcp_bbr.c | 2217 +++++++++++++++++++++-------
5 files changed, 1742 insertions(+), 525 deletions(-)
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 5d2fcc137b88..3f7d429f73e5 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -135,8 +135,8 @@ struct inet_connection_sock {
u32 icsk_probes_tstamp;
u32 icsk_user_timeout;
- u64 icsk_ca_priv[104 / sizeof(u64)];
-#define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv)
+#define ICSK_CA_PRIV_SIZE (144)
+ u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)];
};
#define ICSK_TIME_RETRANS 1 /* Retransmit timer */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 5398a2b09e25..7db5d4df2a52 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2260,7 +2260,7 @@ struct tcp_plb_state {
u8 consec_cong_rounds:5, /* consecutive congested rounds */
unused:3;
u32 pause_until; /* jiffies32 when PLB can resume rerouting */
-};
+} __attribute__ ((__packed__));
static inline void tcp_plb_init(const struct sock *sk,
struct tcp_plb_state *plb)
diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index 50655de04c9b..82f8bd8f0d16 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -229,6 +229,29 @@ struct tcp_bbr_info {
__u32 bbr_min_rtt; /* min-filtered RTT in uSec */
__u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */
__u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */
+ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */
+ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */
+ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */
+ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */
+ __u8 bbr_mode; /* current bbr_mode in state machine */
+ __u8 bbr_phase; /* current state machine phase */
+ __u8 unused1; /* alignment padding; not used yet */
+ __u8 bbr_version; /* BBR algorithm version */
+ __u32 bbr_inflight_lo; /* lower short-term data volume bound */
+ __u32 bbr_inflight_hi; /* higher long-term data volume bound */
+ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */
+};
+
+/* TCP BBR congestion control bbr_phase as reported in netlink/ss stats. */
+enum tcp_bbr_phase {
+ BBR_PHASE_INVALID = 0,
+ BBR_PHASE_STARTUP = 1,
+ BBR_PHASE_DRAIN = 2,
+ BBR_PHASE_PROBE_RTT = 3,
+ BBR_PHASE_PROBE_BW_UP = 4,
+ BBR_PHASE_PROBE_BW_DOWN = 5,
+ BBR_PHASE_PROBE_BW_CRUISE = 6,
+ BBR_PHASE_PROBE_BW_REFILL = 7,
};
union tcp_cc_info {
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 2dfb12230f08..2e14db3bee70 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -668,15 +668,18 @@ config TCP_CONG_BBR
default n
help
- BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to
- maximize network utilization and minimize queues. It builds an explicit
- model of the bottleneck delivery rate and path round-trip propagation
- delay. It tolerates packet loss and delay unrelated to congestion. It
- can operate over LAN, WAN, cellular, wifi, or cable modem links. It can
- coexist with flows that use loss-based congestion control, and can
- operate with shallow buffers, deep buffers, bufferbloat, policers, or
- AQM schemes that do not provide a delay signal. It requires the fq
- ("Fair Queue") pacing packet scheduler.
+ BBR (Bottleneck Bandwidth and RTT) TCP congestion control is a
+ model-based congestion control algorithm that aims to maximize
+ network utilization, keep queues and retransmit rates low, and to be
+ able to coexist with Reno/CUBIC in common scenarios. It builds an
+ explicit model of the network path. It tolerates a targeted degree
+ of random packet loss and delay. It can operate over LAN, WAN,
+ cellular, wifi, or cable modem links, and can use shallow-threshold
+ ECN signals. It can coexist to some degree with flows that use
+ loss-based congestion control, and can operate with shallow buffers,
+ deep buffers, bufferbloat, policers, or AQM schemes that do not
+ provide a delay signal. It requires pacing, using either TCP internal
+ pacing or the fq ("Fair Queue") pacing packet scheduler.
choice
prompt "Default TCP congestion control"
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index cd31cd8cdc69..4fec37e8f900 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -1,18 +1,19 @@
-/* Bottleneck Bandwidth and RTT (BBR) congestion control
+/* BBR (Bottleneck Bandwidth and RTT) congestion control
*
- * BBR congestion control computes the sending rate based on the delivery
- * rate (throughput) estimated from ACKs. In a nutshell:
+ * BBR is a model-based congestion control algorithm that aims for low queues,
+ * low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model of the
+ * network path, it uses measurements of bandwidth and RTT, as well as (if they
+ * occur) packet loss and/or shallow-threshold ECN signals. Note that although
+ * it can use ECN or loss signals explicitly, it does not require either; it
+ * can bound its in-flight data based on its estimate of the BDP.
*
- * On each ACK, update our model of the network path:
- * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips)
- * min_rtt = windowed_min(rtt, 10 seconds)
- * pacing_rate = pacing_gain * bottleneck_bandwidth
- * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4)
- *
- * The core algorithm does not react directly to packet losses or delays,
- * although BBR may adjust the size of next send per ACK when loss is
- * observed, or adjust the sending rate if it estimates there is a
- * traffic policer, in order to keep the drop rate reasonable.
+ * The model has both higher and lower bounds for the operating range:
+ * lo: bw_lo, inflight_lo: conservative short-term lower bound
+ * hi: bw_hi, inflight_hi: robust long-term upper bound
+ * The bandwidth-probing time scale is (a) extended dynamically based on
+ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by
+ * an interactive wall-clock time-scale to be more scalable and responsive
+ * than Reno and CUBIC.
*
* Here is a state transition diagram for BBR:
*
@@ -65,6 +66,13 @@
#include <linux/random.h>
#include <linux/win_minmax.h>
+#include <trace/events/tcp.h>
+#include "tcp_dctcp.h"
+
+#define BBR_VERSION 3
+
+#define bbr_param(sk,name) (bbr_ ## name)
+
/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth
* estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps.
* This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32.
@@ -85,36 +93,41 @@ enum bbr_mode {
BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */
};
+/* How does the incoming ACK stream relate to our bandwidth probing? */
+enum bbr_ack_phase {
+ BBR_ACKS_INIT, /* not probing; not getting probe feedback */
+ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */
+ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */
+ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */
+ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */
+};
+
/* BBR congestion control block */
struct bbr {
u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */
u32 min_rtt_stamp; /* timestamp of min_rtt_us */
u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */
- struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */
- u32 rtt_cnt; /* count of packet-timed rounds elapsed */
+ u32 probe_rtt_min_us; /* min RTT in probe_rtt_win_ms win */
+ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/
u32 next_rtt_delivered; /* scb->tx.delivered at end of round */
u64 cycle_mstamp; /* time of this cycle phase start */
- u32 mode:3, /* current bbr_mode in state machine */
+ u32 mode:2, /* current bbr_mode in state machine */
prev_ca_state:3, /* CA state on previous ACK */
- packet_conservation:1, /* use packet conservation? */
round_start:1, /* start of packet-timed tx->ack round? */
+ ce_state:1, /* If most recent data has CE bit set */
+ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */
+ try_fast_path:1, /* can we take fast path? */
idle_restart:1, /* restarting after idle? */
probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */
- unused:13,
- lt_is_sampling:1, /* taking long-term ("LT") samples now? */
- lt_rtt_cnt:7, /* round trips in long-term interval */
- lt_use_bw:1; /* use lt_bw as our bw estimate? */
- u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */
- u32 lt_last_delivered; /* LT intvl start: tp->delivered */
- u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */
- u32 lt_last_lost; /* LT intvl start: tp->lost */
+ init_cwnd:7, /* initial cwnd */
+ unused_1:10;
u32 pacing_gain:10, /* current gain for setting pacing rate */
cwnd_gain:10, /* current gain for setting cwnd */
full_bw_reached:1, /* reached full bw in Startup? */
full_bw_cnt:2, /* number of rounds without large bw gains */
- cycle_idx:3, /* current index in pacing_gain cycle array */
+ cycle_idx:2, /* current index in pacing_gain cycle array */
has_seen_rtt:1, /* have we seen an RTT sample yet? */
- unused_b:5;
+ unused_2:6;
u32 prior_cwnd; /* prior cwnd upon entering loss recovery */
u32 full_bw; /* recent bw, to estimate if pipe is full */
@@ -124,19 +137,67 @@ struct bbr {
u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */
extra_acked_win_rtts:5, /* age of extra_acked, in round trips */
extra_acked_win_idx:1, /* current index in extra_acked array */
- unused_c:6;
+ /* BBR v3 state: */
+ full_bw_now:1, /* recently reached full bw plateau? */
+ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */
+ loss_in_cycle:1, /* packet loss in this cycle? */
+ ecn_in_cycle:1, /* ECN in this cycle? */
+ unused_3:1;
+ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */
+ u32 undo_bw_lo; /* bw_lo before latest losses */
+ u32 undo_inflight_lo; /* inflight_lo before latest losses */
+ u32 undo_inflight_hi; /* inflight_hi before latest losses */
+ u32 bw_latest; /* max delivered bw in last round trip */
+ u32 bw_lo; /* lower bound on sending bandwidth */
+ u32 bw_hi[2]; /* max recent measured bw sample */
+ u32 inflight_latest; /* max delivered data in last round trip */
+ u32 inflight_lo; /* lower bound of inflight data range */
+ u32 inflight_hi; /* upper bound of inflight data range */
+ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */
+ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */
+ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */
+ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */
+ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */
+ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */
+ bw_probe_samples:1, /* rate samples reflect bw probing? */
+ prev_probe_too_high:1, /* did last PROBE_UP go too high? */
+ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */
+ rounds_since_probe:8, /* packet-timed rounds since probed bw */
+ loss_round_start:1, /* loss_round_delivered round trip? */
+ loss_in_round:1, /* loss marked in this round trip? */
+ ecn_in_round:1, /* ECN marked in this round trip? */
+ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */
+ loss_events_in_round:4,/* losses in STARTUP round */
+ initialized:1; /* has bbr_init() been called? */
+ u32 alpha_last_delivered; /* tp->delivered at alpha update */
+ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */
+
+ u8 unused_4; /* to preserve alignment */
+ struct tcp_plb_state plb;
};
-#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */
+struct bbr_context {
+ u32 sample_bw;
+};
-/* Window length of bw filter (in rounds): */
-static const int bbr_bw_rtts = CYCLE_LEN + 2;
/* Window length of min_rtt filter (in sec): */
static const u32 bbr_min_rtt_win_sec = 10;
/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */
static const u32 bbr_probe_rtt_mode_ms = 200;
-/* Skip TSO below the following bandwidth (bits/sec): */
-static const int bbr_min_tso_rate = 1200000;
+/* Window length of probe_rtt_min_us filter (in ms), and consequently the
+ * typical interval between PROBE_RTT mode entries. The default is 5000ms.
+ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC
+ */
+static const u32 bbr_probe_rtt_win_ms = 5000;
+/* Proportion of cwnd to estimated BDP in PROBE_RTT, in units of BBR_UNIT: */
+static const u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2;
+
+/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting
+ * in bigger TSO bursts. We cut the RTT-based allowance in half
+ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance
+ * is below 1500 bytes after 6 * ~500 usec = 3ms.
+ */
+static const u32 bbr_tso_rtt_shift = 9;
/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck.
* In order to help drive the network toward lower queues and low latency while
@@ -146,13 +207,15 @@ static const int bbr_min_tso_rate = 1200000;
*/
static const int bbr_pacing_margin_percent = 1;
-/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
+/* We use a startup_pacing_gain of 4*ln(2) because it's the smallest value
* that will allow a smoothly increasing pacing rate that will double each RTT
* and send the same number of packets per RTT that an un-paced, slow-starting
* Reno or CUBIC flow would:
*/
-static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1;
-/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain
+static const int bbr_startup_pacing_gain = BBR_UNIT * 277 / 100 + 1;
+/* The gain for deriving startup cwnd: */
+static const int bbr_startup_cwnd_gain = BBR_UNIT * 2;
+/* The pacing gain in BBR_DRAIN is calculated to typically drain
* the queue created in BBR_STARTUP in a single round:
*/
static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885;
@@ -160,13 +223,17 @@ static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885;
static const int bbr_cwnd_gain = BBR_UNIT * 2;
/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */
static const int bbr_pacing_gain[] = {
- BBR_UNIT * 5 / 4, /* probe for more available bw */
- BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */
- BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */
- BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */
+ BBR_UNIT * 5 / 4, /* UP: probe for more available bw */
+ BBR_UNIT * 91 / 100, /* DOWN: drain queue and/or yield bw */
+ BBR_UNIT, /* CRUISE: try to use pipe w/ some headroom */
+ BBR_UNIT, /* REFILL: refill pipe to estimated 100% */
+};
+enum bbr_pacing_gain_phase {
+ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */
+ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */
+ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */
+ BBR_BW_PROBE_REFILL = 3, /* v2: refill the pipe again to 100% */
};
-/* Randomize the starting gain cycling phase over N phases: */
-static const u32 bbr_cycle_rand = 7;
/* Try to keep at least this many packets in flight, if things go smoothly. For
* smooth functioning, a sliding window protocol ACKing every other packet
@@ -174,24 +241,12 @@ static const u32 bbr_cycle_rand = 7;
*/
static const u32 bbr_cwnd_min_target = 4;
-/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */
+/* To estimate if BBR_STARTUP or BBR_BW_PROBE_UP has filled pipe... */
/* If bw has increased significantly (1.25x), there may be more bw available: */
static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4;
/* But after 3 rounds w/o significant bw growth, estimate pipe is full: */
static const u32 bbr_full_bw_cnt = 3;
-/* "long-term" ("LT") bandwidth estimator parameters... */
-/* The minimum number of rounds in an LT bw sampling interval: */
-static const u32 bbr_lt_intvl_min_rtts = 4;
-/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */
-static const u32 bbr_lt_loss_thresh = 50;
-/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */
-static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8;
-/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */
-static const u32 bbr_lt_bw_diff = 4000 / 8;
-/* If we estimate we're policed, use lt_bw for this many round trips: */
-static const u32 bbr_lt_bw_max_rtts = 48;
-
/* Gain factor for adding extra_acked to target cwnd: */
static const int bbr_extra_acked_gain = BBR_UNIT;
/* Window length of extra_acked window. */
@@ -201,8 +256,121 @@ static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20;
/* Time period for clamping cwnd increment due to ack aggregation */
static const u32 bbr_extra_acked_max_us = 100 * 1000;
+/* Flags to control BBR ECN-related behavior... */
+
+/* Ensure ACKs only ACK packets with consistent ECN CE status? */
+static const bool bbr_precise_ece_ack = true;
+
+/* Max RTT (in usec) at which to use sender-side ECN logic.
+ * Disabled when 0 (ECN allowed at any RTT).
+ */
+static const u32 bbr_ecn_max_rtt_us = 5000;
+
+/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE.
+ * No loss response when 0.
+ */
+static const u32 bbr_beta = BBR_UNIT * 30 / 100;
+
+/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE (1/16 = 6.25%) */
+static const u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16;
+
+/* The initial value for ecn_alpha; 1.0 allows a flow to respond quickly
+ * to congestion if the bottleneck is congested when the flow starts up.
+ */
+static const u32 bbr_ecn_alpha_init = BBR_UNIT;
+
+/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE.
+ * No ECN based bounding when 0.
+ */
+static const u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */
+
+/* Estimate bw probing has gone too far if CE ratio exceeds this threshold.
+ * Scaled by BBR_SCALE. Disabled when 0.
+ */
+static const u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */
+
+/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN
+ * clears then make the first round's increment to inflight_hi the following
+ * fraction of inflight_hi.
+ */
+static const u32 bbr_ecn_reprobe_gain = BBR_UNIT * 1 / 2;
+
+/* Estimate bw probing has gone too far if loss rate exceeds this level. */
+static const u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */
+
+/* Slow down for a packet loss recovered by TLP? */
+static const bool bbr_loss_probe_recovery = true;
+
+/* Exit STARTUP if number of loss marking events in a Recovery round is >= N,
+ * and loss rate is higher than bbr_loss_thresh.
+ * Disabled if 0.
+ */
+static const u32 bbr_full_loss_cnt = 6;
+
+/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh
+ * meets this count.
+ */
+static const u32 bbr_full_ecn_cnt = 2;
+
+/* Fraction of unutilized headroom to try to leave in path upon high loss. */
+static const u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100;
+
+/* How much do we increase cwnd_gain when probing for bandwidth in
+ * BBR_BW_PROBE_UP? This specifies the increment in units of
+ * BBR_UNIT/4. The default is 1, meaning 0.25.
+ * The min value is 0 (meaning 0.0); max is 3 (meaning 0.75).
+ */
+static const u32 bbr_bw_probe_cwnd_gain = 1;
+
+/* Max number of packet-timed rounds to wait before probing for bandwidth. If
+ * we want to tolerate 1% random loss per round, and not have this cut our
+ * inflight too much, we must probe for bw periodically on roughly this scale.
+ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance.
+ * We aim to be fair with Reno/CUBIC up to a BDP of at least:
+ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets
+ */
+static const u32 bbr_bw_probe_max_rounds = 63;
+
+/* Max amount of randomness to inject in round counting for Reno-coexistence.
+ */
+static const u32 bbr_bw_probe_rand_rounds = 2;
+
+/* Use BBR-native probe time scale starting at this many usec.
+ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least:
+ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs
+ */
+static const u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */
+
+/* Use BBR-native probes spread over this many usec: */
+static const u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */
+
+/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */
+static const bool bbr_fast_path = true;
+
+/* Use fast ack mode? */
+static const bool bbr_fast_ack_mode = true;
+
+static u32 bbr_max_bw(const struct sock *sk);
+static u32 bbr_bw(const struct sock *sk);
+static void bbr_exit_probe_rtt(struct sock *sk);
+static void bbr_reset_congestion_signals(struct sock *sk);
+static void bbr_run_loss_probe_recovery(struct sock *sk);
+
static void bbr_check_probe_rtt_done(struct sock *sk);
+/* This connection can use ECN if both endpoints have signaled ECN support in
+ * the handshake and the per-route settings indicated this is a
+ * shallow-threshold ECN environment, meaning both:
+ * (a) ECN CE marks indicate low-latency/shallow-threshold congestion, and
+ * (b) TCP endpoints provide precise ACKs that only ACK data segments
+ * with consistent ECN CE status
+ */
+static bool bbr_can_use_ecn(const struct sock *sk)
+{
+ return (tcp_sk(sk)->ecn_flags & TCP_ECN_OK) &&
+ (tcp_sk(sk)->ecn_flags & TCP_ECN_LOW);
+}
+
/* Do we estimate that STARTUP filled the pipe? */
static bool bbr_full_bw_reached(const struct sock *sk)
{
@@ -214,17 +382,17 @@ static bool bbr_full_bw_reached(const struct sock *sk)
/* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */
static u32 bbr_max_bw(const struct sock *sk)
{
- struct bbr *bbr = inet_csk_ca(sk);
+ const struct bbr *bbr = inet_csk_ca(sk);
- return minmax_get(&bbr->bw);
+ return max(bbr->bw_hi[0], bbr->bw_hi[1]);
}
/* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */
static u32 bbr_bw(const struct sock *sk)
{
- struct bbr *bbr = inet_csk_ca(sk);
+ const struct bbr *bbr = inet_csk_ca(sk);
- return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk);
+ return min(bbr_max_bw(sk), bbr->bw_lo);
}
/* Return maximum extra acked in past k-2k round trips,
@@ -241,15 +409,23 @@ static u16 bbr_extra_acked(const struct sock *sk)
* The order here is chosen carefully to avoid overflow of u64. This should
* work for input rates of up to 2.9Tbit/sec and gain of 2.89x.
*/
-static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
+static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain,
+ int margin)
{
unsigned int mss = tcp_sk(sk)->mss_cache;
rate *= mss;
rate *= gain;
rate >>= BBR_SCALE;
- rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent);
- return rate >> BW_SCALE;
+ rate *= USEC_PER_SEC / 100 * (100 - margin);
+ rate >>= BW_SCALE;
+ rate = max(rate, 1ULL);
+ return rate;
+}
+
+static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate)
+{
+ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0);
}
/* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */
@@ -257,12 +433,13 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
{
u64 rate = bw;
- rate = bbr_rate_bytes_per_sec(sk, rate, gain);
+ rate = bbr_rate_bytes_per_sec(sk, rate, gain,
+ bbr_pacing_margin_percent);
rate = min_t(u64, rate, sk->sk_max_pacing_rate);
return rate;
}
-/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */
+/* Initialize pacing rate to: startup_pacing_gain * init_cwnd / RTT. */
static void bbr_init_pacing_rate_from_rtt(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -278,7 +455,8 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk)
}
bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT;
do_div(bw, rtt_us);
- sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain);
+ sk->sk_pacing_rate =
+ bbr_bw_to_pacing_rate(sk, bw, bbr_param(sk, startup_pacing_gain));
}
/* Pace using current bw estimate and a gain factor. */
@@ -294,31 +472,38 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
sk->sk_pacing_rate = rate;
}
-/* override sysctl_tcp_min_tso_segs */
-__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk)
-{
- return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2;
-}
-
-/* Return the number of segments BBR would like in a TSO/GSO skb, given
- * a particular max gso size as a constraint.
+/* Return the number of segments BBR would like in a TSO/GSO skb, given a
+ * particular max gso size as a constraint. TODO: make this simpler and more
+ * consistent by switching bbr to just call tcp_tso_autosize().
*/
static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now,
u32 gso_max_size)
{
- u32 segs;
+ struct bbr *bbr = inet_csk_ca(sk);
+ u32 segs, r;
u64 bytes;
/* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */
bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift;
+ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every
+ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst.
+ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K)
+ */
+ if (bbr_param(sk, tso_rtt_shift)) {
+ r = bbr->min_rtt_us >> bbr_param(sk, tso_rtt_shift);
+ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */
+ bytes += GSO_LEGACY_MAX_SIZE >> r;
+ }
+
bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER);
- segs = max_t(u32, bytes / mss_now, bbr_min_tso_segs(sk));
+ segs = max_t(u32, bytes / mss_now,
+ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
return segs;
}
/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */
-static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now)
+__bpf_kfunc static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now)
{
return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size);
}
@@ -328,7 +513,7 @@ static u32 bbr_tso_segs_goal(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE);
+ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE);
}
/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */
@@ -348,7 +533,9 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
struct tcp_sock *tp = tcp_sk(sk);
struct bbr *bbr = inet_csk_ca(sk);
- if (event == CA_EVENT_TX_START && tp->app_limited) {
+ if (event == CA_EVENT_TX_START) {
+ if (!tp->app_limited)
+ return;
bbr->idle_restart = 1;
bbr->ack_epoch_mstamp = tp->tcp_mstamp;
bbr->ack_epoch_acked = 0;
@@ -359,6 +546,16 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT);
else if (bbr->mode == BBR_PROBE_RTT)
bbr_check_probe_rtt_done(sk);
+ } else if ((event == CA_EVENT_ECN_IS_CE ||
+ event == CA_EVENT_ECN_NO_CE) &&
+ bbr_can_use_ecn(sk) &&
+ bbr_param(sk, precise_ece_ack)) {
+ u32 state = bbr->ce_state;
+ dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state);
+ bbr->ce_state = state;
+ } else if (event == CA_EVENT_TLP_RECOVERY &&
+ bbr_param(sk, loss_probe_recovery)) {
+ bbr_run_loss_probe_recovery(sk);
}
}
@@ -381,10 +578,10 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain)
* default. This should only happen when the connection is not using TCP
* timestamps and has retransmitted all of the SYN/SYNACK/data packets
* ACKed so far. In this case, an RTO can cut cwnd to 1, in which
- * case we need to slow-start up toward something safe: TCP_INIT_CWND.
+ * case we need to slow-start up toward something safe: initial cwnd.
*/
if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */
- return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/
+ return bbr->init_cwnd; /* be safe: cap at initial cwnd */
w = (u64)bw * bbr->min_rtt_us;
@@ -401,23 +598,23 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain)
* - one skb in sending host Qdisc,
* - one skb in sending host TSO/GSO engine
* - one skb being received by receiver host LRO/GRO/delayed-ACK engine
- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because
- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets,
+ * Don't worry, at low rates this won't bloat cwnd because
+ * in such cases tso_segs_goal is small. The minimum cwnd is 4 packets,
* which allows 2 outstanding 2-packet sequences, to try to keep pipe
* full even with ACK-every-other-packet delayed ACKs.
*/
static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd)
{
struct bbr *bbr = inet_csk_ca(sk);
+ u32 tso_segs_goal;
- /* Allow enough full-sized skbs in flight to utilize end systems. */
- cwnd += 3 * bbr_tso_segs_goal(sk);
-
- /* Reduce delayed ACKs by rounding up cwnd to the next even number. */
- cwnd = (cwnd + 1) & ~1U;
+ tso_segs_goal = 3 * bbr_tso_segs_goal(sk);
+ /* Allow enough full-sized skbs in flight to utilize end systems. */
+ cwnd = max_t(u32, cwnd, tso_segs_goal);
+ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target));
/* Ensure gain cycling gets inflight above BDP even for small BDPs. */
- if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0)
+ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP)
cwnd += 2;
return cwnd;
@@ -472,10 +669,10 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk)
{
u32 max_aggr_cwnd, aggr_cwnd = 0;
- if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) {
+ if (bbr_param(sk, extra_acked_gain)) {
max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us)
/ BW_UNIT;
- aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk))
+ aggr_cwnd = (bbr_param(sk, extra_acked_gain) * bbr_extra_acked(sk))
>> BBR_SCALE;
aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd);
}
@@ -483,66 +680,27 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk)
return aggr_cwnd;
}
-/* An optimization in BBR to reduce losses: On the first round of recovery, we
- * follow the packet conservation principle: send P packets per P packets acked.
- * After that, we slow-start and send at most 2*P packets per P packets acked.
- * After recovery finishes, or upon undo, we restore the cwnd we had when
- * recovery started (capped by the target cwnd based on estimated BDP).
- *
- * TODO(ycheng/ncardwell): implement a rate-based approach.
- */
-static bool bbr_set_cwnd_to_recover_or_restore(
- struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd)
+/* Returns the cwnd for PROBE_RTT mode. */
+static u32 bbr_probe_rtt_cwnd(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
- struct bbr *bbr = inet_csk_ca(sk);
- u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state;
- u32 cwnd = tcp_snd_cwnd(tp);
-
- /* An ACK for P pkts should release at most 2*P packets. We do this
- * in two steps. First, here we deduct the number of lost packets.
- * Then, in bbr_set_cwnd() we slow start up toward the target cwnd.
- */
- if (rs->losses > 0)
- cwnd = max_t(s32, cwnd - rs->losses, 1);
-
- if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) {
- /* Starting 1st round of Recovery, so do packet conservation. */
- bbr->packet_conservation = 1;
- bbr->next_rtt_delivered = tp->delivered; /* start round now */
- /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */
- cwnd = tcp_packets_in_flight(tp) + acked;
- } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) {
- /* Exiting loss recovery; restore cwnd saved before recovery. */
- cwnd = max(cwnd, bbr->prior_cwnd);
- bbr->packet_conservation = 0;
- }
- bbr->prev_ca_state = state;
-
- if (bbr->packet_conservation) {
- *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked);
- return true; /* yes, using packet conservation */
- }
- *new_cwnd = cwnd;
- return false;
+ return max_t(u32, bbr_param(sk, cwnd_min_target),
+ bbr_bdp(sk, bbr_bw(sk), bbr_param(sk, probe_rtt_cwnd_gain)));
}
/* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss
* has drawn us down below target), or snap down to target if we're above it.
*/
static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
- u32 acked, u32 bw, int gain)
+ u32 acked, u32 bw, int gain, u32 cwnd,
+ struct bbr_context *ctx)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bbr *bbr = inet_csk_ca(sk);
- u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0;
+ u32 target_cwnd = 0;
if (!acked)
goto done; /* no packet fully ACKed; just apply caps */
- if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd))
- goto done;
-
target_cwnd = bbr_bdp(sk, bw, gain);
/* Increment the cwnd to account for excess ACKed data that seems
@@ -551,74 +709,26 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
target_cwnd += bbr_ack_aggregation_cwnd(sk);
target_cwnd = bbr_quantization_budget(sk, target_cwnd);
- /* If we're below target cwnd, slow start cwnd toward target cwnd. */
- if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */
- cwnd = min(cwnd + acked, target_cwnd);
- else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND)
- cwnd = cwnd + acked;
- cwnd = max(cwnd, bbr_cwnd_min_target);
+ /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */
+ bbr->try_fast_path = 0;
+ if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */
+ cwnd += acked;
+ if (cwnd >= target_cwnd) {
+ cwnd = target_cwnd;
+ bbr->try_fast_path = 1;
+ }
+ } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) {
+ cwnd += acked;
+ } else {
+ bbr->try_fast_path = 1;
+ }
+ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target));
done:
- tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* apply global cap */
+ tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* global cap */
if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */
- tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target));
-}
-
-/* End cycle phase if it's time and/or we hit the phase's in-flight target. */
-static bool bbr_is_next_cycle_phase(struct sock *sk,
- const struct rate_sample *rs)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct bbr *bbr = inet_csk_ca(sk);
- bool is_full_length =
- tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) >
- bbr->min_rtt_us;
- u32 inflight, bw;
-
- /* The pacing_gain of 1.0 paces at the estimated bw to try to fully
- * use the pipe without increasing the queue.
- */
- if (bbr->pacing_gain == BBR_UNIT)
- return is_full_length; /* just use wall clock time */
-
- inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight);
- bw = bbr_max_bw(sk);
-
- /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at
- * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is
- * small (e.g. on a LAN). We do not persist if packets are lost, since
- * a path with small buffers may not hold that much.
- */
- if (bbr->pacing_gain > BBR_UNIT)
- return is_full_length &&
- (rs->losses || /* perhaps pacing_gain*BDP won't fit */
- inflight >= bbr_inflight(sk, bw, bbr->pacing_gain));
-
- /* A pacing_gain < 1.0 tries to drain extra queue we added if bw
- * probing didn't find more bw. If inflight falls to match BDP then we
- * estimate queue is drained; persisting would underutilize the pipe.
- */
- return is_full_length ||
- inflight <= bbr_inflight(sk, bw, BBR_UNIT);
-}
-
-static void bbr_advance_cycle_phase(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct bbr *bbr = inet_csk_ca(sk);
-
- bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1);
- bbr->cycle_mstamp = tp->delivered_mstamp;
-}
-
-/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */
-static void bbr_update_cycle_phase(struct sock *sk,
- const struct rate_sample *rs)
-{
- struct bbr *bbr = inet_csk_ca(sk);
-
- if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs))
- bbr_advance_cycle_phase(sk);
+ tcp_snd_cwnd_set(tp, min_t(u32, tcp_snd_cwnd(tp),
+ bbr_probe_rtt_cwnd(sk)));
}
static void bbr_reset_startup_mode(struct sock *sk)
@@ -628,191 +738,49 @@ static void bbr_reset_startup_mode(struct sock *sk)
bbr->mode = BBR_STARTUP;
}
-static void bbr_reset_probe_bw_mode(struct sock *sk)
-{
- struct bbr *bbr = inet_csk_ca(sk);
-