diff options
| author | David S. Miller <davem@davemloft.net> | 2016-09-21 00:23:09 -0400 |
|---|---|---|
| committer | David S. Miller <davem@davemloft.net> | 2016-09-21 00:23:09 -0400 |
| commit | a624f93ce6623d452e87d8dcf557e7c680822991 (patch) | |
| tree | f6d7ecddbe3fd16a6a48374f2c647d40f25b77e5 /lib | |
| parent | 94d308d060cd3ee65152b8ebd7a1c24fa86eee82 (diff) | |
| parent | 0f8782ea14974ce992618b55f0c041ef43ed0b78 (diff) | |
Merge branch 'tcp-bbr'
Neal Cardwell says:
====================
tcp: BBR congestion control algorithm
This patch series implements a new TCP congestion control algorithm:
BBR (Bottleneck Bandwidth and RTT). A paper with a detailed
description of BBR will be published in ACM Queue, September-October
2016, as "BBR: Congestion-Based Congestion Control". BBR is widely
deployed in production at Google.
The patch series starts with a set of supporting infrastructure
changes, including a few that extend the congestion control
framework. The last patch adds BBR as a TCP congestion control
module. Please see individual patches for the details.
- v3 -> v4:
- Updated tcp_bbr.c in "tcp_bbr: add BBR congestion control"
to use const to qualify all the constant parameters.
Thanks to Stephen Hemminger.
- In "tcp_bbr: add BBR congestion control", remove the bbr_rate_kbps()
function, which had a 64-bit divide that would be problematic on some
architectures, and just use bbr_rate_bytes_per_sec() directly.
Thanks to Kenneth Klette Jonassen for suggesting this.
- In "tcp: switch back to proper tcp_skb_cb size check in tcp_init()",
switched from sizeof(skb->cb) to FIELD_SIZEOF.
Thanks to Lance Richardson for suggesting this.
- Updated "tcp_bbr: add BBR congestion control" commit message with
performance data, more details about deployment at Google, and
another reminder to use fq with BBR.
- Updated tcp_bbr.c in "tcp_bbr: add BBR congestion control"
to use MODULE_LICENSE("Dual BSD/GPL").
- v2 -> v3: fix another issue caught by build bots:
- adjust rate_sample struct initialization syntax to allow gcc-4.4 to compile
the "tcp: track data delivery rate for a TCP connection" patch; also
adjusted some similar syntax in "tcp_bbr: add BBR congestion control"
- v1 -> v2: fix issues caught by build bots:
- fix "tcp: export data delivery rate" to use rate64 instead of rate,
so there is a 64-bit numerator for the do_div call
- fix conflicting definitions for minmax caused by
"tcp: use windowed min filter library for TCP min_rtt estimation"
with a new commit:
tcp: cdg: rename struct minmax in tcp_cdg.c to avoid a naming conflict
- fix warning about the use of __packed in
"tcp: track data delivery rate for a TCP connection",
which involves the addition of a new commit:
tcp: switch back to proper tcp_skb_cb size check in tcp_init()
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'lib')
| -rw-r--r-- | lib/Makefile | 2 | ||||
| -rw-r--r-- | lib/win_minmax.c | 98 |
2 files changed, 99 insertions, 1 deletions
diff --git a/lib/Makefile b/lib/Makefile index 5dc77a8ec297..df747e5eeb7a 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -22,7 +22,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ sha1.o chacha20.o md5.o irq_regs.o argv_split.o \ flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ - earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o + earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o win_minmax.o lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o diff --git a/lib/win_minmax.c b/lib/win_minmax.c new file mode 100644 index 000000000000..c8420d404926 --- /dev/null +++ b/lib/win_minmax.c @@ -0,0 +1,98 @@ +/** + * lib/minmax.c: windowed min/max tracker + * + * Kathleen Nichols' algorithm for tracking the minimum (or maximum) + * value of a data stream over some fixed time interval. (E.g., + * the minimum RTT over the past five minutes.) It uses constant + * space and constant time per update yet almost always delivers + * the same minimum as an implementation that has to keep all the + * data in the window. + * + * The algorithm keeps track of the best, 2nd best & 3rd best min + * values, maintaining an invariant that the measurement time of + * the n'th best >= n-1'th best. It also makes sure that the three + * values are widely separated in the time window since that bounds + * the worse case error when that data is monotonically increasing + * over the window. + * + * Upon getting a new min, we can forget everything earlier because + * it has no value - the new min is <= everything else in the window + * by definition and it's the most recent. So we restart fresh on + * every new min and overwrites 2nd & 3rd choices. The same property + * holds for 2nd & 3rd best. + */ +#include <linux/module.h> +#include <linux/win_minmax.h> + +/* As time advances, update the 1st, 2nd, and 3rd choices. */ +static u32 minmax_subwin_update(struct minmax *m, u32 win, + const struct minmax_sample *val) +{ + u32 dt = val->t - m->s[0].t; + + if (unlikely(dt > win)) { + /* + * Passed entire window without a new val so make 2nd + * choice the new val & 3rd choice the new 2nd choice. + * we may have to iterate this since our 2nd choice + * may also be outside the window (we checked on entry + * that the third choice was in the window). + */ + m->s[0] = m->s[1]; + m->s[1] = m->s[2]; + m->s[2] = *val; + if (unlikely(val->t - m->s[0].t > win)) { + m->s[0] = m->s[1]; + m->s[1] = m->s[2]; + m->s[2] = *val; + } + } else if (unlikely(m->s[1].t == m->s[0].t) && dt > win/4) { + /* + * We've passed a quarter of the window without a new val + * so take a 2nd choice from the 2nd quarter of the window. + */ + m->s[2] = m->s[1] = *val; + } else if (unlikely(m->s[2].t == m->s[1].t) && dt > win/2) { + /* + * We've passed half the window without finding a new val + * so take a 3rd choice from the last half of the window + */ + m->s[2] = *val; + } + return m->s[0].v; +} + +/* Check if new measurement updates the 1st, 2nd or 3rd choice max. */ +u32 minmax_running_max(struct minmax *m, u32 win, u32 t, u32 meas) +{ + struct minmax_sample val = { .t = t, .v = meas }; + + if (unlikely(val.v >= m->s[0].v) || /* found new max? */ + unlikely(val.t - m->s[2].t > win)) /* nothing left in window? */ + return minmax_reset(m, t, meas); /* forget earlier samples */ + + if (unlikely(val.v >= m->s[1].v)) + m->s[2] = m->s[1] = val; + else if (unlikely(val.v >= m->s[2].v)) + m->s[2] = val; + + return minmax_subwin_update(m, win, &val); +} +EXPORT_SYMBOL(minmax_running_max); + +/* Check if new measurement updates the 1st, 2nd or 3rd choice min. */ +u32 minmax_running_min(struct minmax *m, u32 win, u32 t, u32 meas) +{ + struct minmax_sample val = { .t = t, .v = meas }; + + if (unlikely(val.v <= m->s[0].v) || /* found new min? */ + unlikely(val.t - m->s[2].t > win)) /* nothing left in window? */ + return minmax_reset(m, t, meas); /* forget earlier samples */ + + if (unlikely(val.v <= m->s[1].v)) + m->s[2] = m->s[1] = val; + else if (unlikely(val.v <= m->s[2].v)) + m->s[2] = val; + + return minmax_subwin_update(m, win, &val); +} |
