Diffstat (limited to 'net/openvswitch/flow_table.c')
 net/openvswitch/flow_table.c | 322 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 282 insertions(+), 40 deletions(-)
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index 2398d7238300..e2235849a57e 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -29,6 +29,7 @@  #include <linux/icmp.h>  #include <linux/icmpv6.h>  #include <linux/rculist.h> +#include <linux/sort.h>  #include <net/ip.h>  #include <net/ipv6.h>  #include <net/ndisc.h> @@ -37,8 +38,8 @@  #define MASK_ARRAY_SIZE_MIN	16  #define REHASH_INTERVAL		(10 * 60 * HZ) +#define MC_DEFAULT_HASH_ENTRIES	256  #define MC_HASH_SHIFT		8 -#define MC_HASH_ENTRIES		(1u << MC_HASH_SHIFT)  #define MC_HASH_SEGS		((sizeof(uint32_t) * 8) / MC_HASH_SHIFT)  static struct kmem_cache *flow_cache; @@ -169,16 +170,70 @@ static struct table_instance *table_instance_alloc(int new_size)  	return ti;  } +static void __mask_array_destroy(struct mask_array *ma) +{ +	free_percpu(ma->masks_usage_cntr); +	kfree(ma); +} + +static void mask_array_rcu_cb(struct rcu_head *rcu) +{ +	struct mask_array *ma = container_of(rcu, struct mask_array, rcu); + +	__mask_array_destroy(ma); +} + +static void tbl_mask_array_reset_counters(struct mask_array *ma) +{ +	int i, cpu; + +	/* As the per CPU counters are not atomic we can not go ahead and +	 * reset them from another CPU. To be able to still have an approximate +	 * zero based counter we store the value at reset, and subtract it +	 * later when processing. +	 */ +	for (i = 0; i < ma->max; i++)  { +		ma->masks_usage_zero_cntr[i] = 0; + +		for_each_possible_cpu(cpu) { +			u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr, +							  cpu); +			unsigned int start; +			u64 counter; + +			do { +				start = u64_stats_fetch_begin_irq(&ma->syncp); +				counter = usage_counters[i]; +			} while (u64_stats_fetch_retry_irq(&ma->syncp, start)); + +			ma->masks_usage_zero_cntr[i] += counter; +		} +	} +} +  static struct mask_array *tbl_mask_array_alloc(int size)  {  	struct mask_array *new;  	size = max(MASK_ARRAY_SIZE_MIN, size);  	new = kzalloc(sizeof(struct mask_array) + -		      sizeof(struct sw_flow_mask *) * size, GFP_KERNEL); +		      sizeof(struct sw_flow_mask *) * size + +		      sizeof(u64) * size, GFP_KERNEL);  	if (!new)  		return NULL; +	new->masks_usage_zero_cntr = (u64 *)((u8 *)new + +					     sizeof(struct mask_array) + +					     sizeof(struct sw_flow_mask *) * +					     size); + +	new->masks_usage_cntr = __alloc_percpu(sizeof(u64) * size, +					       __alignof__(u64)); +	if (!new->masks_usage_cntr) { +		kfree(new); +		return NULL; +	} +  	new->count = 0;  	new->max = size; @@ -202,10 +257,10 @@ static int tbl_mask_array_realloc(struct flow_table *tbl, int size)  			if (ovsl_dereference(old->masks[i]))  				new->masks[new->count++] = old->masks[i];  		} +		call_rcu(&old->rcu, mask_array_rcu_cb);  	}  	rcu_assign_pointer(tbl->mask_array, new); -	kfree_rcu(old, rcu);  	return 0;  } @@ -223,6 +278,11 @@ static int tbl_mask_array_add_mask(struct flow_table *tbl,  			return err;  		ma = ovsl_dereference(tbl->mask_array); +	} else { +		/* On every add or delete we need to reset the counters so +		 * every new mask gets a fair chance of being prioritized. +		 */ +		tbl_mask_array_reset_counters(ma);  	}  	BUG_ON(ovsl_dereference(ma->masks[ma_count])); @@ -260,6 +320,9 @@ found:  	if (ma->max >= (MASK_ARRAY_SIZE_MIN * 2) &&  	    ma_count <= (ma->max / 3))  		tbl_mask_array_realloc(tbl, ma->max / 2); +	else +		tbl_mask_array_reset_counters(ma); +  }  /* Remove 'mask' from the mask list, if it is not needed any more. 
*/ @@ -278,15 +341,79 @@ static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask)  	}  } +static void __mask_cache_destroy(struct mask_cache *mc) +{ +	free_percpu(mc->mask_cache); +	kfree(mc); +} + +static void mask_cache_rcu_cb(struct rcu_head *rcu) +{ +	struct mask_cache *mc = container_of(rcu, struct mask_cache, rcu); + +	__mask_cache_destroy(mc); +} + +static struct mask_cache *tbl_mask_cache_alloc(u32 size) +{ +	struct mask_cache_entry __percpu *cache = NULL; +	struct mask_cache *new; + +	/* Only allow size to be 0, or a power of 2, and does not exceed +	 * percpu allocation size. +	 */ +	if ((!is_power_of_2(size) && size != 0) || +	    (size * sizeof(struct mask_cache_entry)) > PCPU_MIN_UNIT_SIZE) +		return NULL; + +	new = kzalloc(sizeof(*new), GFP_KERNEL); +	if (!new) +		return NULL; + +	new->cache_size = size; +	if (new->cache_size > 0) { +		cache = __alloc_percpu(array_size(sizeof(struct mask_cache_entry), +						  new->cache_size), +				       __alignof__(struct mask_cache_entry)); +		if (!cache) { +			kfree(new); +			return NULL; +		} +	} + +	new->mask_cache = cache; +	return new; +} +int ovs_flow_tbl_masks_cache_resize(struct flow_table *table, u32 size) +{ +	struct mask_cache *mc = rcu_dereference(table->mask_cache); +	struct mask_cache *new; + +	if (size == mc->cache_size) +		return 0; + +	if ((!is_power_of_2(size) && size != 0) || +	    (size * sizeof(struct mask_cache_entry)) > PCPU_MIN_UNIT_SIZE) +		return -EINVAL; + +	new = tbl_mask_cache_alloc(size); +	if (!new) +		return -ENOMEM; + +	rcu_assign_pointer(table->mask_cache, new); +	call_rcu(&mc->rcu, mask_cache_rcu_cb); + +	return 0; +} +  int ovs_flow_tbl_init(struct flow_table *table)  {  	struct table_instance *ti, *ufid_ti; +	struct mask_cache *mc;  	struct mask_array *ma; -	table->mask_cache = __alloc_percpu(sizeof(struct mask_cache_entry) * -					   MC_HASH_ENTRIES, -					   __alignof__(struct mask_cache_entry)); -	if (!table->mask_cache) +	mc = tbl_mask_cache_alloc(MC_DEFAULT_HASH_ENTRIES); +	if (!mc)  		return -ENOMEM;  	ma = tbl_mask_array_alloc(MASK_ARRAY_SIZE_MIN); @@ -304,6 +431,7 @@ int ovs_flow_tbl_init(struct flow_table *table)  	rcu_assign_pointer(table->ti, ti);  	rcu_assign_pointer(table->ufid_ti, ufid_ti);  	rcu_assign_pointer(table->mask_array, ma); +	rcu_assign_pointer(table->mask_cache, mc);  	table->last_rehash = jiffies;  	table->count = 0;  	table->ufid_count = 0; @@ -312,9 +440,9 @@ int ovs_flow_tbl_init(struct flow_table *table)  free_ti:  	__table_instance_destroy(ti);  free_mask_array: -	kfree(ma); +	__mask_array_destroy(ma);  free_mask_cache: -	free_percpu(table->mask_cache); +	__mask_cache_destroy(mc);  	return -ENOMEM;  } @@ -345,19 +473,15 @@ static void table_instance_flow_free(struct flow_table *table,  	flow_mask_remove(table, flow->mask);  } -static void table_instance_destroy(struct flow_table *table, -				   struct table_instance *ti, -				   struct table_instance *ufid_ti, -				   bool deferred) +/* Must be called with OVS mutex held. 
*/ +void table_instance_flow_flush(struct flow_table *table, +			       struct table_instance *ti, +			       struct table_instance *ufid_ti)  {  	int i; -	if (!ti) -		return; - -	BUG_ON(!ufid_ti);  	if (ti->keep_flows) -		goto skip_flows; +		return;  	for (i = 0; i < ti->n_buckets; i++) {  		struct sw_flow *flow; @@ -369,18 +493,16 @@ static void table_instance_destroy(struct flow_table *table,  			table_instance_flow_free(table, ti, ufid_ti,  						 flow, false); -			ovs_flow_free(flow, deferred); +			ovs_flow_free(flow, true);  		}  	} +} -skip_flows: -	if (deferred) { -		call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); -		call_rcu(&ufid_ti->rcu, flow_tbl_destroy_rcu_cb); -	} else { -		__table_instance_destroy(ti); -		__table_instance_destroy(ufid_ti); -	} +static void table_instance_destroy(struct table_instance *ti, +				   struct table_instance *ufid_ti) +{ +	call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); +	call_rcu(&ufid_ti->rcu, flow_tbl_destroy_rcu_cb);  }  /* No need for locking this function is called from RCU callback or @@ -390,10 +512,12 @@ void ovs_flow_tbl_destroy(struct flow_table *table)  {  	struct table_instance *ti = rcu_dereference_raw(table->ti);  	struct table_instance *ufid_ti = rcu_dereference_raw(table->ufid_ti); +	struct mask_cache *mc = rcu_dereference_raw(table->mask_cache); +	struct mask_array *ma = rcu_dereference_raw(table->mask_array); -	free_percpu(table->mask_cache); -	kfree_rcu(rcu_dereference_raw(table->mask_array), rcu); -	table_instance_destroy(table, ti, ufid_ti, false); +	call_rcu(&mc->rcu, mask_cache_rcu_cb); +	call_rcu(&ma->rcu, mask_array_rcu_cb); +	table_instance_destroy(ti, ufid_ti);  }  struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti, @@ -511,7 +635,8 @@ int ovs_flow_tbl_flush(struct flow_table *flow_table)  	flow_table->count = 0;  	flow_table->ufid_count = 0; -	table_instance_destroy(flow_table, old_ti, old_ufid_ti, true); +	table_instance_flow_flush(flow_table, old_ti, old_ufid_ti); +	table_instance_destroy(old_ti, old_ufid_ti);  	return 0;  err_free_ti: @@ -604,8 +729,10 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,  				   struct mask_array *ma,  				   const struct sw_flow_key *key,  				   u32 *n_mask_hit, +				   u32 *n_cache_hit,  				   u32 *index)  { +	u64 *usage_counters = this_cpu_ptr(ma->masks_usage_cntr);  	struct sw_flow *flow;  	struct sw_flow_mask *mask;  	int i; @@ -614,8 +741,13 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,  		mask = rcu_dereference_ovsl(ma->masks[*index]);  		if (mask) {  			flow = masked_flow_lookup(ti, key, mask, n_mask_hit); -			if (flow) +			if (flow) { +				u64_stats_update_begin(&ma->syncp); +				usage_counters[*index]++; +				u64_stats_update_end(&ma->syncp); +				(*n_cache_hit)++;  				return flow; +			}  		}  	} @@ -631,6 +763,9 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,  		flow = masked_flow_lookup(ti, key, mask, n_mask_hit);  		if (flow) { /* Found */  			*index = i; +			u64_stats_update_begin(&ma->syncp); +			usage_counters[*index]++; +			u64_stats_update_end(&ma->syncp);  			return flow;  		}  	} @@ -648,8 +783,10 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,  struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,  					  const struct sw_flow_key *key,  					  u32 skb_hash, -					  u32 *n_mask_hit) +					  u32 *n_mask_hit, +					  u32 *n_cache_hit)  { +	struct mask_cache *mc = rcu_dereference(tbl->mask_cache);  	struct mask_array *ma = rcu_dereference(tbl->mask_array);  	struct table_instance *ti = 
rcu_dereference(tbl->ti);  	struct mask_cache_entry *entries, *ce; @@ -658,10 +795,13 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,  	int seg;  	*n_mask_hit = 0; -	if (unlikely(!skb_hash)) { +	*n_cache_hit = 0; +	if (unlikely(!skb_hash || mc->cache_size == 0)) {  		u32 mask_index = 0; +		u32 cache = 0; -		return flow_lookup(tbl, ti, ma, key, n_mask_hit, &mask_index); +		return flow_lookup(tbl, ti, ma, key, n_mask_hit, &cache, +				   &mask_index);  	}  	/* Pre and post recirulation flows usually have the same skb_hash @@ -672,17 +812,17 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,  	ce = NULL;  	hash = skb_hash; -	entries = this_cpu_ptr(tbl->mask_cache); +	entries = this_cpu_ptr(mc->mask_cache);  	/* Find the cache entry 'ce' to operate on. */  	for (seg = 0; seg < MC_HASH_SEGS; seg++) { -		int index = hash & (MC_HASH_ENTRIES - 1); +		int index = hash & (mc->cache_size - 1);  		struct mask_cache_entry *e;  		e = &entries[index];  		if (e->skb_hash == skb_hash) {  			flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, -					   &e->mask_index); +					   n_cache_hit, &e->mask_index);  			if (!flow)  				e->skb_hash = 0;  			return flow; @@ -695,10 +835,12 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,  	}  	/* Cache miss, do full lookup. */ -	flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, &ce->mask_index); +	flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, n_cache_hit, +			   &ce->mask_index);  	if (flow)  		ce->skb_hash = skb_hash; +	*n_cache_hit = 0;  	return flow;  } @@ -708,9 +850,10 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,  	struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);  	struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array);  	u32 __always_unused n_mask_hit; +	u32 __always_unused n_cache_hit;  	u32 index = 0; -	return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &index); +	return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index);  }  struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, @@ -787,6 +930,13 @@ int ovs_flow_tbl_num_masks(const struct flow_table *table)  	return READ_ONCE(ma->count);  } +u32 ovs_flow_tbl_masks_cache_size(const struct flow_table *table) +{ +	struct mask_cache *mc = rcu_dereference_ovsl(table->mask_cache); + +	return READ_ONCE(mc->cache_size); +} +  static struct table_instance *table_instance_expand(struct table_instance *ti,  						    bool ufid)  { @@ -934,6 +1084,98 @@ int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,  	return 0;  } +static int compare_mask_and_count(const void *a, const void *b) +{ +	const struct mask_count *mc_a = a; +	const struct mask_count *mc_b = b; + +	return (s64)mc_b->counter - (s64)mc_a->counter; +} + +/* Must be called with OVS mutex held. */ +void ovs_flow_masks_rebalance(struct flow_table *table) +{ +	struct mask_array *ma = rcu_dereference_ovsl(table->mask_array); +	struct mask_count *masks_and_count; +	struct mask_array *new; +	int masks_entries = 0; +	int i; + +	/* Build array of all current entries with use counters. 
*/ +	masks_and_count = kmalloc_array(ma->max, sizeof(*masks_and_count), +					GFP_KERNEL); +	if (!masks_and_count) +		return; + +	for (i = 0; i < ma->max; i++)  { +		struct sw_flow_mask *mask; +		unsigned int start; +		int cpu; + +		mask = rcu_dereference_ovsl(ma->masks[i]); +		if (unlikely(!mask)) +			break; + +		masks_and_count[i].index = i; +		masks_and_count[i].counter = 0; + +		for_each_possible_cpu(cpu) { +			u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr, +							  cpu); +			u64 counter; + +			do { +				start = u64_stats_fetch_begin_irq(&ma->syncp); +				counter = usage_counters[i]; +			} while (u64_stats_fetch_retry_irq(&ma->syncp, start)); + +			masks_and_count[i].counter += counter; +		} + +		/* Subtract the zero count value. */ +		masks_and_count[i].counter -= ma->masks_usage_zero_cntr[i]; + +		/* Rather than calling tbl_mask_array_reset_counters() +		 * below when no change is needed, do it inline here. +		 */ +		ma->masks_usage_zero_cntr[i] += masks_and_count[i].counter; +	} + +	if (i == 0) +		goto free_mask_entries; + +	/* Sort the entries */ +	masks_entries = i; +	sort(masks_and_count, masks_entries, sizeof(*masks_and_count), +	     compare_mask_and_count, NULL); + +	/* If the order is the same, nothing to do... */ +	for (i = 0; i < masks_entries; i++) { +		if (i != masks_and_count[i].index) +			break; +	} +	if (i == masks_entries) +		goto free_mask_entries; + +	/* Rebuilt the new list in order of usage. */ +	new = tbl_mask_array_alloc(ma->max); +	if (!new) +		goto free_mask_entries; + +	for (i = 0; i < masks_entries; i++) { +		int index = masks_and_count[i].index; + +		if (ovsl_dereference(ma->masks[index])) +			new->masks[new->count++] = ma->masks[index]; +	} + +	rcu_assign_pointer(table->mask_array, new); +	call_rcu(&ma->rcu, mask_array_rcu_cb); + +free_mask_entries: +	kfree(masks_and_count); +} +  /* Initializes the flow module.   * Returns zero if successful or a negative error code. */  int ovs_flow_init(void)  | 
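The per-mask usage accounting added above keeps a u64 hit counter per CPU (ma->masks_usage_cntr) plus a zero-point array (ma->masks_usage_zero_cntr), because per-CPU counters cannot be reset from another CPU. Readers sum the per-CPU values under the u64_stats seqcount and subtract the recorded zero point. The helper below is a condensed sketch of that read-side loop as used by tbl_mask_array_reset_counters() and ovs_flow_masks_rebalance(); the name tbl_mask_array_get_counter() is hypothetical and not part of the patch.

/* Hypothetical helper (not part of this patch) showing the read-side
 * pattern used by tbl_mask_array_reset_counters() and
 * ovs_flow_masks_rebalance(): sum one mask's per-CPU hit counter into
 * a stable 64-bit value. The u64_stats_fetch_begin_irq()/retry_irq()
 * pair avoids torn reads on 32-bit systems while the datapath keeps
 * incrementing the counter.
 */
static u64 tbl_mask_array_get_counter(struct mask_array *ma, int index)
{
	u64 counter = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr, cpu);
		unsigned int start;
		u64 val;

		do {
			start = u64_stats_fetch_begin_irq(&ma->syncp);
			val = usage_counters[index];
		} while (u64_stats_fetch_retry_irq(&ma->syncp, start));

		counter += val;
	}

	/* Per-CPU counters cannot be zeroed remotely, so reads are made
	 * relative to the value recorded at the last reset.
	 */
	return counter - ma->masks_usage_zero_cntr[index];
}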
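tbl_mask_cache_alloc() and ovs_flow_tbl_masks_cache_resize() both enforce the same constraint on the cache size: it must be zero (caching disabled) or a power of two, and the per-CPU array of mask_cache_entry structures must fit within PCPU_MIN_UNIT_SIZE. A sketch of that rule as a standalone predicate, under the hypothetical name tbl_mask_cache_size_valid():

/* Hypothetical predicate capturing the size rule open-coded in both
 * tbl_mask_cache_alloc() and ovs_flow_tbl_masks_cache_resize(): zero
 * disables the cache, otherwise the size must be a power of two and
 * the per-CPU entry array must fit in one percpu allocation unit.
 */
static bool tbl_mask_cache_size_valid(u32 size)
{
	if (size != 0 && !is_power_of_2(size))
		return false;

	return size * sizeof(struct mask_cache_entry) <= PCPU_MIN_UNIT_SIZE;
}

The patch itself repeats the check at both call sites rather than sharing a helper; the resize path additionally returns -EINVAL to the caller when the requested size fails the check.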
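The lookup path probes the per-CPU cache by consuming the 32-bit skb_hash in MC_HASH_SHIFT-bit segments, giving each packet MC_HASH_SEGS candidate slots; masking the hash with (mc->cache_size - 1) is why the size must be a power of two. Below is a condensed sketch of that probe loop under a hypothetical name; the real loop in ovs_flow_tbl_lookup_stats() additionally remembers the least-used slot so a miss has a replacement candidate.

/* Condensed sketch of the cache probe in ovs_flow_tbl_lookup_stats();
 * tbl_mask_cache_probe() is a hypothetical name. The 32-bit skb_hash
 * is consumed MC_HASH_SHIFT bits at a time, so each packet can land in
 * one of MC_HASH_SEGS slots.
 */
static struct mask_cache_entry *
tbl_mask_cache_probe(struct mask_cache_entry *entries, u32 cache_size,
		     u32 skb_hash)
{
	u32 hash = skb_hash;
	int seg;

	for (seg = 0; seg < MC_HASH_SEGS; seg++) {
		struct mask_cache_entry *e = &entries[hash & (cache_size - 1)];

		if (e->skb_hash == skb_hash)
			return e;	/* cached mask index for this flow */

		hash >>= MC_HASH_SHIFT;
	}

	return NULL;	/* miss: the real code reuses the least-used slot */
}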
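ovs_flow_masks_rebalance() only reorders the mask array; it must run with ovs_mutex held and is intended to be triggered periodically from outside this file. A minimal sketch of such a trigger, assuming a masks_rebalance delayed_work member on struct datapath and a 4-second interval (both are assumptions, not shown in this diff):

/* Minimal sketch of a periodic trigger for ovs_flow_masks_rebalance().
 * The masks_rebalance member on struct datapath, the work function
 * name and the 4000 ms interval are assumptions for illustration; the
 * actual scheduling lives outside flow_table.c.
 */
#define MASKS_REBALANCE_INTERVAL_MS	4000	/* assumed interval */

static void dp_masks_rebalance_work(struct work_struct *work)
{
	struct datapath *dp = container_of(work, struct datapath,
					   masks_rebalance.work);

	ovs_lock();	/* ovs_flow_masks_rebalance() needs ovs_mutex */
	ovs_flow_masks_rebalance(&dp->table);
	ovs_unlock();

	schedule_delayed_work(&dp->masks_rebalance,
			      msecs_to_jiffies(MASKS_REBALANCE_INTERVAL_MS));
}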
