From 540c8cb6a576f34a9a0b04467f46bb6e67a1f852 Mon Sep 17 00:00:00 2001 From: Kevin Coffman Date: Wed, 2 Mar 2011 19:51:41 -0500 Subject: gss:krb5 only include enctype numbers in gm_upcall_enctypes Make the value in gm_upcall_enctypes just the enctype values. This allows the values to be used more easily elsewhere. Signed-off-by: Kevin Coffman Signed-off-by: J. Bruce Fields --- net/sunrpc/auth_gss/auth_gss.c | 2 +- net/sunrpc/auth_gss/gss_krb5_mech.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 45dbf1521b9a..f3914d0c5079 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -417,7 +417,7 @@ static void gss_encode_v1_msg(struct gss_upcall_msg *gss_msg, gss_msg->msg.len += len; } if (mech->gm_upcall_enctypes) { - len = sprintf(p, mech->gm_upcall_enctypes); + len = sprintf(p, "enctypes=%s ", mech->gm_upcall_enctypes); p += len; gss_msg->msg.len += len; } diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index f375decc024b..9022f0a6503e 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -750,7 +750,7 @@ static struct gss_api_mech gss_kerberos_mech = { .gm_ops = &gss_kerberos_ops, .gm_pf_num = ARRAY_SIZE(gss_kerberos_pfs), .gm_pfs = gss_kerberos_pfs, - .gm_upcall_enctypes = "enctypes=18,17,16,23,3,1,2 ", + .gm_upcall_enctypes = "18,17,16,23,3,1,2", }; static int __init init_kerberos_module(void) -- cgit From 8b3e07ac908d005bb791410f594cce8744f6806a Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 7 Mar 2011 22:50:47 -0500 Subject: svcrpc: fix rare race on unix_domain creation Note that "new" here is not yet fully initialized; auth_domain_put should be called only on auth_domains that have actually been added to the hash. Before this fix, two attempts to add the same domain at once could cause the hlist_del in auth_domain_put to fail. Signed-off-by: J. Bruce Fields --- net/sunrpc/svcauth_unix.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 30916b06c12b..d100bf2b4e81 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -38,6 +38,14 @@ struct unix_domain { extern struct auth_ops svcauth_unix; +static void svcauth_unix_domain_release(struct auth_domain *dom) +{ + struct unix_domain *ud = container_of(dom, struct unix_domain, h); + + kfree(dom->name); + kfree(ud); +} + struct auth_domain *unix_domain_find(char *name) { struct auth_domain *rv; @@ -47,7 +55,7 @@ struct auth_domain *unix_domain_find(char *name) while(1) { if (rv) { if (new && rv != &new->h) - auth_domain_put(&new->h); + svcauth_unix_domain_release(new); if (rv->flavour != &svcauth_unix) { auth_domain_put(rv); @@ -74,14 +82,6 @@ struct auth_domain *unix_domain_find(char *name) } EXPORT_SYMBOL_GPL(unix_domain_find); -static void svcauth_unix_domain_release(struct auth_domain *dom) -{ - struct unix_domain *ud = container_of(dom, struct unix_domain, h); - - kfree(dom->name); - kfree(ud); -} - /************************************************** * cache for IP address to unix_domain -- cgit From 352b5d13c0684ba8cd103aa20cb74f105334562a Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 9 Mar 2011 22:40:30 -0500 Subject: svcrpc: fix bad argument in unix_domain_find "After merging the nfsd tree, today's linux-next build (powerpc ppc64_defconfig) produced this warning: net/sunrpc/svcauth_unix.c: In function 'unix_domain_find': net/sunrpc/svcauth_unix.c:58: warning: passing argument 1 of +'svcauth_unix_domain_release' from incompatible pointer type net/sunrpc/svcauth_unix.c:41: note: expected 'struct auth_domain *' but argument +is of type 'struct unix_domain *' Introduced by commit 8b3e07ac908d ("svcrpc: fix rare race on unix_domain creation")." Reported-by: Stephen Rothwell Signed-off-by: J. Bruce Fields --- net/sunrpc/svcauth_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index d100bf2b4e81..c8e10216c113 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -55,7 +55,7 @@ struct auth_domain *unix_domain_find(char *name) while(1) { if (rv) { if (new && rv != &new->h) - svcauth_unix_domain_release(new); + svcauth_unix_domain_release(&new->h); if (rv->flavour != &svcauth_unix) { auth_domain_put(rv); -- cgit From b09734b1f4abd86e046777f0f268215b4ef1b523 Mon Sep 17 00:00:00 2001 From: Tommi Virtanen Date: Wed, 2 Feb 2011 11:39:32 -0800 Subject: libceph: Fix base64-decoding when input ends in newline. It used to return -EINVAL because it thought the end was not aligned to 4 bytes. Clean up superfluous src < end test in if, the while itself guarantees that. Signed-off-by: Tommi Virtanen Signed-off-by: Sage Weil --- net/ceph/armor.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ceph/armor.c b/net/ceph/armor.c index eb2a666b0be7..1fc1ee11dfa2 100644 --- a/net/ceph/armor.c +++ b/net/ceph/armor.c @@ -78,8 +78,10 @@ int ceph_unarmor(char *dst, const char *src, const char *end) while (src < end) { int a, b, c, d; - if (src < end && src[0] == '\n') + if (src[0] == '\n') { src++; + continue; + } if (src + 4 > end) return -EINVAL; a = decode_bits(src[0]); -- cgit From 4be34b9d69c97211ff4eb00d79078f3c1593804d Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Sat, 22 Jan 2011 22:40:20 +0100 Subject: SUNRPC: Remove resource leak in svc_rdma_send_error() We leak the memory allocated to 'ctxt' when we return after 'ib_dma_mapping_error()' returns !=0. Signed-off-by: Jesper Juhl Signed-off-by: J. Bruce Fields --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 9df1eadc912a..1a10dcd999ea 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -1335,6 +1335,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, p, 0, length, DMA_FROM_DEVICE); if (ib_dma_mapping_error(xprt->sc_cm_id->device, ctxt->sge[0].addr)) { put_page(p); + svc_rdma_put_context(ctxt, 1); return; } atomic_inc(&xprt->sc_dma_used); -- cgit From 8bc8aecdc5e26cfda12dbd6867af4aa67836da6a Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 21 Mar 2011 20:01:00 +0100 Subject: mac80211: initialize sta->last_rx in sta_info_alloc This field is used to determine the inactivity time. When in AP mode, hostapd uses it for kicking out inactive clients after a while. Without this patch, hostapd immediately deauthenticates a new client if it checks the inactivity time before the client sends its first data frame. Signed-off-by: Felix Fietkau Cc: stable@kernel.org Signed-off-by: John W. Linville --- net/mac80211/sta_info.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 5a11078827ab..d0311a322ddd 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -243,6 +243,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, memcpy(sta->sta.addr, addr, ETH_ALEN); sta->local = local; sta->sdata = sdata; + sta->last_rx = jiffies; ewma_init(&sta->avg_signal, 1024, 8); -- cgit From 6f6c7006755b667f9f6c1f3b6f08cd65f75cc471 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 17 Jan 2011 20:34:08 -0800 Subject: libceph: fix osd request queuing on osdmap updates If we send a request to osd A, and the request's pg remaps to osd B and then back to A in quick succession, we need to resend the request to A. The old code was only calling kick_requests after processing all incremental maps in a message, so it was very possible to not resend a request that needed to be resent. This would make the osd eventually time out (at least with the current default of osd timeouts enabled). The correct approach is to scan requests on every map incremental. This patch refactors the kick code in a few ways: - all requests are either on req_lru (in flight), req_unsent (ready to send), or req_notarget (currently map to no up osd) - mapping always done by map_request (previous map_osds) - if the mapping changes, we requeue. requests are resent only after all map incrementals are processed. - some osd reset code is moved out of kick_requests into a separate function - the "kick this osd" functionality is moved to kick_osd_requests, as it is unrelated to scanning for request->pg->osd mapping changes Signed-off-by: Sage Weil --- net/ceph/osd_client.c | 255 ++++++++++++++++++++++++-------------------------- 1 file changed, 122 insertions(+), 133 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 3e20a122ffa2..b85ed5a5503d 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -22,10 +22,9 @@ #define OSD_OPREPLY_FRONT_LEN 512 static const struct ceph_connection_operations osd_con_ops; -static int __kick_requests(struct ceph_osd_client *osdc, - struct ceph_osd *kickosd); -static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd); +static void send_queued(struct ceph_osd_client *osdc); +static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd); static int op_needs_trail(int op) { @@ -529,6 +528,35 @@ __lookup_request_ge(struct ceph_osd_client *osdc, return NULL; } +/* + * Resubmit requests pending on the given osd. + */ +static void __kick_osd_requests(struct ceph_osd_client *osdc, + struct ceph_osd *osd) +{ + struct ceph_osd_request *req; + int err; + + dout("__kick_osd_requests osd%d\n", osd->o_osd); + err = __reset_osd(osdc, osd); + if (err == -EAGAIN) + return; + + list_for_each_entry(req, &osd->o_requests, r_osd_item) { + list_move(&req->r_req_lru_item, &osdc->req_unsent); + dout("requeued %p tid %llu osd%d\n", req, req->r_tid, + osd->o_osd); + req->r_flags |= CEPH_OSD_FLAG_RETRY; + } +} + +static void kick_osd_requests(struct ceph_osd_client *osdc, + struct ceph_osd *kickosd) +{ + mutex_lock(&osdc->request_mutex); + __kick_osd_requests(osdc, kickosd); + mutex_unlock(&osdc->request_mutex); +} /* * If the osd connection drops, we need to resubmit all requests. @@ -543,7 +571,8 @@ static void osd_reset(struct ceph_connection *con) dout("osd_reset osd%d\n", osd->o_osd); osdc = osd->o_osdc; down_read(&osdc->map_sem); - kick_requests(osdc, osd); + kick_osd_requests(osdc, osd); + send_queued(osdc); up_read(&osdc->map_sem); } @@ -781,20 +810,20 @@ static void __cancel_request(struct ceph_osd_request *req) ceph_con_revoke(&req->r_osd->o_con, req->r_request); req->r_sent = 0; } - list_del_init(&req->r_req_lru_item); } /* * Pick an osd (the first 'up' osd in the pg), allocate the osd struct * (as needed), and set the request r_osd appropriately. If there is - * no up osd, set r_osd to NULL. + * no up osd, set r_osd to NULL. Move the request to the appropiate list + * (unsent, homeless) or leave on in-flight lru. * * Return 0 if unchanged, 1 if changed, or negative on error. * * Caller should hold map_sem for read and request_mutex. */ -static int __map_osds(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) +static int __map_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) { struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; struct ceph_pg pgid; @@ -802,11 +831,13 @@ static int __map_osds(struct ceph_osd_client *osdc, int o = -1, num = 0; int err; - dout("map_osds %p tid %lld\n", req, req->r_tid); + dout("map_request %p tid %lld\n", req, req->r_tid); err = ceph_calc_object_layout(&reqhead->layout, req->r_oid, &req->r_file_layout, osdc->osdmap); - if (err) + if (err) { + list_move(&req->r_req_lru_item, &osdc->req_notarget); return err; + } pgid = reqhead->layout.ol_pgid; req->r_pgid = pgid; @@ -823,7 +854,7 @@ static int __map_osds(struct ceph_osd_client *osdc, (req->r_osd == NULL && o == -1)) return 0; /* no change */ - dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n", + dout("map_request tid %llu pgid %d.%x osd%d (was osd%d)\n", req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o, req->r_osd ? req->r_osd->o_osd : -1); @@ -841,10 +872,12 @@ static int __map_osds(struct ceph_osd_client *osdc, if (!req->r_osd && o >= 0) { err = -ENOMEM; req->r_osd = create_osd(osdc); - if (!req->r_osd) + if (!req->r_osd) { + list_move(&req->r_req_lru_item, &osdc->req_notarget); goto out; + } - dout("map_osds osd %p is osd%d\n", req->r_osd, o); + dout("map_request osd %p is osd%d\n", req->r_osd, o); req->r_osd->o_osd = o; req->r_osd->o_con.peer_name.num = cpu_to_le64(o); __insert_osd(osdc, req->r_osd); @@ -855,6 +888,9 @@ static int __map_osds(struct ceph_osd_client *osdc, if (req->r_osd) { __remove_osd_from_lru(req->r_osd); list_add(&req->r_osd_item, &req->r_osd->o_requests); + list_move(&req->r_req_lru_item, &osdc->req_unsent); + } else { + list_move(&req->r_req_lru_item, &osdc->req_notarget); } err = 1; /* osd or pg changed */ @@ -869,16 +905,6 @@ static int __send_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req) { struct ceph_osd_request_head *reqhead; - int err; - - err = __map_osds(osdc, req); - if (err < 0) - return err; - if (req->r_osd == NULL) { - dout("send_request %p no up osds in pg\n", req); - ceph_monc_request_next_osdmap(&osdc->client->monc); - return 0; - } dout("send_request %p tid %llu to osd%d flags %d\n", req, req->r_tid, req->r_osd->o_osd, req->r_flags); @@ -897,6 +923,21 @@ static int __send_request(struct ceph_osd_client *osdc, return 0; } +/* + * Send any requests in the queue (req_unsent). + */ +static void send_queued(struct ceph_osd_client *osdc) +{ + struct ceph_osd_request *req, *tmp; + + dout("send_queued\n"); + mutex_lock(&osdc->request_mutex); + list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) { + __send_request(osdc, req); + } + mutex_unlock(&osdc->request_mutex); +} + /* * Timeout callback, called every N seconds when 1 or more osd * requests has been active for more than N seconds. When this @@ -916,7 +957,6 @@ static void handle_timeout(struct work_struct *work) unsigned long keepalive = osdc->client->options->osd_keepalive_timeout * HZ; unsigned long last_stamp = 0; - struct rb_node *p; struct list_head slow_osds; dout("timeout\n"); @@ -925,21 +965,6 @@ static void handle_timeout(struct work_struct *work) ceph_monc_request_next_osdmap(&osdc->client->monc); mutex_lock(&osdc->request_mutex); - for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { - req = rb_entry(p, struct ceph_osd_request, r_node); - - if (req->r_resend) { - int err; - - dout("osdc resending prev failed %lld\n", req->r_tid); - err = __send_request(osdc, req); - if (err) - dout("osdc failed again on %lld\n", req->r_tid); - else - req->r_resend = false; - continue; - } - } /* * reset osds that appear to be _really_ unresponsive. this @@ -963,7 +988,7 @@ static void handle_timeout(struct work_struct *work) BUG_ON(!osd); pr_warning(" tid %llu timed out on osd%d, will reset osd\n", req->r_tid, osd->o_osd); - __kick_requests(osdc, osd); + __kick_osd_requests(osdc, osd); } /* @@ -991,7 +1016,7 @@ static void handle_timeout(struct work_struct *work) __schedule_osd_timeout(osdc); mutex_unlock(&osdc->request_mutex); - + send_queued(osdc); up_read(&osdc->map_sem); } @@ -1109,108 +1134,61 @@ bad: ceph_msg_dump(msg); } - -static int __kick_requests(struct ceph_osd_client *osdc, - struct ceph_osd *kickosd) +static void reset_changed_osds(struct ceph_osd_client *osdc) { - struct ceph_osd_request *req; struct rb_node *p, *n; - int needmap = 0; - int err; - - dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1); - if (kickosd) { - err = __reset_osd(osdc, kickosd); - if (err == -EAGAIN) - return 1; - } else { - for (p = rb_first(&osdc->osds); p; p = n) { - struct ceph_osd *osd = - rb_entry(p, struct ceph_osd, o_node); - - n = rb_next(p); - if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) || - memcmp(&osd->o_con.peer_addr, - ceph_osd_addr(osdc->osdmap, - osd->o_osd), - sizeof(struct ceph_entity_addr)) != 0) - __reset_osd(osdc, osd); - } - } - - for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { - req = rb_entry(p, struct ceph_osd_request, r_node); - - if (req->r_resend) { - dout(" r_resend set on tid %llu\n", req->r_tid); - __cancel_request(req); - goto kick; - } - if (req->r_osd && kickosd == req->r_osd) { - __cancel_request(req); - goto kick; - } - err = __map_osds(osdc, req); - if (err == 0) - continue; /* no change */ - if (err < 0) { - /* - * FIXME: really, we should set the request - * error and fail if this isn't a 'nofail' - * request, but that's a fair bit more - * complicated to do. So retry! - */ - dout(" setting r_resend on %llu\n", req->r_tid); - req->r_resend = true; - continue; - } - if (req->r_osd == NULL) { - dout("tid %llu maps to no valid osd\n", req->r_tid); - needmap++; /* request a newer map */ - continue; - } + for (p = rb_first(&osdc->osds); p; p = n) { + struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node); -kick: - dout("kicking %p tid %llu osd%d\n", req, req->r_tid, - req->r_osd ? req->r_osd->o_osd : -1); - req->r_flags |= CEPH_OSD_FLAG_RETRY; - err = __send_request(osdc, req); - if (err) { - dout(" setting r_resend on %llu\n", req->r_tid); - req->r_resend = true; - } + n = rb_next(p); + if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) || + memcmp(&osd->o_con.peer_addr, + ceph_osd_addr(osdc->osdmap, + osd->o_osd), + sizeof(struct ceph_entity_addr)) != 0) + __reset_osd(osdc, osd); } - - return needmap; } /* - * Resubmit osd requests whose osd or osd address has changed. Request - * a new osd map if osds are down, or we are otherwise unable to determine - * how to direct a request. - * - * Close connections to down osds. - * - * If @who is specified, resubmit requests for that specific osd. + * Requeue requests whose mapping to an OSD has changed. If requests map to + * no osd, request a new map. * * Caller should hold map_sem for read and request_mutex. */ -static void kick_requests(struct ceph_osd_client *osdc, - struct ceph_osd *kickosd) +static void kick_requests(struct ceph_osd_client *osdc) { - int needmap; + struct ceph_osd_request *req; + struct rb_node *p; + int needmap = 0; + int err; + dout("kick_requests\n"); mutex_lock(&osdc->request_mutex); - needmap = __kick_requests(osdc, kickosd); + for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { + req = rb_entry(p, struct ceph_osd_request, r_node); + err = __map_request(osdc, req); + if (err < 0) + continue; /* error */ + if (req->r_osd == NULL) { + dout("%p tid %llu maps to no osd\n", req, req->r_tid); + needmap++; /* request a newer map */ + } else if (err > 0) { + dout("%p tid %llu requeued on osd%d\n", req, req->r_tid, + req->r_osd ? req->r_osd->o_osd : -1); + req->r_flags |= CEPH_OSD_FLAG_RETRY; + } + } mutex_unlock(&osdc->request_mutex); if (needmap) { dout("%d requests for down osds, need new map\n", needmap); ceph_monc_request_next_osdmap(&osdc->client->monc); } - } + + /* * Process updated osd map. * @@ -1263,6 +1241,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) ceph_osdmap_destroy(osdc->osdmap); osdc->osdmap = newmap; } + kick_requests(osdc); + reset_changed_osds(osdc); } else { dout("ignoring incremental map %u len %d\n", epoch, maplen); @@ -1300,6 +1280,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) osdc->osdmap = newmap; if (oldmap) ceph_osdmap_destroy(oldmap); + kick_requests(osdc); } p += maplen; nr_maps--; @@ -1308,8 +1289,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) done: downgrade_write(&osdc->map_sem); ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch); - if (newmap) - kick_requests(osdc, NULL); + send_queued(osdc); up_read(&osdc->map_sem); wake_up_all(&osdc->client->auth_wq); return; @@ -1347,15 +1327,22 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, * the request still han't been touched yet. */ if (req->r_sent == 0) { - rc = __send_request(osdc, req); - if (rc) { - if (nofail) { - dout("osdc_start_request failed send, " - " marking %lld\n", req->r_tid); - req->r_resend = true; - rc = 0; - } else { - __unregister_request(osdc, req); + rc = __map_request(osdc, req); + if (rc < 0) + return rc; + if (req->r_osd == NULL) { + dout("send_request %p no up osds in pg\n", req); + ceph_monc_request_next_osdmap(&osdc->client->monc); + } else { + rc = __send_request(osdc, req); + if (rc) { + if (nofail) { + dout("osdc_start_request failed send, " + " will retry %lld\n", req->r_tid); + rc = 0; + } else { + __unregister_request(osdc, req); + } } } } @@ -1441,6 +1428,8 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) INIT_LIST_HEAD(&osdc->osd_lru); osdc->requests = RB_ROOT; INIT_LIST_HEAD(&osdc->req_lru); + INIT_LIST_HEAD(&osdc->req_unsent); + INIT_LIST_HEAD(&osdc->req_notarget); osdc->num_requests = 0; INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout); INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); -- cgit From a454f0ccefbfdbfc0e1aa8a5f8010af5e48b8845 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 21 Mar 2011 18:08:28 -0700 Subject: xfrm: Fix initialize repl field of struct xfrm_state Commit 'xfrm: Move IPsec replay detection functions to a separate file' (9fdc4883d92d20842c5acea77a4a21bb1574b495) introduce repl field to struct xfrm_state, and only initialize it under SA's netlink create path, the other path, such as pf_key, ipcomp/ipcomp6 etc, the repl field remaining uninitialize. So if the SA is created by pf_key, any input packet with SA's encryption algorithm will cause panic. int xfrm_input() { ... x->repl->advance(x, seq); ... } This patch fixed it by introduce new function __xfrm_init_state(). Pid: 0, comm: swapper Not tainted 2.6.38-next+ #14 Bochs Bochs EIP: 0060:[] EFLAGS: 00010206 CPU: 0 EIP is at xfrm_input+0x31c/0x4cc EAX: dd839c00 EBX: 00000084 ECX: 00000000 EDX: 01000000 ESI: dd839c00 EDI: de3a0780 EBP: dec1de88 ESP: dec1de64 DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 Process swapper (pid: 0, ti=dec1c000 task=c09c0f20 task.ti=c0992000) Stack: 00000000 00000000 00000002 c0ba27c0 00100000 01000000 de3a0798 c0ba27c0 00000033 dec1de98 c0786848 00000000 de3a0780 dec1dea4 c0786868 00000000 dec1debc c074ee56 e1da6b8c de3a0780 c074ed44 de3a07a8 dec1decc c074ef32 Call Trace: [] xfrm4_rcv_encap+0x22/0x27 [] xfrm4_rcv+0x1b/0x1d [] ip_local_deliver_finish+0x112/0x1b1 [] ? ip_local_deliver_finish+0x0/0x1b1 [] NF_HOOK.clone.1+0x3d/0x44 [] ip_local_deliver+0x3e/0x44 [] ? ip_local_deliver_finish+0x0/0x1b1 [] ip_rcv_finish+0x30a/0x332 [] ? ip_rcv_finish+0x0/0x332 [] NF_HOOK.clone.1+0x3d/0x44 [] ip_rcv+0x20b/0x247 [] ? ip_rcv_finish+0x0/0x332 [] __netif_receive_skb+0x373/0x399 [] netif_receive_skb+0x4b/0x51 [] cp_rx_poll+0x210/0x2c4 [8139cp] [] net_rx_action+0x9a/0x17d [] __do_softirq+0xa1/0x149 [] ? __do_softirq+0x0/0x149 Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- net/xfrm/xfrm_state.c | 15 ++++++++++++++- net/xfrm/xfrm_user.c | 2 +- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index d575f0534868..f83a3d1da81b 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1907,7 +1907,7 @@ int xfrm_state_mtu(struct xfrm_state *x, int mtu) return res; } -int xfrm_init_state(struct xfrm_state *x) +int __xfrm_init_state(struct xfrm_state *x, bool init_replay) { struct xfrm_state_afinfo *afinfo; struct xfrm_mode *inner_mode; @@ -1980,12 +1980,25 @@ int xfrm_init_state(struct xfrm_state *x) if (x->outer_mode == NULL) goto error; + if (init_replay) { + err = xfrm_init_replay(x); + if (err) + goto error; + } + x->km.state = XFRM_STATE_VALID; error: return err; } +EXPORT_SYMBOL(__xfrm_init_state); + +int xfrm_init_state(struct xfrm_state *x) +{ + return __xfrm_init_state(x, true); +} + EXPORT_SYMBOL(xfrm_init_state); int __net_init xfrm_state_init(struct net *net) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 706385ae3e4b..fc152d28753c 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -511,7 +511,7 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, xfrm_mark_get(attrs, &x->mark); - err = xfrm_init_state(x); + err = __xfrm_init_state(x, false); if (err) goto error; -- cgit From 8aa525a9340da4227797a06221ca08399006635f Mon Sep 17 00:00:00 2001 From: James Chapman Date: Mon, 21 Mar 2011 18:10:25 -0700 Subject: l2tp: fix possible oops on l2tp_eth module unload A struct used in the l2tp_eth driver for registering network namespace ops was incorrectly marked as __net_initdata, leading to oops when module unloaded. BUG: unable to handle kernel paging request at ffffffffa00ec098 IP: [] ops_exit_list+0x7/0x4b PGD 142d067 PUD 1431063 PMD 195da8067 PTE 0 Oops: 0000 [#1] SMP last sysfs file: /sys/module/l2tp_eth/refcnt Call Trace: [] ? unregister_pernet_operations+0x32/0x93 [] ? unregister_pernet_device+0x2b/0x38 [] ? sys_delete_module+0x1b8/0x222 [] ? do_munmap+0x254/0x318 [] ? page_fault+0x25/0x30 [] ? system_call_fastpath+0x16/0x1b Signed-off-by: James Chapman Signed-off-by: David S. Miller --- net/l2tp/l2tp_eth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 8d9ce0accc98..a8193f52c13c 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -283,7 +283,7 @@ static __net_init int l2tp_eth_init_net(struct net *net) return 0; } -static __net_initdata struct pernet_operations l2tp_eth_net_ops = { +static struct pernet_operations l2tp_eth_net_ops = { .init = l2tp_eth_init_net, .id = &l2tp_eth_net_id, .size = sizeof(struct l2tp_eth_net), -- cgit From 674f2115995b7b588cbf3540c9f9b2448a8c7ea8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 21 Mar 2011 18:16:39 -0700 Subject: ipx: fix ipx_release() Commit b0d0d915d1d1a0 (remove the BKL) added a regression, because sock_put() can free memory while we are going to use it later. Fix is to delay sock_put() _after_ release_sock(). Reported-by: Ingo Molnar Tested-by: Ingo Molnar Signed-off-by: Eric Dumazet Acked-by: Arnd Bergmann Signed-off-by: David S. Miller --- net/ipx/af_ipx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 2731b51923d1..9680226640ef 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -148,7 +148,6 @@ static void ipx_destroy_socket(struct sock *sk) ipx_remove_socket(sk); skb_queue_purge(&sk->sk_receive_queue); sk_refcnt_debug_dec(sk); - sock_put(sk); } /* @@ -1404,6 +1403,7 @@ static int ipx_release(struct socket *sock) sk_refcnt_debug_release(sk); ipx_destroy_socket(sk); release_sock(sk); + sock_put(sk); out: return 0; } -- cgit From b20e7bbfc7a15a4182730f0936433145992b4b06 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 21 Mar 2011 18:18:00 -0700 Subject: net/appletalk: fix atalk_release use after free The BKL removal in appletalk introduced a use-after-free problem, where atalk_destroy_socket frees a sock, but we still release the socket lock on it. An easy fix is to take an extra reference on the sock and sock_put it when returning from atalk_release. Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- net/appletalk/ddp.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 3d4f4b043406..206e771e82d1 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1051,6 +1051,7 @@ static int atalk_release(struct socket *sock) { struct sock *sk = sock->sk; + sock_hold(sk); lock_sock(sk); if (sk) { sock_orphan(sk); @@ -1058,6 +1059,8 @@ static int atalk_release(struct socket *sock) atalk_destroy_socket(sk); } release_sock(sk); + sock_put(sk); + return 0; } -- cgit From ac0a121d7906b049dfee3649f886c969fbb3c1b7 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 21 Mar 2011 18:20:26 -0700 Subject: net: fix incorrect spelling in drop monitor protocol It was pointed out to me recently that my spelling could be better :) Signed-off-by: Neil Horman Signed-off-by: David S. Miller --- net/core/drop_monitor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 36e603c78ce9..706502ff64aa 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -350,7 +350,7 @@ static int __init init_net_drop_monitor(void) struct per_cpu_dm_data *data; int cpu, rc; - printk(KERN_INFO "Initalizing network drop monitor service\n"); + printk(KERN_INFO "Initializing network drop monitor service\n"); if (sizeof(void *) > 8) { printk(KERN_ERR "Unable to store program counters on this arch, Drop monitor failed\n"); -- cgit From 9d2a8fa96a44ba242de3a6f56acaef7a40a97b97 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 21 Mar 2011 18:23:34 -0700 Subject: net ipv6: Fix duplicate /proc/sys/net/ipv6/neigh directory entries. When I was fixing issues with unregisgtering tables under /proc/sys/net/ipv6/neigh by adding a mount point it appears I missed a critical ordering issue, in the ipv6 initialization. I had not realized that ipv6_sysctl_register is called at the very end of the ipv6 initialization and in particular after we call neigh_sysctl_register from ndisc_init. "neigh" needs to be initialized in ipv6_static_sysctl_register which is the first ipv6 table to initialized, and definitely before ndisc_init. This removes the weirdness of duplicate tables while still providing a "neigh" mount point which prevents races in sysctl unregistering. This was initially reported at https://bugzilla.kernel.org/show_bug.cgi?id=31232 Reported-by: sunkan@zappa.cx Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/ipv6/sysctl_net_ipv6.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 7cb65ef79f9c..6dcf5e7d661b 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -17,6 +17,16 @@ static struct ctl_table empty[1]; +static ctl_table ipv6_static_skeleton[] = { + { + .procname = "neigh", + .maxlen = 0, + .mode = 0555, + .child = empty, + }, + { } +}; + static ctl_table ipv6_table_template[] = { { .procname = "route", @@ -37,12 +47,6 @@ static ctl_table ipv6_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "neigh", - .maxlen = 0, - .mode = 0555, - .child = empty, - }, { } }; @@ -160,7 +164,7 @@ static struct ctl_table_header *ip6_base; int ipv6_static_sysctl_register(void) { - ip6_base = register_sysctl_paths(net_ipv6_ctl_path, empty); + ip6_base = register_sysctl_paths(net_ipv6_ctl_path, ipv6_static_skeleton); if (ip6_base == NULL) return -ENOMEM; return 0; -- cgit From f40f94fc6c3b5a5542d5ed976e9ff69a3b463802 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 21 Mar 2011 10:15:40 +0000 Subject: ipvs: fix a typo in __ip_vs_control_init() Reported-by: Ingo Molnar Signed-off-by: Eric Dumazet Cc: Simon Horman Cc: Julian Anastasov Acked-by: Simon Horman Signed-off-by: David S. Miller --- net/netfilter/ipvs/ip_vs_ctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index b799cea31f95..33733c8872e7 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3605,7 +3605,7 @@ int __net_init __ip_vs_control_init(struct net *net) /* procfs stats */ ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); - if (ipvs->tot_stats.cpustats) { + if (!ipvs->tot_stats.cpustats) { pr_err("%s(): alloc_percpu.\n", __func__); return -ENOMEM; } -- cgit From 736561a01f11114146b1b7f82d486fa9c95828ef Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 21 Mar 2011 15:18:01 +0000 Subject: IPVS: Use global mutex in ip_vs_app.c As part of the work to make IPVS network namespace aware __ip_vs_app_mutex was replaced by a per-namespace lock, ipvs->app_mutex. ipvs->app_key is also supplied for debugging purposes. Unfortunately this implementation results in ipvs->app_key residing in non-static storage which at the very least causes a lockdep warning. This patch takes the rather heavy-handed approach of reinstating __ip_vs_app_mutex which will cover access to the ipvs->list_head of all network namespaces. [ 12.610000] IPVS: Creating netns size=2456 id=0 [ 12.630000] IPVS: Registered protocols (TCP, UDP, SCTP, AH, ESP) [ 12.640000] BUG: key ffff880003bbf1a0 not in .data! [ 12.640000] ------------[ cut here ]------------ [ 12.640000] WARNING: at kernel/lockdep.c:2701 lockdep_init_map+0x37b/0x570() [ 12.640000] Hardware name: Bochs [ 12.640000] Pid: 1, comm: swapper Tainted: G W 2.6.38-kexec-06330-g69b7efe-dirty #122 [ 12.650000] Call Trace: [ 12.650000] [] warn_slowpath_common+0x75/0xb0 [ 12.650000] [] warn_slowpath_null+0x15/0x20 [ 12.650000] [] lockdep_init_map+0x37b/0x570 [ 12.650000] [] ? trace_hardirqs_on+0xd/0x10 [ 12.650000] [] debug_mutex_init+0x38/0x50 [ 12.650000] [] __mutex_init+0x5c/0x70 [ 12.650000] [] __ip_vs_app_init+0x64/0x86 [ 12.660000] [] ? ip_vs_init+0x0/0xff [ 12.660000] [] T.620+0x43/0x170 [ 12.660000] [] ? register_pernet_subsys+0x1a/0x40 [ 12.660000] [] ? ip_vs_init+0x0/0xff [ 12.660000] [] ? ip_vs_init+0x0/0xff [ 12.660000] [] register_pernet_operations+0x57/0xb0 [ 12.660000] [] ? ip_vs_init+0x0/0xff [ 12.670000] [] register_pernet_subsys+0x29/0x40 [ 12.670000] [] ip_vs_app_init+0x10/0x12 [ 12.670000] [] ip_vs_init+0x4c/0xff [ 12.670000] [] do_one_initcall+0x7a/0x12e [ 12.670000] [] kernel_init+0x13e/0x1c2 [ 12.670000] [] kernel_thread_helper+0x4/0x10 [ 12.670000] [] ? restore_args+0x0/0x30 [ 12.680000] [] ? kernel_init+0x0/0x1c2 [ 12.680000] [] ? kernel_thread_helper+0x0/0x1global0 Signed-off-by: Simon Horman Cc: Ingo Molnar Cc: Eric Dumazet Cc: Julian Anastasov Cc: Hans Schillstrom Signed-off-by: David S. Miller --- net/netfilter/ipvs/ip_vs_app.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index 5c48ffb60c28..2dc6de13ac18 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -43,6 +43,8 @@ EXPORT_SYMBOL(register_ip_vs_app); EXPORT_SYMBOL(unregister_ip_vs_app); EXPORT_SYMBOL(register_ip_vs_app_inc); +static DEFINE_MUTEX(__ip_vs_app_mutex); + /* * Get an ip_vs_app object */ @@ -167,14 +169,13 @@ int register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto, __u16 port) { - struct netns_ipvs *ipvs = net_ipvs(net); int result; - mutex_lock(&ipvs->app_mutex); + mutex_lock(&__ip_vs_app_mutex); result = ip_vs_app_inc_new(net, app, proto, port); - mutex_unlock(&ipvs->app_mutex); + mutex_unlock(&__ip_vs_app_mutex); return result; } @@ -189,11 +190,11 @@ int register_ip_vs_app(struct net *net, struct ip_vs_app *app) /* increase the module use count */ ip_vs_use_count_inc(); - mutex_lock(&ipvs->app_mutex); + mutex_lock(&__ip_vs_app_mutex); list_add(&app->a_list, &ipvs->app_list); - mutex_unlock(&ipvs->app_mutex); + mutex_unlock(&__ip_vs_app_mutex); return 0; } @@ -205,10 +206,9 @@ int register_ip_vs_app(struct net *net, struct ip_vs_app *app) */ void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app) { - struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_app *inc, *nxt; - mutex_lock(&ipvs->app_mutex); + mutex_lock(&__ip_vs_app_mutex); list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) { ip_vs_app_inc_release(net, inc); @@ -216,7 +216,7 @@ void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app) list_del(&app->a_list); - mutex_unlock(&ipvs->app_mutex); + mutex_unlock(&__ip_vs_app_mutex); /* decrease the module use count */ ip_vs_use_count_dec(); @@ -501,7 +501,7 @@ static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos) struct net *net = seq_file_net(seq); struct netns_ipvs *ipvs = net_ipvs(net); - mutex_lock(&ipvs->app_mutex); + mutex_lock(&__ip_vs_app_mutex); return *pos ? ip_vs_app_idx(ipvs, *pos - 1) : SEQ_START_TOKEN; } @@ -535,9 +535,7 @@ static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void ip_vs_app_seq_stop(struct seq_file *seq, void *v) { - struct netns_ipvs *ipvs = net_ipvs(seq_file_net(seq)); - - mutex_unlock(&ipvs->app_mutex); + mutex_unlock(&__ip_vs_app_mutex); } static int ip_vs_app_seq_show(struct seq_file *seq, void *v) @@ -583,7 +581,6 @@ static int __net_init __ip_vs_app_init(struct net *net) struct netns_ipvs *ipvs = net_ipvs(net); INIT_LIST_HEAD(&ipvs->app_list); - __mutex_init(&ipvs->app_mutex, "ipvs->app_mutex", &ipvs->app_key); proc_net_fops_create(net, "ip_vs_app", 0, &ip_vs_app_fops); return 0; } -- cgit From 27660515a21bf913e3208ded3f27abd0529fae0e Mon Sep 17 00:00:00 2001 From: Michał Mirosław Date: Fri, 18 Mar 2011 16:56:34 +0000 Subject: net: implement dev_disable_lro() hw_features compatibility MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement compatibility with new hw_features for dev_disable_lro(). This is a transition path - dev_disable_lro() should be later integrated into netdev_fix_features() after all drivers are converted. Signed-off-by: Michał Mirosław Signed-off-by: David S. Miller --- net/core/dev.c | 19 +++++++++++-------- net/core/ethtool.c | 2 +- 2 files changed, 12 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 0b88eba97dab..f453370131a0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1353,14 +1353,17 @@ EXPORT_SYMBOL(dev_close); */ void dev_disable_lro(struct net_device *dev) { - if (dev->ethtool_ops && dev->ethtool_ops->get_flags && - dev->ethtool_ops->set_flags) { - u32 flags = dev->ethtool_ops->get_flags(dev); - if (flags & ETH_FLAG_LRO) { - flags &= ~ETH_FLAG_LRO; - dev->ethtool_ops->set_flags(dev, flags); - } - } + u32 flags; + + if (dev->ethtool_ops && dev->ethtool_ops->get_flags) + flags = dev->ethtool_ops->get_flags(dev); + else + flags = ethtool_op_get_flags(dev); + + if (!(flags & ETH_FLAG_LRO)) + return; + + __ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO); WARN_ON(dev->features & NETIF_F_LRO); } EXPORT_SYMBOL(dev_disable_lro); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index a1086fb0c0c7..24bd57493c0d 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -513,7 +513,7 @@ static int ethtool_set_one_feature(struct net_device *dev, } } -static int __ethtool_set_flags(struct net_device *dev, u32 data) +int __ethtool_set_flags(struct net_device *dev, u32 data) { u32 changed; -- cgit From 74cb3c108bc0f599a4eb40980db8580cfba725c9 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sat, 19 Mar 2011 12:13:46 +0000 Subject: ipv4: match prefsrc when deleting routes fib_table_delete forgets to match the routes by prefsrc. Callers can specify known IP in fc_prefsrc and we should remove the exact route. This is needed for cases when same local or broadcast addresses are used in different subnets and the routes differ only in prefsrc. All callers that do not provide fc_prefsrc will ignore the route prefsrc as before and will delete the first occurence. That is how the ip route del default magic works. Current callers are: - ip_rt_ioctl where rtentry_to_fib_config provides fc_prefsrc only when the provided device name matches IP label with colon. - inet_rtm_delroute where RTA_PREFSRC is optional too - fib_magic which deals with routes when deleting addresses and where the fc_prefsrc is always set with the primary IP for the concerned IFA. Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 3d28a35c2e1a..ac87a49ad50b 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1665,6 +1665,8 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) && (cfg->fc_scope == RT_SCOPE_NOWHERE || fa->fa_scope == cfg->fc_scope) && + (!cfg->fc_prefsrc || + fi->fib_prefsrc == cfg->fc_prefsrc) && (!cfg->fc_protocol || fi->fib_protocol == cfg->fc_protocol) && fib_nh_match(cfg, fi) == 0) { -- cgit From e6abbaa2725a43cf5d26c4c2a5dc6c0f6029ea19 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sat, 19 Mar 2011 12:13:49 +0000 Subject: ipv4: fix route deletion for IPs on many subnets Alex Sidorenko reported for problems with local routes left after IP addresses are deleted. It happens when same IPs are used in more than one subnet for the device. Fix fib_del_ifaddr to restrict the checks for duplicate local and broadcast addresses only to the IFAs that use our primary IFA or another primary IFA with same address. And we expect the prefsrc to be matched when the routes are deleted because it is possible they to differ only by prefsrc. This patch prevents local and broadcast routes to be leaked until their primary IP is deleted finally from the box. As the secondary address promotion needs to delete the routes for all secondaries that used the old primary IFA, add option to ignore these secondaries from the checks and to assume they are already deleted, so that we can safely delete the route while these IFAs are still on the device list. Reported-by: Alex Sidorenko Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 101 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 88 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index a373a259253c..02c3ba61884a 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -722,12 +722,17 @@ void fib_add_ifaddr(struct in_ifaddr *ifa) } } -static void fib_del_ifaddr(struct in_ifaddr *ifa) +/* Delete primary or secondary address. + * Optionally, on secondary address promotion consider the addresses + * from subnet iprim as deleted, even if they are in device list. + * In this case the secondary ifa can be in device list. + */ +void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) { struct in_device *in_dev = ifa->ifa_dev; struct net_device *dev = in_dev->dev; struct in_ifaddr *ifa1; - struct in_ifaddr *prim = ifa; + struct in_ifaddr *prim = ifa, *prim1 = NULL; __be32 brd = ifa->ifa_address | ~ifa->ifa_mask; __be32 any = ifa->ifa_address & ifa->ifa_mask; #define LOCAL_OK 1 @@ -735,17 +740,26 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) #define BRD0_OK 4 #define BRD1_OK 8 unsigned ok = 0; + int subnet = 0; /* Primary network */ + int gone = 1; /* Address is missing */ + int same_prefsrc = 0; /* Another primary with same IP */ - if (!(ifa->ifa_flags & IFA_F_SECONDARY)) - fib_magic(RTM_DELROUTE, - dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, - any, ifa->ifa_prefixlen, prim); - else { + if (ifa->ifa_flags & IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); if (prim == NULL) { printk(KERN_WARNING "fib_del_ifaddr: bug: prim == NULL\n"); return; } + if (iprim && iprim != prim) { + printk(KERN_WARNING "fib_del_ifaddr: bug: iprim != prim\n"); + return; + } + } else if (!ipv4_is_zeronet(any) && + (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) { + fib_magic(RTM_DELROUTE, + dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, + any, ifa->ifa_prefixlen, prim); + subnet = 1; } /* Deletion is more complicated than add. @@ -755,6 +769,49 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) */ for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) { + if (ifa1 == ifa) { + /* promotion, keep the IP */ + gone = 0; + continue; + } + /* Ignore IFAs from our subnet */ + if (iprim && ifa1->ifa_mask == iprim->ifa_mask && + inet_ifa_match(ifa1->ifa_address, iprim)) + continue; + + /* Ignore ifa1 if it uses different primary IP (prefsrc) */ + if (ifa1->ifa_flags & IFA_F_SECONDARY) { + /* Another address from our subnet? */ + if (ifa1->ifa_mask == prim->ifa_mask && + inet_ifa_match(ifa1->ifa_address, prim)) + prim1 = prim; + else { + /* We reached the secondaries, so + * same_prefsrc should be determined. + */ + if (!same_prefsrc) + continue; + /* Search new prim1 if ifa1 is not + * using the current prim1 + */ + if (!prim1 || + ifa1->ifa_mask != prim1->ifa_mask || + !inet_ifa_match(ifa1->ifa_address, prim1)) + prim1 = inet_ifa_byprefix(in_dev, + ifa1->ifa_address, + ifa1->ifa_mask); + if (!prim1) + continue; + if (prim1->ifa_local != prim->ifa_local) + continue; + } + } else { + if (prim->ifa_local != ifa1->ifa_local) + continue; + prim1 = ifa1; + if (prim != prim1) + same_prefsrc = 1; + } if (ifa->ifa_local == ifa1->ifa_local) ok |= LOCAL_OK; if (ifa->ifa_broadcast == ifa1->ifa_broadcast) @@ -763,19 +820,37 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) ok |= BRD1_OK; if (any == ifa1->ifa_broadcast) ok |= BRD0_OK; + /* primary has network specific broadcasts */ + if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) { + __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask; + __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask; + + if (!ipv4_is_zeronet(any1)) { + if (ifa->ifa_broadcast == brd1 || + ifa->ifa_broadcast == any1) + ok |= BRD_OK; + if (brd == brd1 || brd == any1) + ok |= BRD1_OK; + if (any == brd1 || any == any1) + ok |= BRD0_OK; + } + } } if (!(ok & BRD_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); - if (!(ok & BRD1_OK)) - fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim); - if (!(ok & BRD0_OK)) - fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); + if (subnet && ifa->ifa_prefixlen < 31) { + if (!(ok & BRD1_OK)) + fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim); + if (!(ok & BRD0_OK)) + fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); + } if (!(ok & LOCAL_OK)) { fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim); /* Check, that this local address finally disappeared. */ - if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) { + if (gone && + inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) { /* And the last, but not the least thing. * We must flush stray FIB entries. * @@ -896,7 +971,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, rt_cache_flush(dev_net(dev), -1); break; case NETDEV_DOWN: - fib_del_ifaddr(ifa); + fib_del_ifaddr(ifa, NULL); fib_update_nh_saddrs(dev); if (ifa->ifa_dev->ifa_list == NULL) { /* Last address was deleted from this interface. -- cgit From 2d230e2b2c3111cf4a11619f60dcd158ae84e3ab Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sat, 19 Mar 2011 12:13:52 +0000 Subject: ipv4: remove the routes on secondary promotion The secondary address promotion relies on fib_sync_down_addr to remove all routes created for the secondary addresses when the old primary address is deleted. It does not happen for cases when the primary address is also in another subnet. Fix that by deleting local and broadcast routes for all secondaries while they are on device list and by faking that all addresses from this subnet are to be deleted. It relies on fib_del_ifaddr being able to ignore the IPs from the concerned subnet while checking for duplication. Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'net') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 6d85800daeb7..2523001f4c9a 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -345,6 +345,17 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, } } + /* On promotion all secondaries from subnet are changing + * the primary IP, we must remove all their routes silently + * and later to add them back with new prefsrc. Do this + * while all addresses are on the device list. + */ + for (ifa = promote; ifa; ifa = ifa->ifa_next) { + if (ifa1->ifa_mask == ifa->ifa_mask && + inet_ifa_match(ifa1->ifa_address, ifa)) + fib_del_ifaddr(ifa, ifa1); + } + /* 2. Unlink it */ *ifap = ifa1->ifa_next; -- cgit From 04024b937a6e9b7d4320b5853557cea3930d528c Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sat, 19 Mar 2011 12:13:54 +0000 Subject: ipv4: optimize route adding on secondary promotion Optimize the calling of fib_add_ifaddr for all secondary addresses after the promoted one to start from their place, not from the new place of the promoted secondary. It will save some CPU cycles because we are sure the promoted secondary was first for the subnet and all next secondaries do not change their place. Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 2523001f4c9a..d5a4553bebc3 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -375,6 +375,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); if (promote) { + struct in_ifaddr *next_sec = promote->ifa_next; if (prev_prom) { prev_prom->ifa_next = promote->ifa_next; @@ -386,7 +387,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, rtmsg_ifa(RTM_NEWADDR, promote, nlh, pid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, promote); - for (ifa = promote->ifa_next; ifa; ifa = ifa->ifa_next) { + for (ifa = next_sec; ifa; ifa = ifa->ifa_next) { if (ifa1->ifa_mask != ifa->ifa_mask || !inet_ifa_match(ifa1->ifa_address, ifa)) continue; -- cgit From a40c4f10e3fb96030358e49abd010c1f08446fa3 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Mon, 21 Mar 2011 15:07:16 -0700 Subject: libceph: add lingering request and watch/notify event framework Lingering requests are requests that are sent to the OSD normally but tracked also after we get a successful request. This keeps the OSD connection open and resends the original request if the object moves to another OSD. The OSD can then send notification messages back to us if another client initiates a notify. This framework will be used by RBD so that the client gets notification when a snapshot is created by another node or tool. Signed-off-by: Yehuda Sadeh Signed-off-by: Sage Weil --- net/ceph/ceph_common.c | 1 + net/ceph/osd_client.c | 385 +++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 374 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index f3e4a13fea0c..95f96ab94bba 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -62,6 +62,7 @@ const char *ceph_msg_type_name(int type) case CEPH_MSG_OSD_MAP: return "osd_map"; case CEPH_MSG_OSD_OP: return "osd_op"; case CEPH_MSG_OSD_OPREPLY: return "osd_opreply"; + case CEPH_MSG_WATCH_NOTIFY: return "watch_notify"; default: return "unknown"; } } diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index b85ed5a5503d..02212ed50852 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -25,6 +25,12 @@ static const struct ceph_connection_operations osd_con_ops; static void send_queued(struct ceph_osd_client *osdc); static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd); +static void __register_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req); +static void __unregister_linger_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req); +static int __send_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req); static int op_needs_trail(int op) { @@ -33,6 +39,7 @@ static int op_needs_trail(int op) case CEPH_OSD_OP_SETXATTR: case CEPH_OSD_OP_CMPXATTR: case CEPH_OSD_OP_CALL: + case CEPH_OSD_OP_NOTIFY: return 1; default: return 0; @@ -208,6 +215,8 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, init_completion(&req->r_completion); init_completion(&req->r_safe_completion); INIT_LIST_HEAD(&req->r_unsafe_item); + INIT_LIST_HEAD(&req->r_linger_item); + INIT_LIST_HEAD(&req->r_linger_osd); req->r_flags = flags; WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0); @@ -314,6 +323,24 @@ static void osd_req_encode_op(struct ceph_osd_request *req, break; case CEPH_OSD_OP_STARTSYNC: break; + case CEPH_OSD_OP_NOTIFY: + { + __le32 prot_ver = cpu_to_le32(src->watch.prot_ver); + __le32 timeout = cpu_to_le32(src->watch.timeout); + + BUG_ON(!req->r_trail); + + ceph_pagelist_append(req->r_trail, + &prot_ver, sizeof(prot_ver)); + ceph_pagelist_append(req->r_trail, + &timeout, sizeof(timeout)); + } + case CEPH_OSD_OP_NOTIFY_ACK: + case CEPH_OSD_OP_WATCH: + dst->watch.cookie = cpu_to_le64(src->watch.cookie); + dst->watch.ver = cpu_to_le64(src->watch.ver); + dst->watch.flag = src->watch.flag; + break; default: pr_err("unrecognized osd opcode %d\n", dst->op); WARN_ON(1); @@ -534,7 +561,7 @@ __lookup_request_ge(struct ceph_osd_client *osdc, static void __kick_osd_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd) { - struct ceph_osd_request *req; + struct ceph_osd_request *req, *nreq; int err; dout("__kick_osd_requests osd%d\n", osd->o_osd); @@ -546,7 +573,17 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc, list_move(&req->r_req_lru_item, &osdc->req_unsent); dout("requeued %p tid %llu osd%d\n", req, req->r_tid, osd->o_osd); - req->r_flags |= CEPH_OSD_FLAG_RETRY; + if (!req->r_linger) + req->r_flags |= CEPH_OSD_FLAG_RETRY; + } + + list_for_each_entry_safe(req, nreq, &osd->o_linger_requests, + r_linger_osd) { + __unregister_linger_request(osdc, req); + __register_request(osdc, req); + list_move(&req->r_req_lru_item, &osdc->req_unsent); + dout("requeued lingering %p tid %llu osd%d\n", req, req->r_tid, + osd->o_osd); } } @@ -590,6 +627,7 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc) atomic_set(&osd->o_ref, 1); osd->o_osdc = osdc; INIT_LIST_HEAD(&osd->o_requests); + INIT_LIST_HEAD(&osd->o_linger_requests); INIT_LIST_HEAD(&osd->o_osd_lru); osd->o_incarnation = 1; @@ -679,7 +717,8 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) int ret = 0; dout("__reset_osd %p osd%d\n", osd, osd->o_osd); - if (list_empty(&osd->o_requests)) { + if (list_empty(&osd->o_requests) && + list_empty(&osd->o_linger_requests)) { __remove_osd(osdc, osd); } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd], &osd->o_con.peer_addr, @@ -752,10 +791,9 @@ static void __cancel_osd_timeout(struct ceph_osd_client *osdc) * Register request, assign tid. If this is the first request, set up * the timeout event. */ -static void register_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) +static void __register_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) { - mutex_lock(&osdc->request_mutex); req->r_tid = ++osdc->last_tid; req->r_request->hdr.tid = cpu_to_le64(req->r_tid); INIT_LIST_HEAD(&req->r_req_lru_item); @@ -769,6 +807,13 @@ static void register_request(struct ceph_osd_client *osdc, dout(" first request, scheduling timeout\n"); __schedule_osd_timeout(osdc); } +} + +static void register_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + mutex_lock(&osdc->request_mutex); + __register_request(osdc, req); mutex_unlock(&osdc->request_mutex); } @@ -787,9 +832,14 @@ static void __unregister_request(struct ceph_osd_client *osdc, ceph_con_revoke(&req->r_osd->o_con, req->r_request); list_del_init(&req->r_osd_item); - if (list_empty(&req->r_osd->o_requests)) + if (list_empty(&req->r_osd->o_requests) && + list_empty(&req->r_osd->o_linger_requests)) { + dout("moving osd to %p lru\n", req->r_osd); __move_osd_to_lru(osdc, req->r_osd); - req->r_osd = NULL; + } + if (list_empty(&req->r_osd_item) && + list_empty(&req->r_linger_item)) + req->r_osd = NULL; } ceph_osdc_put_request(req); @@ -812,6 +862,58 @@ static void __cancel_request(struct ceph_osd_request *req) } } +static void __register_linger_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + dout("__register_linger_request %p\n", req); + list_add_tail(&req->r_linger_item, &osdc->req_linger); + list_add_tail(&req->r_linger_osd, &req->r_osd->o_linger_requests); +} + +static void __unregister_linger_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + dout("__unregister_linger_request %p\n", req); + if (req->r_osd) { + list_del_init(&req->r_linger_item); + list_del_init(&req->r_linger_osd); + + if (list_empty(&req->r_osd->o_requests) && + list_empty(&req->r_osd->o_linger_requests)) { + dout("moving osd to %p lru\n", req->r_osd); + __move_osd_to_lru(osdc, req->r_osd); + } + req->r_osd = NULL; + } +} + +void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + mutex_lock(&osdc->request_mutex); + if (req->r_linger) { + __unregister_linger_request(osdc, req); + ceph_osdc_put_request(req); + } + mutex_unlock(&osdc->request_mutex); +} +EXPORT_SYMBOL(ceph_osdc_unregister_linger_request); + +void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + if (!req->r_linger) { + dout("set_request_linger %p\n", req); + req->r_linger = 1; + /* + * caller is now responsible for calling + * unregister_linger_request + */ + ceph_osdc_get_request(req); + } +} +EXPORT_SYMBOL(ceph_osdc_set_request_linger); + /* * Pick an osd (the first 'up' osd in the pg), allocate the osd struct * (as needed), and set the request r_osd appropriately. If there is @@ -958,7 +1060,6 @@ static void handle_timeout(struct work_struct *work) osdc->client->options->osd_keepalive_timeout * HZ; unsigned long last_stamp = 0; struct list_head slow_osds; - dout("timeout\n"); down_read(&osdc->map_sem); @@ -1060,7 +1161,6 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, numops * sizeof(struct ceph_osd_op)) goto bad; dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result); - /* lookup */ mutex_lock(&osdc->request_mutex); req = __lookup_request(osdc, tid); @@ -1104,6 +1204,9 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, dout("handle_reply tid %llu flags %d\n", tid, flags); + if (req->r_linger && (flags & CEPH_OSD_FLAG_ONDISK)) + __register_linger_request(osdc, req); + /* either this is a read, or we got the safe response */ if (result < 0 || (flags & CEPH_OSD_FLAG_ONDISK) || @@ -1124,6 +1227,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, } done: + dout("req=%p req->r_linger=%d\n", req, req->r_linger); ceph_osdc_put_request(req); return; @@ -1159,7 +1263,7 @@ static void reset_changed_osds(struct ceph_osd_client *osdc) */ static void kick_requests(struct ceph_osd_client *osdc) { - struct ceph_osd_request *req; + struct ceph_osd_request *req, *nreq; struct rb_node *p; int needmap = 0; int err; @@ -1177,8 +1281,30 @@ static void kick_requests(struct ceph_osd_client *osdc) } else if (err > 0) { dout("%p tid %llu requeued on osd%d\n", req, req->r_tid, req->r_osd ? req->r_osd->o_osd : -1); - req->r_flags |= CEPH_OSD_FLAG_RETRY; + if (!req->r_linger) + req->r_flags |= CEPH_OSD_FLAG_RETRY; + } + } + + list_for_each_entry_safe(req, nreq, &osdc->req_linger, + r_linger_item) { + dout("linger req=%p req->r_osd=%p\n", req, req->r_osd); + + err = __map_request(osdc, req); + if (err == 0) + continue; /* no change and no osd was specified */ + if (err < 0) + continue; /* hrm! */ + if (req->r_osd == NULL) { + dout("tid %llu maps to no valid osd\n", req->r_tid); + needmap++; /* request a newer map */ + continue; } + + dout("kicking lingering %p tid %llu osd%d\n", req, req->r_tid, + req->r_osd ? req->r_osd->o_osd : -1); + __unregister_linger_request(osdc, req); + __register_request(osdc, req); } mutex_unlock(&osdc->request_mutex); @@ -1301,6 +1427,223 @@ bad: return; } +/* + * watch/notify callback event infrastructure + * + * These callbacks are used both for watch and notify operations. + */ +static void __release_event(struct kref *kref) +{ + struct ceph_osd_event *event = + container_of(kref, struct ceph_osd_event, kref); + + dout("__release_event %p\n", event); + kfree(event); +} + +static void get_event(struct ceph_osd_event *event) +{ + kref_get(&event->kref); +} + +void ceph_osdc_put_event(struct ceph_osd_event *event) +{ + kref_put(&event->kref, __release_event); +} +EXPORT_SYMBOL(ceph_osdc_put_event); + +static void __insert_event(struct ceph_osd_client *osdc, + struct ceph_osd_event *new) +{ + struct rb_node **p = &osdc->event_tree.rb_node; + struct rb_node *parent = NULL; + struct ceph_osd_event *event = NULL; + + while (*p) { + parent = *p; + event = rb_entry(parent, struct ceph_osd_event, node); + if (new->cookie < event->cookie) + p = &(*p)->rb_left; + else if (new->cookie > event->cookie) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&new->node, parent, p); + rb_insert_color(&new->node, &osdc->event_tree); +} + +static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc, + u64 cookie) +{ + struct rb_node **p = &osdc->event_tree.rb_node; + struct rb_node *parent = NULL; + struct ceph_osd_event *event = NULL; + + while (*p) { + parent = *p; + event = rb_entry(parent, struct ceph_osd_event, node); + if (cookie < event->cookie) + p = &(*p)->rb_left; + else if (cookie > event->cookie) + p = &(*p)->rb_right; + else + return event; + } + return NULL; +} + +static void __remove_event(struct ceph_osd_event *event) +{ + struct ceph_osd_client *osdc = event->osdc; + + if (!RB_EMPTY_NODE(&event->node)) { + dout("__remove_event removed %p\n", event); + rb_erase(&event->node, &osdc->event_tree); + ceph_osdc_put_event(event); + } else { + dout("__remove_event didn't remove %p\n", event); + } +} + +int ceph_osdc_create_event(struct ceph_osd_client *osdc, + void (*event_cb)(u64, u64, u8, void *), + int one_shot, void *data, + struct ceph_osd_event **pevent) +{ + struct ceph_osd_event *event; + + event = kmalloc(sizeof(*event), GFP_NOIO); + if (!event) + return -ENOMEM; + + dout("create_event %p\n", event); + event->cb = event_cb; + event->one_shot = one_shot; + event->data = data; + event->osdc = osdc; + INIT_LIST_HEAD(&event->osd_node); + kref_init(&event->kref); /* one ref for us */ + kref_get(&event->kref); /* one ref for the caller */ + init_completion(&event->completion); + + spin_lock(&osdc->event_lock); + event->cookie = ++osdc->event_count; + __insert_event(osdc, event); + spin_unlock(&osdc->event_lock); + + *pevent = event; + return 0; +} +EXPORT_SYMBOL(ceph_osdc_create_event); + +void ceph_osdc_cancel_event(struct ceph_osd_event *event) +{ + struct ceph_osd_client *osdc = event->osdc; + + dout("cancel_event %p\n", event); + spin_lock(&osdc->event_lock); + __remove_event(event); + spin_unlock(&osdc->event_lock); + ceph_osdc_put_event(event); /* caller's */ +} +EXPORT_SYMBOL(ceph_osdc_cancel_event); + + +static void do_event_work(struct work_struct *work) +{ + struct ceph_osd_event_work *event_work = + container_of(work, struct ceph_osd_event_work, work); + struct ceph_osd_event *event = event_work->event; + u64 ver = event_work->ver; + u64 notify_id = event_work->notify_id; + u8 opcode = event_work->opcode; + + dout("do_event_work completing %p\n", event); + event->cb(ver, notify_id, opcode, event->data); + complete(&event->completion); + dout("do_event_work completed %p\n", event); + ceph_osdc_put_event(event); + kfree(event_work); +} + + +/* + * Process osd watch notifications + */ +void handle_watch_notify(struct ceph_osd_client *osdc, struct ceph_msg *msg) +{ + void *p, *end; + u8 proto_ver; + u64 cookie, ver, notify_id; + u8 opcode; + struct ceph_osd_event *event; + struct ceph_osd_event_work *event_work; + + p = msg->front.iov_base; + end = p + msg->front.iov_len; + + ceph_decode_8_safe(&p, end, proto_ver, bad); + ceph_decode_8_safe(&p, end, opcode, bad); + ceph_decode_64_safe(&p, end, cookie, bad); + ceph_decode_64_safe(&p, end, ver, bad); + ceph_decode_64_safe(&p, end, notify_id, bad); + + spin_lock(&osdc->event_lock); + event = __find_event(osdc, cookie); + if (event) { + get_event(event); + if (event->one_shot) + __remove_event(event); + } + spin_unlock(&osdc->event_lock); + dout("handle_watch_notify cookie %lld ver %lld event %p\n", + cookie, ver, event); + if (event) { + event_work = kmalloc(sizeof(*event_work), GFP_NOIO); + INIT_WORK(&event_work->work, do_event_work); + if (!event_work) { + dout("ERROR: could not allocate event_work\n"); + goto done_err; + } + event_work->event = event; + event_work->ver = ver; + event_work->notify_id = notify_id; + event_work->opcode = opcode; + if (!queue_work(osdc->notify_wq, &event_work->work)) { + dout("WARNING: failed to queue notify event work\n"); + goto done_err; + } + } + + return; + +done_err: + complete(&event->completion); + ceph_osdc_put_event(event); + return; + +bad: + pr_err("osdc handle_watch_notify corrupt msg\n"); + return; +} + +int ceph_osdc_wait_event(struct ceph_osd_event *event, unsigned long timeout) +{ + int err; + + dout("wait_event %p\n", event); + err = wait_for_completion_interruptible_timeout(&event->completion, + timeout * HZ); + ceph_osdc_put_event(event); + if (err > 0) + err = 0; + dout("wait_event %p returns %d\n", event, err); + return err; +} +EXPORT_SYMBOL(ceph_osdc_wait_event); + /* * Register request, send initial attempt. */ @@ -1430,9 +1773,13 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) INIT_LIST_HEAD(&osdc->req_lru); INIT_LIST_HEAD(&osdc->req_unsent); INIT_LIST_HEAD(&osdc->req_notarget); + INIT_LIST_HEAD(&osdc->req_linger); osdc->num_requests = 0; INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout); INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); + spin_lock_init(&osdc->event_lock); + osdc->event_tree = RB_ROOT; + osdc->event_count = 0; schedule_delayed_work(&osdc->osds_timeout_work, round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ)); @@ -1452,6 +1799,13 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) "osd_op_reply"); if (err < 0) goto out_msgpool; + + osdc->notify_wq = create_singlethread_workqueue("ceph-watch-notify"); + if (IS_ERR(osdc->notify_wq)) { + err = PTR_ERR(osdc->notify_wq); + osdc->notify_wq = NULL; + goto out_msgpool; + } return 0; out_msgpool: @@ -1465,6 +1819,8 @@ EXPORT_SYMBOL(ceph_osdc_init); void ceph_osdc_stop(struct ceph_osd_client *osdc) { + flush_workqueue(osdc->notify_wq); + destroy_workqueue(osdc->notify_wq); cancel_delayed_work_sync(&osdc->timeout_work); cancel_delayed_work_sync(&osdc->osds_timeout_work); if (osdc->osdmap) { @@ -1472,6 +1828,7 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc) osdc->osdmap = NULL; } remove_old_osds(osdc, 1); + WARN_ON(!RB_EMPTY_ROOT(&osdc->osds)); mempool_destroy(osdc->req_mempool); ceph_msgpool_destroy(&osdc->msgpool_op); ceph_msgpool_destroy(&osdc->msgpool_op_reply); @@ -1580,6 +1937,9 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) case CEPH_MSG_OSD_OPREPLY: handle_reply(osdc, msg, con); break; + case CEPH_MSG_WATCH_NOTIFY: + handle_watch_notify(osdc, msg); + break; default: pr_err("received unknown message type %d %s\n", type, @@ -1673,6 +2033,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con, switch (type) { case CEPH_MSG_OSD_MAP: + case CEPH_MSG_WATCH_NOTIFY: return ceph_msg_new(type, front, GFP_NOFS); case CEPH_MSG_OSD_OPREPLY: return get_reply(con, hdr, skip); -- cgit From eeff66ef6e33925f615d49e6c846263e342ab60e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 8 Mar 2011 16:39:47 +0530 Subject: net/9p: Convert the in the 9p rpc call path to GFP_NOFS Without this we can cause reclaim allocation in writepage. [ 3433.448430] ================================= [ 3433.449117] [ INFO: inconsistent lock state ] [ 3433.449117] 2.6.38-rc5+ #84 [ 3433.449117] --------------------------------- [ 3433.449117] inconsistent {RECLAIM_FS-ON-W} -> {IN-RECLAIM_FS-R} usage. [ 3433.449117] kswapd0/505 [HC0[0]:SC0[0]:HE1:SE1] takes: [ 3433.449117] (iprune_sem){+++++-}, at: [] shrink_icache_memory+0x45/0x2b1 [ 3433.449117] {RECLAIM_FS-ON-W} state was registered at: [ 3433.449117] [] mark_held_locks+0x52/0x70 [ 3433.449117] [] lockdep_trace_alloc+0x85/0x9f [ 3433.449117] [] slab_pre_alloc_hook+0x18/0x3c [ 3433.449117] [] kmem_cache_alloc+0x23/0xa2 [ 3433.449117] [] idr_pre_get+0x2d/0x6f [ 3433.449117] [] p9_idpool_get+0x30/0xae [ 3433.449117] [] p9_client_rpc+0xd7/0x9b0 [ 3433.449117] [] p9_client_clunk+0x88/0xdb [ 3433.449117] [] v9fs_evict_inode+0x3c/0x48 [ 3433.449117] [] evict+0x1f/0x87 [ 3433.449117] [] dispose_list+0x47/0xe3 [ 3433.449117] [] evict_inodes+0x138/0x14f [ 3433.449117] [] generic_shutdown_super+0x57/0xe8 [ 3433.449117] [] kill_anon_super+0x11/0x50 [ 3433.449117] [] v9fs_kill_super+0x49/0xab [ 3433.449117] [] deactivate_locked_super+0x21/0x46 [ 3433.449117] [] deactivate_super+0x40/0x44 [ 3433.449117] [] mntput_no_expire+0x100/0x109 [ 3433.449117] [] sys_umount+0x2f1/0x31c [ 3433.449117] [] system_call_fastpath+0x16/0x1b [ 3433.449117] irq event stamp: 192941 [ 3433.449117] hardirqs last enabled at (192941): [] _raw_spin_unlock_irq+0x2b/0x30 [ 3433.449117] hardirqs last disabled at (192940): [] shrink_inactive_list+0x290/0x2f5 [ 3433.449117] softirqs last enabled at (188470): [] __do_softirq+0x133/0x152 [ 3433.449117] softirqs last disabled at (188455): [] call_softirq+0x1c/0x28 [ 3433.449117] [ 3433.449117] other info that might help us debug this: [ 3433.449117] 1 lock held by kswapd0/505: [ 3433.449117] #0: (shrinker_rwsem){++++..}, at: [] shrink_slab+0x38/0x15f [ 3433.449117] [ 3433.449117] stack backtrace: [ 3433.449117] Pid: 505, comm: kswapd0 Not tainted 2.6.38-rc5+ #84 [ 3433.449117] Call Trace: [ 3433.449117] [] ? valid_state+0x17e/0x191 [ 3433.449117] [] ? save_stack_trace+0x28/0x45 [ 3433.449117] [] ? check_usage_forwards+0x0/0x87 [ 3433.449117] [] ? mark_lock+0x113/0x22c [ 3433.449117] [] ? __lock_acquire+0x37a/0xcf7 [ 3433.449117] [] ? mark_lock+0x2d/0x22c [ 3433.449117] [] ? __lock_acquire+0x392/0xcf7 [ 3433.449117] [] ? determine_dirtyable_memory+0x15/0x28 [ 3433.449117] [] ? lock_acquire+0x57/0x6d [ 3433.449117] [] ? shrink_icache_memory+0x45/0x2b1 [ 3433.449117] [] ? down_read+0x47/0x5c [ 3433.449117] [] ? shrink_icache_memory+0x45/0x2b1 [ 3433.449117] [] ? shrink_icache_memory+0x45/0x2b1 [ 3433.449117] [] ? shrink_slab+0xdb/0x15f [ 3433.449117] [] ? kswapd+0x574/0x96a [ 3433.449117] [] ? kswapd+0x0/0x96a [ 3433.449117] [] ? kthread+0x7d/0x85 [ 3433.449117] [] ? kernel_thread_helper+0x4/0x10 [ 3433.449117] [] ? restore_args+0x0/0x30 [ 3433.449117] [] ? kthread+0x0/0x85 [ 3433.449117] [] ? kernel_thread_helper+0x0/0x10 Signed-off-by: Aneesh Kumar K.V Signed-off-by: Venkateswararao Jujjuri Signed-off-by: Eric Van Hensbergen --- net/9p/client.c | 10 +++++----- net/9p/protocol.c | 6 +++--- net/9p/trans_fd.c | 2 +- net/9p/trans_rdma.c | 6 +++--- net/9p/util.c | 2 +- 5 files changed, 13 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/9p/client.c b/net/9p/client.c index 347ec0cd2718..2ccbf04d37df 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -223,7 +223,7 @@ static struct p9_req_t *p9_tag_alloc(struct p9_client *c, u16 tag) req = &c->reqs[row][col]; if (!req->tc) { - req->wq = kmalloc(sizeof(wait_queue_head_t), GFP_KERNEL); + req->wq = kmalloc(sizeof(wait_queue_head_t), GFP_NOFS); if (!req->wq) { printk(KERN_ERR "Couldn't grow tag array\n"); return ERR_PTR(-ENOMEM); @@ -233,17 +233,17 @@ static struct p9_req_t *p9_tag_alloc(struct p9_client *c, u16 tag) P9_TRANS_PREF_PAYLOAD_SEP) { int alloc_msize = min(c->msize, 4096); req->tc = kmalloc(sizeof(struct p9_fcall)+alloc_msize, - GFP_KERNEL); + GFP_NOFS); req->tc->capacity = alloc_msize; req->rc = kmalloc(sizeof(struct p9_fcall)+alloc_msize, - GFP_KERNEL); + GFP_NOFS); req->rc->capacity = alloc_msize; } else { req->tc = kmalloc(sizeof(struct p9_fcall)+c->msize, - GFP_KERNEL); + GFP_NOFS); req->tc->capacity = c->msize; req->rc = kmalloc(sizeof(struct p9_fcall)+c->msize, - GFP_KERNEL); + GFP_NOFS); req->rc->capacity = c->msize; } if ((!req->tc) || (!req->rc)) { diff --git a/net/9p/protocol.c b/net/9p/protocol.c index 2ce515b859b3..8a4084fa8b5a 100644 --- a/net/9p/protocol.c +++ b/net/9p/protocol.c @@ -205,7 +205,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt, if (errcode) break; - *sptr = kmalloc(len + 1, GFP_KERNEL); + *sptr = kmalloc(len + 1, GFP_NOFS); if (*sptr == NULL) { errcode = -EFAULT; break; @@ -273,7 +273,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt, if (!errcode) { *wnames = kmalloc(sizeof(char *) * *nwname, - GFP_KERNEL); + GFP_NOFS); if (!*wnames) errcode = -ENOMEM; } @@ -317,7 +317,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt, *wqids = kmalloc(*nwqid * sizeof(struct p9_qid), - GFP_KERNEL); + GFP_NOFS); if (*wqids == NULL) errcode = -ENOMEM; } diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index a30471e51740..aa5672b15eae 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -350,7 +350,7 @@ static void p9_read_work(struct work_struct *work) if (m->req->rc == NULL) { m->req->rc = kmalloc(sizeof(struct p9_fcall) + - m->client->msize, GFP_KERNEL); + m->client->msize, GFP_NOFS); if (!m->req->rc) { m->req = NULL; err = -ENOMEM; diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 29a54ccd213d..150e0c4bbf40 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -424,7 +424,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) struct p9_rdma_context *rpl_context = NULL; /* Allocate an fcall for the reply */ - rpl_context = kmalloc(sizeof *rpl_context, GFP_KERNEL); + rpl_context = kmalloc(sizeof *rpl_context, GFP_NOFS); if (!rpl_context) { err = -ENOMEM; goto err_close; @@ -437,7 +437,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) */ if (!req->rc) { req->rc = kmalloc(sizeof(struct p9_fcall)+client->msize, - GFP_KERNEL); + GFP_NOFS); if (req->rc) { req->rc->sdata = (char *) req->rc + sizeof(struct p9_fcall); @@ -468,7 +468,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) req->rc = NULL; /* Post the request */ - c = kmalloc(sizeof *c, GFP_KERNEL); + c = kmalloc(sizeof *c, GFP_NOFS); if (!c) { err = -ENOMEM; goto err_free1; diff --git a/net/9p/util.c b/net/9p/util.c index e048701a72d2..b84619b5ba22 100644 --- a/net/9p/util.c +++ b/net/9p/util.c @@ -92,7 +92,7 @@ int p9_idpool_get(struct p9_idpool *p) unsigned long flags; retry: - if (idr_pre_get(&p->pool, GFP_KERNEL) == 0) + if (idr_pre_get(&p->pool, GFP_NOFS) == 0) return 0; spin_lock_irqsave(&p->lock, flags); -- cgit From 472e7f9f8b547605ee9670ac803e971c2e3eeac0 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 8 Mar 2011 16:39:47 +0530 Subject: net/9p: Fix compile warning Signed-off-by: Aneesh Kumar K.V Signed-off-by: Venkateswararao Jujjuri Signed-off-by: Eric Van Hensbergen --- net/9p/trans_common.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/9p/trans_common.c b/net/9p/trans_common.c index d62b9aa58df8..9172ab78fcb0 100644 --- a/net/9p/trans_common.c +++ b/net/9p/trans_common.c @@ -41,9 +41,9 @@ EXPORT_SYMBOL(p9_release_req_pages); int p9_nr_pages(struct p9_req_t *req) { - int start_page, end_page; - start_page = (unsigned long long)req->tc->pubuf >> PAGE_SHIFT; - end_page = ((unsigned long long)req->tc->pubuf + req->tc->pbuf_size + + unsigned long start_page, end_page; + start_page = (unsigned long)req->tc->pubuf >> PAGE_SHIFT; + end_page = ((unsigned long)req->tc->pubuf + req->tc->pbuf_size + PAGE_SIZE - 1) >> PAGE_SHIFT; return end_page - start_page; } @@ -69,8 +69,8 @@ p9_payload_gup(struct p9_req_t *req, size_t *pdata_off, int *pdata_len, *pdata_off = (size_t)req->tc->pubuf & (PAGE_SIZE-1); if (*pdata_off) - first_page_bytes = min((PAGE_SIZE - *pdata_off), - req->tc->pbuf_size); + first_page_bytes = min(((size_t)PAGE_SIZE - *pdata_off), + req->tc->pbuf_size); rpinfo = req->tc->private; pdata_mapped_pages = get_user_pages_fast((unsigned long)req->tc->pubuf, -- cgit From 53bda3e5b4e91763224ecb7d05dab94d281fd41d Mon Sep 17 00:00:00 2001 From: "Venkateswararao Jujjuri (JV)" Date: Tue, 8 Mar 2011 15:34:20 -0800 Subject: [net/9p] unconditional wake_up to proc waiting for space on VirtIO ring Process may wait to get space on VirtIO ring to send a transaction to VirtFS server. Current code just does a conditional wake_up() which means only one process will be woken up even if multiple processes are waiting. This fix makes the wake_up unconditional. Hence we won't have any processes waiting for-ever. Signed-off-by: Venkateswararao Jujjuri Signed-off-by: Eric Van Hensbergen --- net/9p/trans_virtio.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 9b550ed9c711..961e025957ae 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -146,11 +146,10 @@ static void req_done(struct virtqueue *vq) rc = virtqueue_get_buf(chan->vq, &len); if (rc != NULL) { - if (!chan->ring_bufs_avail) { - chan->ring_bufs_avail = 1; - wake_up(chan->vc_wq); - } + chan->ring_bufs_avail = 1; spin_unlock_irqrestore(&chan->lock, flags); + /* Wakeup if anyone waiting for VirtIO ring space. */ + wake_up(chan->vc_wq); P9_DPRINTK(P9_DEBUG_TRANS, ": rc %p\n", rc); P9_DPRINTK(P9_DEBUG_TRANS, ": lookup tag %d\n", rc->tag); -- cgit From a01a984035ea799b14aa5e874dcaeb122f09c4b4 Mon Sep 17 00:00:00 2001 From: "Venkateswararao Jujjuri (JV)" Date: Mon, 14 Mar 2011 14:12:49 -0700 Subject: [net/9p] Set the condition just before waking up. Given that the sprious wake-ups are common, we need to move the condition setting right next to the wake_up(). After setting the condition to req->status = REQ_STATUS_RCVD, sprious wakeups may cause the virtqueue back on the free list for someone else to use. This may result in kernel panic while relasing the pinned pages in p9_release_req_pages(). Also rearranged the while loop in req_done() for better redability. Signed-off-by: Venkateswararao Jujjuri Signed-off-by: Eric Van Hensbergen --- net/9p/trans_virtio.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 961e025957ae..cb98af9367ab 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -141,33 +141,33 @@ static void req_done(struct virtqueue *vq) P9_DPRINTK(P9_DEBUG_TRANS, ": request done\n"); - do { + while (1) { spin_lock_irqsave(&chan->lock, flags); rc = virtqueue_get_buf(chan->vq, &len); - if (rc != NULL) { - chan->ring_bufs_avail = 1; - spin_unlock_irqrestore(&chan->lock, flags); - /* Wakeup if anyone waiting for VirtIO ring space. */ - wake_up(chan->vc_wq); - P9_DPRINTK(P9_DEBUG_TRANS, ": rc %p\n", rc); - P9_DPRINTK(P9_DEBUG_TRANS, ": lookup tag %d\n", - rc->tag); - req = p9_tag_lookup(chan->client, rc->tag); - req->status = REQ_STATUS_RCVD; - if (req->tc->private) { - struct trans_rpage_info *rp = req->tc->private; - /*Release pages */ - p9_release_req_pages(rp); - if (rp->rp_alloc) - kfree(rp); - req->tc->private = NULL; - } - p9_client_cb(chan->client, req); - } else { + if (rc == NULL) { spin_unlock_irqrestore(&chan->lock, flags); + break; } - } while (rc != NULL); + + chan->ring_bufs_avail = 1; + spin_unlock_irqrestore(&chan->lock, flags); + /* Wakeup if anyone waiting for VirtIO ring space. */ + wake_up(chan->vc_wq); + P9_DPRINTK(P9_DEBUG_TRANS, ": rc %p\n", rc); + P9_DPRINTK(P9_DEBUG_TRANS, ": lookup tag %d\n", rc->tag); + req = p9_tag_lookup(chan->client, rc->tag); + if (req->tc->private) { + struct trans_rpage_info *rp = req->tc->private; + /*Release pages */ + p9_release_req_pages(rp); + if (rp->rp_alloc) + kfree(rp); + req->tc->private = NULL; + } + req->status = REQ_STATUS_RCVD; + p9_client_cb(chan->client, req); + } } /** -- cgit From 316ad5501c2098cb2a2a25ed77a0421f1671411c Mon Sep 17 00:00:00 2001 From: "Venkateswararao Jujjuri (JV)" Date: Mon, 14 Mar 2011 14:22:41 -0700 Subject: [net/9p] Don't re-pin pages on retrying virtqueue_add_buf(). Signed-off-by: Venkateswararao Jujjuri Signed-off-by: Eric Van Hensbergen --- net/9p/trans_virtio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index cb98af9367ab..c6e1ae2fb926 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -262,7 +262,6 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req) P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio request\n"); -req_retry: req->status = REQ_STATUS_SENT; if (req->tc->pbuf_size && (req->tc->pubuf && P9_IS_USER_CONTEXT)) { @@ -295,6 +294,7 @@ req_retry: } } +req_retry_pinned: spin_lock_irqsave(&chan->lock, flags); /* Handle out VirtIO ring buffers */ @@ -355,7 +355,7 @@ req_retry: return err; P9_DPRINTK(P9_DEBUG_TRANS, "9p:Retry virtio request\n"); - goto req_retry; + goto req_retry_pinned; } else { spin_unlock_irqrestore(&chan->lock, flags); P9_DPRINTK(P9_DEBUG_TRANS, -- cgit From 68da9ba4eeadae86ad42e52b80822fbd56971267 Mon Sep 17 00:00:00 2001 From: "Venkateswararao Jujjuri (JV)" Date: Fri, 18 Mar 2011 15:49:48 -0700 Subject: [net/9p]: Introduce basic flow-control for VirtIO transport. Recent zerocopy work in the 9P VirtIO transport maps and pins user buffers into kernel memory for the server to work on them. Since the user process can initiate this kind of pinning with a simple read/write call, thousands of IO threads initiated by the user process can hog the system resources and could result into denial of service. This patch introduces flow control to avoid that extreme scenario. The ceiling limit to avoid denial of service attacks is set to relatively high (nr_free_pagecache_pages()/4) so that it won't interfere with regular usage, but can step in extreme cases to limit the total system hang. Since we don't have a global structure to accommodate this variable, I choose the virtio_chan as the home for this. Signed-off-by: Venkateswararao Jujjuri Reviewed-by: Badari Pulavarty Signed-off-by: Eric Van Hensbergen --- net/9p/trans_virtio.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index c6e1ae2fb926..e8f046b07182 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include "trans_common.h" @@ -51,6 +52,8 @@ /* a single mutex to manage channel initialization and attachment */ static DEFINE_MUTEX(virtio_9p_lock); +static DECLARE_WAIT_QUEUE_HEAD(vp_wq); +static atomic_t vp_pinned = ATOMIC_INIT(0); /** * struct virtio_chan - per-instance transport information @@ -78,7 +81,10 @@ struct virtio_chan { struct virtqueue *vq; int ring_bufs_avail; wait_queue_head_t *vc_wq; - + /* This is global limit. Since we don't have a global structure, + * will be placing it in each channel. + */ + int p9_max_pages; /* Scatterlist: can be too big for stack. */ struct scatterlist sg[VIRTQUEUE_NUM]; @@ -159,8 +165,11 @@ static void req_done(struct virtqueue *vq) req = p9_tag_lookup(chan->client, rc->tag); if (req->tc->private) { struct trans_rpage_info *rp = req->tc->private; + int p = rp->rp_nr_pages; /*Release pages */ p9_release_req_pages(rp); + atomic_sub(p, &vp_pinned); + wake_up(&vp_wq); if (rp->rp_alloc) kfree(rp); req->tc->private = NULL; @@ -269,6 +278,14 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req) int rpinfo_size = sizeof(struct trans_rpage_info) + sizeof(struct page *) * nr_pages; + if (atomic_read(&vp_pinned) >= chan->p9_max_pages) { + err = wait_event_interruptible(vp_wq, + atomic_read(&vp_pinned) < chan->p9_max_pages); + if (err == -ERESTARTSYS) + return err; + P9_DPRINTK(P9_DEBUG_TRANS, "9p: May gup pages now.\n"); + } + if (rpinfo_size <= (req->tc->capacity - req->tc->size)) { /* We can use sdata */ req->tc->private = req->tc->sdata + req->tc->size; @@ -291,6 +308,8 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req) if (rpinfo->rp_alloc) kfree(rpinfo); return err; + } else { + atomic_add(rpinfo->rp_nr_pages, &vp_pinned); } } @@ -452,6 +471,8 @@ static int p9_virtio_probe(struct virtio_device *vdev) } init_waitqueue_head(chan->vc_wq); chan->ring_bufs_avail = 1; + /* Ceiling limit to avoid denial of service attacks */ + chan->p9_max_pages = nr_free_buffer_pages()/4; mutex_lock(&virtio_9p_lock); list_add_tail(&chan->chan_list, &virtio_chan_list); -- cgit From 246408dcd5dfeef2df437ccb0ef4d6ee87805f58 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 22 Mar 2011 18:40:10 -0400 Subject: SUNRPC: Never reuse the socket port after an xs_close() If we call xs_close(), we're in one of two situations: - Autoclose, which means we don't expect to resend a request - bind+connect failed, which probably means the port is in use Signed-off-by: Trond Myklebust Cc: stable@kernel.org --- net/sunrpc/xprtsock.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index be96d429b475..1e336a06d3e6 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -710,6 +710,8 @@ static void xs_reset_transport(struct sock_xprt *transport) if (sk == NULL) return; + transport->srcport = 0; + write_lock_bh(&sk->sk_callback_lock); transport->inet = NULL; transport->sock = NULL; -- cgit From 94dcf29a11b3d20a28790598d701f98484a969da Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Mar 2011 16:30:45 -0700 Subject: kthread: use kthread_create_on_node() ksoftirqd, kworker, migration, and pktgend kthreads can be created with kthread_create_on_node(), to get proper NUMA affinities for their stack and task_struct. Signed-off-by: Eric Dumazet Acked-by: David S. Miller Reviewed-by: Andi Kleen Acked-by: Rusty Russell Acked-by: Tejun Heo Cc: Tony Luck Cc: Fenghua Yu Cc: David Howells Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/core/pktgen.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 0c55eaa70e39..aeeece72b72f 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3761,7 +3761,10 @@ static int __init pktgen_create_thread(int cpu) list_add_tail(&t->th_list, &pktgen_threads); init_completion(&t->start_done); - p = kthread_create(pktgen_thread_worker, t, "kpktgend_%d", cpu); + p = kthread_create_on_node(pktgen_thread_worker, + t, + cpu_to_node(cpu), + "kpktgend_%d", cpu); if (IS_ERR(p)) { pr_err("kernel_thread() failed for cpu %d\n", t->cpu); list_del(&t->th_list); -- cgit From 9c7a4f9ce651383c73dfdff3d7e21d5f9572c4ec Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 22 Mar 2011 19:17:36 -0700 Subject: ipv6: ip6_route_output does not modify sk parameter, so make it const This avoids explicit cast to avoid 'discards qualifiers' compiler warning in a netfilter patch that i've been working on. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv6/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6814c8722fa7..843406f14d7b 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -854,7 +854,7 @@ static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags); } -struct dst_entry * ip6_route_output(struct net *net, struct sock *sk, +struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk, struct flowi6 *fl6) { int flags = 0; -- cgit From a7bff75b087e7a355838a32efe61707cfa73c194 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Tue, 22 Mar 2011 11:40:32 +0000 Subject: bridge: Fix possibly wrong MLD queries' ethernet source address MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ipv6_dev_get_saddr() is currently called with an uninitialized destination address. Although in tests it usually seemed to nevertheless always fetch the right source address, there seems to be a possible race condition. Therefore this commit changes this, first setting the destination address and only after that fetching the source address. Reported-by: Jan Beulich Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 030a002ff8ee..f61eb2eff3fd 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -445,9 +445,9 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, ip6h->payload_len = htons(8 + sizeof(*mldq)); ip6h->nexthdr = IPPROTO_HOPOPTS; ip6h->hop_limit = 1; + ipv6_addr_set(&ip6h->daddr, htonl(0xff020000), 0, 0, htonl(1)); ipv6_dev_get_saddr(dev_net(br->dev), br->dev, &ip6h->daddr, 0, &ip6h->saddr); - ipv6_addr_set(&ip6h->daddr, htonl(0xff020000), 0, 0, htonl(1)); ipv6_eth_mc_map(&ip6h->daddr, eth->h_dest); hopopt = (u8 *)(ip6h + 1); -- cgit From 67d4120a1793138bc9f4a6eb61d0fc5298ed97e0 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 14 Mar 2011 10:57:03 +0000 Subject: tcp: avoid cwnd moderation in undo In the current undo logic, cwnd is moderated after it was restored to the value prior entering fast-recovery. It was moderated first in tcp_try_undo_recovery then again in tcp_complete_cwr. Since the undo indicates recovery was false, these moderations are not necessary. If the undo is triggered when most of the outstanding data have been acknowledged, the (restored) cwnd is falsely pulled down to a small value. This patch removes these cwnd moderations if cwnd is undone a) during fast-recovery b) by receiving DSACKs past fast-recovery Signed-off-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index da782e7ab16d..e72af7e31d1c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2659,7 +2659,7 @@ static void DBGUNDO(struct sock *sk, const char *msg) #define DBGUNDO(x...) do { } while (0) #endif -static void tcp_undo_cwr(struct sock *sk, const int undo) +static void tcp_undo_cwr(struct sock *sk, const int undo_ssthresh) { struct tcp_sock *tp = tcp_sk(sk); @@ -2671,14 +2671,13 @@ static void tcp_undo_cwr(struct sock *sk, const int undo) else tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1); - if (undo && tp->prior_ssthresh > tp->snd_ssthresh) { + if (undo_ssthresh && tp->prior_ssthresh > tp->snd_ssthresh) { tp->snd_ssthresh = tp->prior_ssthresh; TCP_ECN_withdraw_cwr(tp); } } else { tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); } - tcp_moderate_cwnd(tp); tp->snd_cwnd_stamp = tcp_time_stamp; } @@ -2822,8 +2821,11 @@ static int tcp_try_undo_loss(struct sock *sk) static inline void tcp_complete_cwr(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); - tp->snd_cwnd_stamp = tcp_time_stamp; + /* Do not moderate cwnd if it's already undone in cwr or recovery */ + if (tp->undo_marker && tp->snd_cwnd > tp->snd_ssthresh) { + tp->snd_cwnd = tp->snd_ssthresh; + tp->snd_cwnd_stamp = tcp_time_stamp; + } tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); } -- cgit From f6152737a95bd6c22f0c664b20831aefd48085a8 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 22 Mar 2011 19:37:11 -0700 Subject: tcp: Make undo_ssthresh arg to tcp_undo_cwr() a bool. Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e72af7e31d1c..bef9f04c22ba 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2659,7 +2659,7 @@ static void DBGUNDO(struct sock *sk, const char *msg) #define DBGUNDO(x...) do { } while (0) #endif -static void tcp_undo_cwr(struct sock *sk, const int undo_ssthresh) +static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh) { struct tcp_sock *tp = tcp_sk(sk); @@ -2698,7 +2698,7 @@ static int tcp_try_undo_recovery(struct sock *sk) * or our original transmission succeeded. */ DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); - tcp_undo_cwr(sk, 1); + tcp_undo_cwr(sk, true); if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) mib_idx = LINUX_MIB_TCPLOSSUNDO; else @@ -2725,7 +2725,7 @@ static void tcp_try_undo_dsack(struct sock *sk) if (tp->undo_marker && !tp->undo_retrans) { DBGUNDO(sk, "D-SACK"); - tcp_undo_cwr(sk, 1); + tcp_undo_cwr(sk, true); tp->undo_marker = 0; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); } @@ -2778,7 +2778,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked) tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); DBGUNDO(sk, "Hoe"); - tcp_undo_cwr(sk, 0); + tcp_undo_cwr(sk, false); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); /* So... Do not make Hoe's retransmit yet. @@ -2807,7 +2807,7 @@ static int tcp_try_undo_loss(struct sock *sk) DBGUNDO(sk, "partial loss"); tp->lost_out = 0; - tcp_undo_cwr(sk, 1); + tcp_undo_cwr(sk, true); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); inet_csk(sk)->icsk_retransmits = 0; tp->undo_marker = 0; @@ -3496,7 +3496,7 @@ static void tcp_undo_spur_to_response(struct sock *sk, int flag) if (flag & FLAG_ECE) tcp_ratehalving_spur_to_response(sk); else - tcp_undo_cwr(sk, 1); + tcp_undo_cwr(sk, true); } /* F-RTO spurious RTO detection algorithm (RFC4138) -- cgit From 406b6f974dae76a5b795d5c251d11c979a4e509b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 22 Mar 2011 21:56:23 -0700 Subject: ipv4: Fallback to FIB local table in __ip_dev_find(). In commit 9435eb1cf0b76b323019cebf8d16762a50a12a19 ("ipv4: Implement __ip_dev_find using new interface address hash.") we reimplemented __ip_dev_find() so that it doesn't have to do a full FIB table lookup. Instead, it consults a hash table of addresses configured to interfaces. This works identically to the old code in all except one case, and that is for loopback subnets. The old code would match the loopback device for any IP address that falls within a subnet configured to the loopback device. Handle this corner case by doing the FIB lookup. We could implement this via inet_addr_onlink() but: 1) Someone could configure many addresses to loopback and inet_addr_onlink() is a simple list traversal. 2) We know the old code works. Reported-by: Julian Anastasov Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'net') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index d5a4553bebc3..5345b0bee6df 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -64,6 +64,8 @@ #include #include +#include "fib_lookup.h" + static struct ipv4_devconf ipv4_devconf = { .data = { [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, @@ -151,6 +153,20 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) break; } } + if (!result) { + struct flowi4 fl4 = { .daddr = addr }; + struct fib_result res = { 0 }; + struct fib_table *local; + + /* Fallback to FIB local table so that communication + * over loopback subnets work. + */ + local = fib_get_table(net, RT_TABLE_LOCAL); + if (local && + !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) && + res.type == RTN_LOCAL) + result = FIB_RES_DEV(res); + } if (result && devref) dev_hold(result); rcu_read_unlock(); -- cgit From eb49a97363f020c1d7eef8bcd93865726b1fa11d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 23 Mar 2011 12:18:15 -0700 Subject: ipv4: fix ip_rt_update_pmtu() commit 2c8cec5c10bc (Cache learned PMTU information in inetpeer) added an extra inet_putpeer() call in ip_rt_update_pmtu(). This results in various problems, since we can free one inetpeer, while it is still in use. Ref: http://www.spinics.net/lists/netdev/msg159121.html Reported-by: Alexander Beregalov Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 870b5182ddd8..34921b0d3f8e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1593,8 +1593,6 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) rt->rt_peer_genid = rt_peer_genid(); } check_peer_pmtu(dst, peer); - - inet_putpeer(peer); } } -- cgit From 12ce22423abacca70bf1dfbcb8543b3e2b74aad4 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 23 Mar 2011 16:41:45 -0700 Subject: rds: stop including asm-generic/bitops/le.h directly asm-generic/bitops/le.h is only intended to be included directly from asm-generic/bitops/ext2-non-atomic.h or asm-generic/bitops/minix-le.h which implements generic ext2 or minix bit operations. This stops including asm-generic/bitops/le.h directly and use ext2 non-atomic bit operations instead. It seems odd to use ext2_*_bit() on rds, but it will replaced with __{set,clear,test}_bit_le() after introducing little endian bit operations for all architectures. This indirect step is necessary to maintain bisectability for some architectures which have their own little-endian bit operations. Signed-off-by: Akinobu Mita Cc: Andy Grover Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/rds/cong.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/rds/cong.c b/net/rds/cong.c index 75ea686f27d5..8cc322bfd092 100644 --- a/net/rds/cong.c +++ b/net/rds/cong.c @@ -33,8 +33,7 @@ #include #include #include - -#include +#include #include "rds.h" @@ -285,7 +284,7 @@ void rds_cong_set_bit(struct rds_cong_map *map, __be16 port) i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; - generic___set_le_bit(off, (void *)map->m_page_addrs[i]); + ext2_set_bit(off, (void *)map->m_page_addrs[i]); } void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port) @@ -299,7 +298,7 @@ void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port) i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; - generic___clear_le_bit(off, (void *)map->m_page_addrs[i]); + ext2_clear_bit(off, (void *)map->m_page_addrs[i]); } static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port) @@ -310,7 +309,7 @@ static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port) i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; - return generic_test_le_bit(off, (void *)map->m_page_addrs[i]); + return ext2_test_bit(off, (void *)map->m_page_addrs[i]); } void rds_cong_add_socket(struct rds_sock *rs) -- cgit From e1dc1c81b9d1c823f2a529b9b9cf8bf5dacbce6a Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 23 Mar 2011 16:42:05 -0700 Subject: rds: use little-endian bitops As a preparation for removing ext2 non-atomic bit operations from asm/bitops.h. This converts ext2 non-atomic bit operations to little-endian bit operations. Signed-off-by: Akinobu Mita Cc: Andy Grover Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/rds/cong.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rds/cong.c b/net/rds/cong.c index 8cc322bfd092..6daaa49d133f 100644 --- a/net/rds/cong.c +++ b/net/rds/cong.c @@ -284,7 +284,7 @@ void rds_cong_set_bit(struct rds_cong_map *map, __be16 port) i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; - ext2_set_bit(off, (void *)map->m_page_addrs[i]); + __set_bit_le(off, (void *)map->m_page_addrs[i]); } void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port) @@ -298,7 +298,7 @@ void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port) i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; - ext2_clear_bit(off, (void *)map->m_page_addrs[i]); + __clear_bit_le(off, (void *)map->m_page_addrs[i]); } static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port) @@ -309,7 +309,7 @@ static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port) i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; - return ext2_test_bit(off, (void *)map->m_page_addrs[i]); + return test_bit_le(off, (void *)map->m_page_addrs[i]); } void rds_cong_add_socket(struct rds_sock *rs) -- cgit From 7ebb931598cd95cccea10d4bc4c0123a464ea565 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Thu, 24 Mar 2011 17:12:30 +0000 Subject: NFS: use secinfo when crossing mountpoints A submount may use different security than the parent mount does. We should figure out what sec flavor the submount uses at mount time. Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/gss_mech_switch.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'net') diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 8b4061049d76..6c844b01a1d1 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -160,6 +160,28 @@ gss_mech_get_by_name(const char *name) EXPORT_SYMBOL_GPL(gss_mech_get_by_name); +struct gss_api_mech * +gss_mech_get_by_OID(struct xdr_netobj *obj) +{ + struct gss_api_mech *pos, *gm = NULL; + + spin_lock(®istered_mechs_lock); + list_for_each_entry(pos, ®istered_mechs, gm_list) { + if (obj->len == pos->gm_oid.len) { + if (0 == memcmp(obj->data, pos->gm_oid.data, obj->len)) { + if (try_module_get(pos->gm_owner)) + gm = pos; + break; + } + } + } + spin_unlock(®istered_mechs_lock); + return gm; + +} + +EXPORT_SYMBOL_GPL(gss_mech_get_by_OID); + static inline int mech_supports_pseudoflavor(struct gss_api_mech *gm, u32 pseudoflavor) { -- cgit From 8f70e95f9f4159184f557a1db60c909d7c1bd2e3 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Thu, 24 Mar 2011 17:12:31 +0000 Subject: NFS: Determine initial mount security When sec= is not presented as a mount option, we should attempt to determine what security flavor the server is using. Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/gss_mech_switch.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'net') diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 6c844b01a1d1..e3c36a274412 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -215,6 +215,22 @@ gss_mech_get_by_pseudoflavor(u32 pseudoflavor) EXPORT_SYMBOL_GPL(gss_mech_get_by_pseudoflavor); +int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr) +{ + struct gss_api_mech *pos = NULL; + int i = 0; + + spin_lock(®istered_mechs_lock); + list_for_each_entry(pos, ®istered_mechs, gm_list) { + array_ptr[i] = pos->gm_pfs->pseudoflavor; + i++; + } + spin_unlock(®istered_mechs_lock); + return i; +} + +EXPORT_SYMBOL_GPL(gss_mech_list_pseudoflavors); + u32 gss_svc_to_pseudoflavor(struct gss_api_mech *gm, u32 service) { -- cgit From fcd13f42c9d6ab7b1024b9b7125a2e8db3cc00b2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 24 Mar 2011 07:01:24 +0000 Subject: ipv4: fix fib metrics Alessandro Suardi reported that we could not change route metrics : ip ro change default .... advmss 1400 This regression came with commit 9c150e82ac50 (Allocate fib metrics dynamically). fib_metrics is no longer an array, but a pointer to an array. Reported-by: Alessandro Suardi Signed-off-by: Eric Dumazet Tested-by: Alessandro Suardi Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 622ac4c95026..75b9fb5d5d37 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -251,7 +251,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) nfi->fib_prefsrc == fi->fib_prefsrc && nfi->fib_priority == fi->fib_priority && memcmp(nfi->fib_metrics, fi->fib_metrics, - sizeof(fi->fib_metrics)) == 0 && + sizeof(u32) * RTAX_MAX) == 0 && ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 && (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) return fi; -- cgit From 52cb1c0d208565d8f06b743cdc6596d744f92e3b Mon Sep 17 00:00:00 2001 From: Suraj Sumangala Date: Wed, 9 Mar 2011 14:44:05 +0530 Subject: Bluetooth: Increment unacked_frames count only the first transmit This patch lets 'l2cap_pinfo.unacked_frames' be incremented only the first time a frame is transmitted. Previously it was being incremented for retransmitted packets too resulting the value to cross the transmit window size. Signed-off-by: Suraj Sumangala Signed-off-by: Gustavo F. Padovan --- net/bluetooth/l2cap_core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index c9f9cecca527..ca27f3a41536 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1116,7 +1116,9 @@ int l2cap_ertm_send(struct sock *sk) bt_cb(skb)->tx_seq = pi->next_tx_seq; pi->next_tx_seq = (pi->next_tx_seq + 1) % 64; - pi->unacked_frames++; + if (bt_cb(skb)->retries == 1) + pi->unacked_frames++; + pi->frames_sent++; if (skb_queue_is_last(TX_QUEUE(sk), skb)) -- cgit From f630cf0d5434e3923e1b8226ffa2753ead6b0ce5 Mon Sep 17 00:00:00 2001 From: "Gustavo F. Padovan" Date: Wed, 16 Mar 2011 15:36:29 -0300 Subject: Bluetooth: Fix HCI_RESET command synchronization We can't send new commands before a cmd_complete for the HCI_RESET command shows up. Reported-by: Mikko Vinni Reported-by: Justin P. Mattock Reported-by: Ed Tomlinson Signed-off-by: Gustavo F. Padovan Tested-by: Justin P. Mattock Tested-by: Mikko Vinni Tested-by: Ed Tomlinson --- net/bluetooth/hci_core.c | 6 +++++- net/bluetooth/hci_event.c | 4 +++- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index b372fb8bcdcf..92b48e257b89 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -186,6 +186,7 @@ static void hci_reset_req(struct hci_dev *hdev, unsigned long opt) BT_DBG("%s %ld", hdev->name, opt); /* Reset device */ + set_bit(HCI_RESET, &hdev->flags); hci_send_cmd(hdev, HCI_OP_RESET, 0, NULL); } @@ -213,8 +214,10 @@ static void hci_init_req(struct hci_dev *hdev, unsigned long opt) /* Mandatory initialization */ /* Reset */ - if (!test_bit(HCI_QUIRK_NO_RESET, &hdev->quirks)) + if (!test_bit(HCI_QUIRK_NO_RESET, &hdev->quirks)) { + set_bit(HCI_RESET, &hdev->flags); hci_send_cmd(hdev, HCI_OP_RESET, 0, NULL); + } /* Read Local Supported Features */ hci_send_cmd(hdev, HCI_OP_READ_LOCAL_FEATURES, 0, NULL); @@ -1074,6 +1077,7 @@ static void hci_cmd_timer(unsigned long arg) BT_ERR("%s command tx timeout", hdev->name); atomic_set(&hdev->cmd_cnt, 1); + clear_bit(HCI_RESET, &hdev->flags); tasklet_schedule(&hdev->cmd_task); } diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 3fbfa50c2bff..cebe7588469f 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -183,6 +183,8 @@ static void hci_cc_reset(struct hci_dev *hdev, struct sk_buff *skb) BT_DBG("%s status 0x%x", hdev->name, status); + clear_bit(HCI_RESET, &hdev->flags); + hci_req_complete(hdev, HCI_OP_RESET, status); } @@ -1847,7 +1849,7 @@ static inline void hci_cmd_status_evt(struct hci_dev *hdev, struct sk_buff *skb) if (ev->opcode != HCI_OP_NOP) del_timer(&hdev->cmd_timer); - if (ev->ncmd) { + if (ev->ncmd && !test_bit(HCI_RESET, &hdev->flags)) { atomic_set(&hdev->cmd_cnt, 1); if (!skb_queue_empty(&hdev->cmd_q)) tasklet_schedule(&hdev->cmd_task); -- cgit From 6994ca5e8ade57d18b7d1e05aad040c441a2ad37 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Wed, 16 Mar 2011 14:29:34 +0200 Subject: Bluetooth: Fix missing hci_dev_lock_bh in user_confirm_reply The code was correctly calling _unlock at the end of the function but there was no actual _lock call anywhere. Signed-off-by: Johan Hedberg Signed-off-by: Gustavo F. Padovan --- net/bluetooth/mgmt.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 0054c74e27b7..4476d8e3c0f2 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -1230,6 +1230,8 @@ static int user_confirm_reply(struct sock *sk, u16 index, unsigned char *data, if (!hdev) return cmd_status(sk, index, mgmt_op, ENODEV); + hci_dev_lock_bh(hdev); + if (!test_bit(HCI_UP, &hdev->flags)) { err = cmd_status(sk, index, mgmt_op, ENETDOWN); goto failed; -- cgit From a0cc9a1b5712ea52aaa4e7abfa0ec2dbe0d820ff Mon Sep 17 00:00:00 2001 From: Andrei Emeltchenko Date: Thu, 24 Mar 2011 17:16:08 +0200 Subject: Bluetooth: delete hanging L2CAP channel Sometimes L2CAP connection remains hanging. Make sure that L2CAP channel is deleted. Signed-off-by: Andrei Emeltchenko Signed-off-by: Gustavo F. Padovan --- net/bluetooth/l2cap_sock.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index fc85e7ae33c7..f77308e63e58 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -923,8 +923,9 @@ void __l2cap_sock_close(struct sock *sk, int reason) rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO); l2cap_send_cmd(conn, l2cap_pi(sk)->ident, L2CAP_CONN_RSP, sizeof(rsp), &rsp); - } else - l2cap_chan_del(sk, reason); + } + + l2cap_chan_del(sk, reason); break; case BT_CONNECT: -- cgit From b77dcf8460ae57d4eb9fd3633eb4f97b8fb20716 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 24 Mar 2011 20:16:42 +0100 Subject: Bluetooth: Fix warning with hci_cmd_timer After we made debugobjects working again, we got the following: WARNING: at lib/debugobjects.c:262 debug_print_object+0x8e/0xb0() Hardware name: System Product Name ODEBUG: free active (active state 0) object type: timer_list hint: hci_cmd_timer+0x0/0x60 Pid: 2125, comm: dmsetup Tainted: G W 2.6.38-06707-gc62b389 #110375 Call Trace: [] warn_slowpath_common+0x7a/0xb0 [] warn_slowpath_fmt+0x46/0x50 [] debug_print_object+0x8e/0xb0 [] ? hci_cmd_timer+0x0/0x60 [] debug_check_no_obj_freed+0x125/0x230 [] ? check_object+0xb3/0x2b0 [] kfree+0x150/0x190 [] ? bt_host_release+0x16/0x20 [] bt_host_release+0x16/0x20 [] device_release+0x27/0xa0 [] kobject_release+0x4c/0xa0 [] ? kobject_release+0x0/0xa0 [] kref_put+0x36/0x70 [] kobject_put+0x27/0x60 [] put_device+0x17/0x20 [] hci_free_dev+0x29/0x30 [] vhci_release+0x36/0x70 [] fput+0xd6/0x1f0 [] filp_close+0x66/0x90 [] sys_close+0x99/0xf0 [] system_call_fastpath+0x16/0x1b That timer was introduced with commit 6bd32326cda(Bluetooth: Use proper timer for hci command timout) Timer seems to be running when the thing is closed. Removing the timer unconditionally fixes the problem. And yes, it needs to be fixed before the HCI_UP check. Signed-off-by: Thomas Gleixner Tested-by: Ingo Molnar Signed-off-by: Gustavo F. Padovan --- net/bluetooth/hci_core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 92b48e257b89..2216620ff296 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -587,6 +587,9 @@ static int hci_dev_do_close(struct hci_dev *hdev) hci_req_cancel(hdev, ENODEV); hci_req_lock(hdev); + /* Stop timer, it might be running */ + del_timer_sync(&hdev->cmd_timer); + if (!test_and_clear_bit(HCI_UP, &hdev->flags)) { hci_req_unlock(hdev); return 0; @@ -626,7 +629,6 @@ static int hci_dev_do_close(struct hci_dev *hdev) /* Drop last sent command */ if (hdev->sent_cmd) { - del_timer_sync(&hdev->cmd_timer); kfree_skb(hdev->sent_cmd); hdev->sent_cmd = NULL; } -- cgit From 436c3b66ec9824a633724ae42de1c416af4f2063 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 24 Mar 2011 17:42:21 -0700 Subject: ipv4: Invalidate nexthop cache nh_saddr more correctly. Any operation that: 1) Brings up an interface 2) Adds an IP address to an interface 3) Deletes an IP address from an interface can potentially invalidate the nh_saddr value, requiring it to be recomputed. Perform the recomputation lazily using a generation ID. Reported-by: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 11 +++++++---- net/ipv4/fib_semantics.c | 32 +++++++++++--------------------- net/ipv4/route.c | 6 ++++-- 3 files changed, 22 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 02c3ba61884a..f116ce8f1b46 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -228,7 +228,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, if (res.type != RTN_LOCAL || !accept_local) goto e_inval; } - *spec_dst = FIB_RES_PREFSRC(res); + *spec_dst = FIB_RES_PREFSRC(net, res); fib_combine_itag(itag, &res); dev_match = false; @@ -258,7 +258,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, ret = 0; if (fib_lookup(net, &fl4, &res) == 0) { if (res.type == RTN_UNICAST) { - *spec_dst = FIB_RES_PREFSRC(res); + *spec_dst = FIB_RES_PREFSRC(net, res); ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; } } @@ -960,6 +960,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, { struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; struct net_device *dev = ifa->ifa_dev->dev; + struct net *net = dev_net(dev); switch (event) { case NETDEV_UP: @@ -967,12 +968,12 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev); #endif - fib_update_nh_saddrs(dev); + atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(dev_net(dev), -1); break; case NETDEV_DOWN: fib_del_ifaddr(ifa, NULL); - fib_update_nh_saddrs(dev); + atomic_inc(&net->ipv4.dev_addr_genid); if (ifa->ifa_dev->ifa_list == NULL) { /* Last address was deleted from this interface. * Disable IP. @@ -990,6 +991,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo { struct net_device *dev = ptr; struct in_device *in_dev = __in_dev_get_rtnl(dev); + struct net *net = dev_net(dev); if (event == NETDEV_UNREGISTER) { fib_disable_ip(dev, 2, -1); @@ -1007,6 +1009,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev); #endif + atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(dev_net(dev), -1); break; case NETDEV_DOWN: diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 75b9fb5d5d37..2d4bebca671a 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -695,6 +695,16 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, fib_info_hash_free(old_laddrhash, bytes); } +__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh) +{ + nh->nh_saddr = inet_select_addr(nh->nh_dev, + nh->nh_gw, + nh->nh_cfg_scope); + nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid); + + return nh->nh_saddr; +} + struct fib_info *fib_create_info(struct fib_config *cfg) { int err; @@ -855,9 +865,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) change_nexthops(fi) { nexthop_nh->nh_cfg_scope = cfg->fc_scope; - nexthop_nh->nh_saddr = inet_select_addr(nexthop_nh->nh_dev, - nexthop_nh->nh_gw, - nexthop_nh->nh_cfg_scope); + fib_info_update_nh_saddr(net, nexthop_nh); } endfor_nexthops(fi) link_it: @@ -1128,24 +1136,6 @@ out: return; } -void fib_update_nh_saddrs(struct net_device *dev) -{ - struct hlist_head *head; - struct hlist_node *node; - struct fib_nh *nh; - unsigned int hash; - - hash = fib_devindex_hashfn(dev->ifindex); - head = &fib_info_devhash[hash]; - hlist_for_each_entry(nh, node, head, nh_hash) { - if (nh->nh_dev != dev) - continue; - nh->nh_saddr = inet_select_addr(nh->nh_dev, - nh->nh_gw, - nh->nh_cfg_scope); - } -} - #ifdef CONFIG_IP_ROUTE_MULTIPATH /* diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 34921b0d3f8e..4b0c81180804 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1718,7 +1718,7 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt) rcu_read_lock(); if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) - src = FIB_RES_PREFSRC(res); + src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); else src = inet_select_addr(rt->dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE); @@ -2615,7 +2615,7 @@ static struct rtable *ip_route_output_slow(struct net *net, fib_select_default(&res); if (!fl4.saddr) - fl4.saddr = FIB_RES_PREFSRC(res); + fl4.saddr = FIB_RES_PREFSRC(net, res); dev_out = FIB_RES_DEV(res); fl4.flowi4_oif = dev_out->ifindex; @@ -3219,6 +3219,8 @@ static __net_init int rt_genid_init(struct net *net) { get_random_bytes(&net->ipv4.rt_genid, sizeof(net->ipv4.rt_genid)); + get_random_bytes(&net->ipv4.dev_addr_genid, + sizeof(net->ipv4.dev_addr_genid)); return 0; } -- cgit From 37e826c513883099c298317bad1b3b677b2905fb Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 24 Mar 2011 18:06:47 -0700 Subject: ipv4: Fix nexthop caching wrt. scoping. Move the scope value out of the fib alias entries and into fib_info, so that we always use the correct scope when recomputing the nexthop cached source address. Reported-by: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv4/fib_lookup.h | 3 +-- net/ipv4/fib_semantics.c | 15 ++++++++------- net/ipv4/fib_trie.c | 12 ++++-------- 3 files changed, 13 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index 4ec323875a02..af0f14aba169 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -10,7 +10,6 @@ struct fib_alias { struct fib_info *fa_info; u8 fa_tos; u8 fa_type; - u8 fa_scope; u8 fa_state; struct rcu_head rcu; }; @@ -29,7 +28,7 @@ extern void fib_release_info(struct fib_info *); extern struct fib_info *fib_create_info(struct fib_config *cfg); extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, - u32 tb_id, u8 type, u8 scope, __be32 dst, + u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, unsigned int); extern void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 2d4bebca671a..641a5a2a9f9c 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -222,7 +222,7 @@ static inline unsigned int fib_info_hashfn(const struct fib_info *fi) unsigned int mask = (fib_info_hash_size - 1); unsigned int val = fi->fib_nhs; - val ^= fi->fib_protocol; + val ^= (fi->fib_protocol << 8) | fi->fib_scope; val ^= (__force u32)fi->fib_prefsrc; val ^= fi->fib_priority; for_nexthops(fi) { @@ -248,6 +248,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) if (fi->fib_nhs != nfi->fib_nhs) continue; if (nfi->fib_protocol == fi->fib_protocol && + nfi->fib_scope == fi->fib_scope && nfi->fib_prefsrc == fi->fib_prefsrc && nfi->fib_priority == fi->fib_priority && memcmp(nfi->fib_metrics, fi->fib_metrics, @@ -328,7 +329,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, goto errout; err = fib_dump_info(skb, info->pid, seq, event, tb_id, - fa->fa_type, fa->fa_scope, key, dst_len, + fa->fa_type, key, dst_len, fa->fa_tos, fa->fa_info, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ @@ -699,7 +700,7 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh) { nh->nh_saddr = inet_select_addr(nh->nh_dev, nh->nh_gw, - nh->nh_cfg_scope); + nh->nh_parent->fib_scope); nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid); return nh->nh_saddr; @@ -763,6 +764,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) fi->fib_net = hold_net(net); fi->fib_protocol = cfg->fc_protocol; + fi->fib_scope = cfg->fc_scope; fi->fib_flags = cfg->fc_flags; fi->fib_priority = cfg->fc_priority; fi->fib_prefsrc = cfg->fc_prefsrc; @@ -864,7 +866,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg) } change_nexthops(fi) { - nexthop_nh->nh_cfg_scope = cfg->fc_scope; fib_info_update_nh_saddr(net, nexthop_nh); } endfor_nexthops(fi) @@ -914,7 +915,7 @@ failure: } int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, - u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos, + u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, unsigned int flags) { struct nlmsghdr *nlh; @@ -936,7 +937,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, NLA_PUT_U32(skb, RTA_TABLE, tb_id); rtm->rtm_type = type; rtm->rtm_flags = fi->fib_flags; - rtm->rtm_scope = scope; + rtm->rtm_scope = fi->fib_scope; rtm->rtm_protocol = fi->fib_protocol; if (rtm->rtm_dst_len) @@ -1092,7 +1093,7 @@ void fib_select_default(struct fib_result *res) list_for_each_entry_rcu(fa, fa_head, fa_list) { struct fib_info *next_fi = fa->fa_info; - if (fa->fa_scope != res->scope || + if (next_fi->fib_scope != res->scope || fa->fa_type != RTN_UNICAST) continue; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index ac87a49ad50b..90a3ff605591 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1245,7 +1245,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) if (fa->fa_info->fib_priority != fi->fib_priority) break; if (fa->fa_type == cfg->fc_type && - fa->fa_scope == cfg->fc_scope && fa->fa_info == fi) { fa_match = fa; break; @@ -1271,7 +1270,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) new_fa->fa_tos = fa->fa_tos; new_fa->fa_info = fi; new_fa->fa_type = cfg->fc_type; - new_fa->fa_scope = cfg->fc_scope; state = fa->fa_state; new_fa->fa_state = state & ~FA_S_ACCESSED; @@ -1308,7 +1306,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) new_fa->fa_info = fi; new_fa->fa_tos = tos; new_fa->fa_type = cfg->fc_type; - new_fa->fa_scope = cfg->fc_scope; new_fa->fa_state = 0; /* * Insert new entry to the list. @@ -1362,7 +1359,7 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l, if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) continue; - if (fa->fa_scope < flp->flowi4_scope) + if (fa->fa_info->fib_scope < flp->flowi4_scope) continue; fib_alias_accessed(fa); err = fib_props[fa->fa_type].error; @@ -1388,7 +1385,7 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l, res->prefixlen = plen; res->nh_sel = nhsel; res->type = fa->fa_type; - res->scope = fa->fa_scope; + res->scope = fa->fa_info->fib_scope; res->fi = fi; res->table = tb; res->fa_head = &li->falh; @@ -1664,7 +1661,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) && (cfg->fc_scope == RT_SCOPE_NOWHERE || - fa->fa_scope == cfg->fc_scope) && + fa->fa_info->fib_scope == cfg->fc_scope) && (!cfg->fc_prefsrc || fi->fib_prefsrc == cfg->fc_prefsrc) && (!cfg->fc_protocol || @@ -1863,7 +1860,6 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, RTM_NEWROUTE, tb->tb_id, fa->fa_type, - fa->fa_scope, xkey, plen, fa->fa_tos, @@ -2384,7 +2380,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) seq_indent(seq, iter->depth+1); seq_printf(seq, " /%d %s %s", li->plen, rtn_scope(buf1, sizeof(buf1), - fa->fa_scope), + fa->fa_info->fib_scope), rtn_type(buf2, sizeof(buf2), fa->fa_type)); if (fa->fa_tos) -- cgit From 1fbc78439291627642517f15b9b91f3125588143 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Fri, 25 Mar 2011 20:33:23 -0700 Subject: ipv4: do not ignore route errors The "ipv4: Inline fib_semantic_match into check_leaf" change forgets to return the route errors. check_leaf should return the same results as fib_table_lookup. Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 90a3ff605591..b92c86f6e9b3 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1365,9 +1365,9 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l, err = fib_props[fa->fa_type].error; if (err) { #ifdef CONFIG_IP_FIB_TRIE_STATS - t->stats.semantic_match_miss++; + t->stats.semantic_match_passed++; #endif - return 1; + return err; } if (fi->fib_flags & RTNH_F_DEAD) continue; -- cgit From a271c5a0dea418931b6a903ef85adc30ad4c54be Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sun, 27 Mar 2011 17:48:57 +0200 Subject: NFS: Ensure that rpc_release_resources_task() can be called twice. BUG: atomic_dec_and_test(): -1: atomic counter underflow at: Pid: 2827, comm: mount.nfs Not tainted 2.6.38 #1 Call Trace: [] ? put_rpccred+0x44/0x14e [sunrpc] [] ? rpc_ping+0x4e/0x58 [sunrpc] [] ? rpc_create+0x481/0x4fc [sunrpc] [] ? rpcauth_lookup_credcache+0xab/0x22d [sunrpc] [] ? nfs_create_rpc_client+0xa6/0xeb [nfs] [] ? nfs4_set_client+0xc2/0x1f9 [nfs] [] ? nfs4_create_server+0xf2/0x2a6 [nfs] [] ? nfs4_remote_mount+0x4e/0x14a [nfs] [] ? vfs_kern_mount+0x6e/0x133 [] ? nfs_do_root_mount+0x76/0x95 [nfs] [] ? nfs4_try_mount+0x56/0xaf [nfs] [] ? nfs_get_sb+0x435/0x73c [nfs] [] ? vfs_kern_mount+0x99/0x133 [] ? do_kern_mount+0x48/0xd8 [] ? do_mount+0x6da/0x741 [] ? sys_mount+0x83/0xc0 [] ? system_call_fastpath+0x16/0x1b Well, so, I think this is real bug of nfs codes somewhere. With some review, the code rpc_call_sync() rpc_run_task rpc_execute() __rpc_execute() rpc_release_task() rpc_release_resources_task() put_rpccred() <= release cred rpc_put_task rpc_do_put_task() rpc_release_resources_task() put_rpccred() <= release cred again seems to be release cred unintendedly. Signed-off-by: OGAWA Hirofumi Signed-off-by: Trond Myklebust --- net/sunrpc/sched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index ffb687671da0..6b43ee7221d5 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -860,8 +860,10 @@ static void rpc_release_resources_task(struct rpc_task *task) { if (task->tk_rqstp) xprt_release(task); - if (task->tk_msg.rpc_cred) + if (task->tk_msg.rpc_cred) { put_rpccred(task->tk_msg.rpc_cred); + task->tk_msg.rpc_cred = NULL; + } rpc_task_release_client(task); } -- cgit From 3bc07321ccc236f693ce1b6a8786f0a2e38bb87e Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 15 Mar 2011 21:08:28 +0000 Subject: xfrm: Force a dst refcount before entering the xfrm type handlers Crypto requests might return asynchronous. In this case we leave the rcu protected region, so force a refcount on the skb's destination entry before we enter the xfrm type input/output handlers. This fixes a crash when a route is deleted whilst sending IPsec data that is transformed by an asynchronous algorithm. Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- net/xfrm/xfrm_input.c | 2 ++ net/xfrm/xfrm_output.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 872065ca7f8c..341cd1189f8a 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -190,6 +190,8 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) XFRM_SKB_CB(skb)->seq.input.low = seq; XFRM_SKB_CB(skb)->seq.input.hi = seq_hi; + skb_dst_force(skb); + nexthdr = x->type->input(x, skb); if (nexthdr == -EINPROGRESS) diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 1aba03f449cc..8f3f0eedc5a4 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -78,6 +78,8 @@ static int xfrm_output_one(struct sk_buff *skb, int err) spin_unlock_bh(&x->lock); + skb_dst_force(skb); + err = x->type->output(x, skb); if (err == -EINPROGRESS) goto out_exit; -- cgit From e433430a0ca9cc1b851a83ac3b305e955b64880a Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 15 Mar 2011 21:09:32 +0000 Subject: dst: Clone child entry in skb_dst_pop We clone the child entry in skb_dst_pop before we call skb_dst_drop(). Otherwise we might kill the child right before we return it to the caller. Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- net/xfrm/xfrm_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 8f3f0eedc5a4..47bacd8c0250 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -96,7 +96,7 @@ resume: err = -EHOSTUNREACH; goto error_nolock; } - skb_dst_set(skb, dst_clone(dst)); + skb_dst_set(skb, dst); x = dst->xfrm; } while (x && !(x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL)); -- cgit From d50e7e3604778bfc2dc40f440e0742dbae399d54 Mon Sep 17 00:00:00 2001 From: Dan Rosenberg Date: Sat, 19 Mar 2011 20:14:30 +0000 Subject: irda: prevent heap corruption on invalid nickname Invalid nicknames containing only spaces will result in an underflow in a memcpy size calculation, subsequently destroying the heap and panicking. v2 also catches the case where the provided nickname is longer than the buffer size, which can result in controllable heap corruption. Signed-off-by: Dan Rosenberg Cc: stable@kernel.org Signed-off-by: David S. Miller --- net/irda/irnet/irnet_ppp.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c index 7c567b8aa89a..2bb2beb6a373 100644 --- a/net/irda/irnet/irnet_ppp.c +++ b/net/irda/irnet/irnet_ppp.c @@ -105,6 +105,9 @@ irnet_ctrl_write(irnet_socket * ap, while(isspace(start[length - 1])) length--; + DABORT(length < 5 || length > NICKNAME_MAX_LEN + 5, + -EINVAL, CTRL_ERROR, "Invalid nickname.\n"); + /* Copy the name for later reuse */ memcpy(ap->rname, start + 5, length - 5); ap->rname[length - 5] = '\0'; -- cgit From d370af0ef7951188daeb15bae75db7ba57c67846 Mon Sep 17 00:00:00 2001 From: Dan Rosenberg Date: Sun, 20 Mar 2011 15:32:06 +0000 Subject: irda: validate peer name and attribute lengths Length fields provided by a peer for names and attributes may be longer than the destination array sizes. Validate lengths to prevent stack buffer overflows. Signed-off-by: Dan Rosenberg Cc: stable@kernel.org Signed-off-by: David S. Miller --- net/irda/iriap.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/irda/iriap.c b/net/irda/iriap.c index 5b743bdd89ba..36477538cea8 100644 --- a/net/irda/iriap.c +++ b/net/irda/iriap.c @@ -656,10 +656,16 @@ static void iriap_getvaluebyclass_indication(struct iriap_cb *self, n = 1; name_len = fp[n++]; + + IRDA_ASSERT(name_len < IAS_MAX_CLASSNAME + 1, return;); + memcpy(name, fp+n, name_len); n+=name_len; name[name_len] = '\0'; attr_len = fp[n++]; + + IRDA_ASSERT(attr_len < IAS_MAX_ATTRIBNAME + 1, return;); + memcpy(attr, fp+n, attr_len); n+=attr_len; attr[attr_len] = '\0'; -- cgit From be20250c13f88375345ad99950190685eda51eb8 Mon Sep 17 00:00:00 2001 From: Dan Rosenberg Date: Sat, 19 Mar 2011 20:43:43 +0000 Subject: ROSE: prevent heap corruption with bad facilities When parsing the FAC_NATIONAL_DIGIS facilities field, it's possible for a remote host to provide more digipeaters than expected, resulting in heap corruption. Check against ROSE_MAX_DIGIS to prevent overflows, and abort facilities parsing on failure. Additionally, when parsing the FAC_CCITT_DEST_NSAP and FAC_CCITT_SRC_NSAP facilities fields, a remote host can provide a length of less than 10, resulting in an underflow in a memcpy size, causing a kernel panic due to massive heap corruption. A length of greater than 20 results in a stack overflow of the callsign array. Abort facilities parsing on these invalid length values. Signed-off-by: Dan Rosenberg Cc: stable@kernel.org Signed-off-by: David S. Miller --- net/rose/rose_subr.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rose/rose_subr.c b/net/rose/rose_subr.c index 1734abba26a2..174d51c9ce37 100644 --- a/net/rose/rose_subr.c +++ b/net/rose/rose_subr.c @@ -290,10 +290,15 @@ static int rose_parse_national(unsigned char *p, struct rose_facilities_struct * facilities->source_ndigis = 0; facilities->dest_ndigis = 0; for (pt = p + 2, lg = 0 ; lg < l ; pt += AX25_ADDR_LEN, lg += AX25_ADDR_LEN) { - if (pt[6] & AX25_HBIT) + if (pt[6] & AX25_HBIT) { + if (facilities->dest_ndigis >= ROSE_MAX_DIGIS) + return -1; memcpy(&facilities->dest_digis[facilities->dest_ndigis++], pt, AX25_ADDR_LEN); - else + } else { + if (facilities->source_ndigis >= ROSE_MAX_DIGIS) + return -1; memcpy(&facilities->source_digis[facilities->source_ndigis++], pt, AX25_ADDR_LEN); + } } } p += l + 2; @@ -333,6 +338,11 @@ static int rose_parse_ccitt(unsigned char *p, struct rose_facilities_struct *fac case 0xC0: l = p[1]; + + /* Prevent overflows*/ + if (l < 10 || l > 20) + return -1; + if (*p == FAC_CCITT_DEST_NSAP) { memcpy(&facilities->source_addr, p + 7, ROSE_ADDR_LEN); memcpy(callsign, p + 12, l - 10); @@ -373,12 +383,16 @@ int rose_parse_facilities(unsigned char *p, switch (*p) { case FAC_NATIONAL: /* National */ len = rose_parse_national(p + 1, facilities, facilities_len - 1); + if (len < 0) + return 0; facilities_len -= len + 1; p += len + 1; break; case FAC_CCITT: /* CCITT */ len = rose_parse_ccitt(p + 1, facilities, facilities_len - 1); + if (len < 0) + return 0; facilities_len -= len + 1; p += len + 1; break; -- cgit From e0bccd315db0c2f919e7fcf9cb60db21d9986f52 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sun, 20 Mar 2011 06:48:05 +0000 Subject: rose: Add length checks to CALL_REQUEST parsing Define some constant offsets for CALL_REQUEST based on the description at and the definition of ROSE as using 10-digit (5-byte) addresses. Use them consistently. Validate all implicit and explicit facilities lengths. Validate the address length byte rather than either trusting or assuming its value. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/rose/af_rose.c | 8 ++--- net/rose/rose_loopback.c | 13 ++++++- net/rose/rose_route.c | 20 +++++++---- net/rose/rose_subr.c | 91 ++++++++++++++++++++++++++++++------------------ 4 files changed, 86 insertions(+), 46 deletions(-) (limited to 'net') diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 5ee0c62046a0..a80aef6e3d1f 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -978,7 +978,7 @@ int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct ros struct sock *make; struct rose_sock *make_rose; struct rose_facilities_struct facilities; - int n, len; + int n; skb->sk = NULL; /* Initially we don't know who it's for */ @@ -987,9 +987,9 @@ int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct ros */ memset(&facilities, 0x00, sizeof(struct rose_facilities_struct)); - len = (((skb->data[3] >> 4) & 0x0F) + 1) >> 1; - len += (((skb->data[3] >> 0) & 0x0F) + 1) >> 1; - if (!rose_parse_facilities(skb->data + len + 4, &facilities)) { + if (!rose_parse_facilities(skb->data + ROSE_CALL_REQ_FACILITIES_OFF, + skb->len - ROSE_CALL_REQ_FACILITIES_OFF, + &facilities)) { rose_transmit_clear_request(neigh, lci, ROSE_INVALID_FACILITY, 76); return 0; } diff --git a/net/rose/rose_loopback.c b/net/rose/rose_loopback.c index ae4a9d99aec7..344456206b70 100644 --- a/net/rose/rose_loopback.c +++ b/net/rose/rose_loopback.c @@ -73,9 +73,20 @@ static void rose_loopback_timer(unsigned long param) unsigned int lci_i, lci_o; while ((skb = skb_dequeue(&loopback_queue)) != NULL) { + if (skb->len < ROSE_MIN_LEN) { + kfree_skb(skb); + continue; + } lci_i = ((skb->data[0] << 8) & 0xF00) + ((skb->data[1] << 0) & 0x0FF); frametype = skb->data[2]; - dest = (rose_address *)(skb->data + 4); + if (frametype == ROSE_CALL_REQUEST && + (skb->len <= ROSE_CALL_REQ_FACILITIES_OFF || + skb->data[ROSE_CALL_REQ_ADDR_LEN_OFF] != + ROSE_CALL_REQ_ADDR_LEN_VAL)) { + kfree_skb(skb); + continue; + } + dest = (rose_address *)(skb->data + ROSE_CALL_REQ_DEST_ADDR_OFF); lci_o = ROSE_DEFAULT_MAXVC + 1 - lci_i; skb_reset_transport_header(skb); diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index 88a77e90e7e8..08dcd2f29cdc 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -861,7 +861,7 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) unsigned int lci, new_lci; unsigned char cause, diagnostic; struct net_device *dev; - int len, res = 0; + int res = 0; char buf[11]; #if 0 @@ -869,10 +869,17 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) return res; #endif + if (skb->len < ROSE_MIN_LEN) + return res; frametype = skb->data[2]; lci = ((skb->data[0] << 8) & 0xF00) + ((skb->data[1] << 0) & 0x0FF); - src_addr = (rose_address *)(skb->data + 9); - dest_addr = (rose_address *)(skb->data + 4); + if (frametype == ROSE_CALL_REQUEST && + (skb->len <= ROSE_CALL_REQ_FACILITIES_OFF || + skb->data[ROSE_CALL_REQ_ADDR_LEN_OFF] != + ROSE_CALL_REQ_ADDR_LEN_VAL)) + return res; + src_addr = (rose_address *)(skb->data + ROSE_CALL_REQ_SRC_ADDR_OFF); + dest_addr = (rose_address *)(skb->data + ROSE_CALL_REQ_DEST_ADDR_OFF); spin_lock_bh(&rose_neigh_list_lock); spin_lock_bh(&rose_route_list_lock); @@ -1010,12 +1017,11 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) goto out; } - len = (((skb->data[3] >> 4) & 0x0F) + 1) >> 1; - len += (((skb->data[3] >> 0) & 0x0F) + 1) >> 1; - memset(&facilities, 0x00, sizeof(struct rose_facilities_struct)); - if (!rose_parse_facilities(skb->data + len + 4, &facilities)) { + if (!rose_parse_facilities(skb->data + ROSE_CALL_REQ_FACILITIES_OFF, + skb->len - ROSE_CALL_REQ_FACILITIES_OFF, + &facilities)) { rose_transmit_clear_request(rose_neigh, lci, ROSE_INVALID_FACILITY, 76); goto out; } diff --git a/net/rose/rose_subr.c b/net/rose/rose_subr.c index 174d51c9ce37..f6c71caa94b9 100644 --- a/net/rose/rose_subr.c +++ b/net/rose/rose_subr.c @@ -142,7 +142,7 @@ void rose_write_internal(struct sock *sk, int frametype) *dptr++ = ROSE_GFI | lci1; *dptr++ = lci2; *dptr++ = frametype; - *dptr++ = 0xAA; + *dptr++ = ROSE_CALL_REQ_ADDR_LEN_VAL; memcpy(dptr, &rose->dest_addr, ROSE_ADDR_LEN); dptr += ROSE_ADDR_LEN; memcpy(dptr, &rose->source_addr, ROSE_ADDR_LEN); @@ -246,12 +246,16 @@ static int rose_parse_national(unsigned char *p, struct rose_facilities_struct * do { switch (*p & 0xC0) { case 0x00: + if (len < 2) + return -1; p += 2; n += 2; len -= 2; break; case 0x40: + if (len < 3) + return -1; if (*p == FAC_NATIONAL_RAND) facilities->rand = ((p[1] << 8) & 0xFF00) + ((p[2] << 0) & 0x00FF); p += 3; @@ -260,32 +264,48 @@ static int rose_parse_national(unsigned char *p, struct rose_facilities_struct * break; case 0x80: + if (len < 4) + return -1; p += 4; n += 4; len -= 4; break; case 0xC0: + if (len < 2) + return -1; l = p[1]; + if (len < 2 + l) + return -1; if (*p == FAC_NATIONAL_DEST_DIGI) { if (!fac_national_digis_received) { + if (l < AX25_ADDR_LEN) + return -1; memcpy(&facilities->source_digis[0], p + 2, AX25_ADDR_LEN); facilities->source_ndigis = 1; } } else if (*p == FAC_NATIONAL_SRC_DIGI) { if (!fac_national_digis_received) { + if (l < AX25_ADDR_LEN) + return -1; memcpy(&facilities->dest_digis[0], p + 2, AX25_ADDR_LEN); facilities->dest_ndigis = 1; } } else if (*p == FAC_NATIONAL_FAIL_CALL) { + if (l < AX25_ADDR_LEN) + return -1; memcpy(&facilities->fail_call, p + 2, AX25_ADDR_LEN); } else if (*p == FAC_NATIONAL_FAIL_ADD) { + if (l < 1 + ROSE_ADDR_LEN) + return -1; memcpy(&facilities->fail_addr, p + 3, ROSE_ADDR_LEN); } else if (*p == FAC_NATIONAL_DIGIS) { + if (l % AX25_ADDR_LEN) + return -1; fac_national_digis_received = 1; facilities->source_ndigis = 0; facilities->dest_ndigis = 0; @@ -319,24 +339,32 @@ static int rose_parse_ccitt(unsigned char *p, struct rose_facilities_struct *fac do { switch (*p & 0xC0) { case 0x00: + if (len < 2) + return -1; p += 2; n += 2; len -= 2; break; case 0x40: + if (len < 3) + return -1; p += 3; n += 3; len -= 3; break; case 0x80: + if (len < 4) + return -1; p += 4; n += 4; len -= 4; break; case 0xC0: + if (len < 2) + return -1; l = p[1]; /* Prevent overflows*/ @@ -365,49 +393,44 @@ static int rose_parse_ccitt(unsigned char *p, struct rose_facilities_struct *fac return n; } -int rose_parse_facilities(unsigned char *p, +int rose_parse_facilities(unsigned char *p, unsigned packet_len, struct rose_facilities_struct *facilities) { int facilities_len, len; facilities_len = *p++; - if (facilities_len == 0) + if (facilities_len == 0 || (unsigned)facilities_len > packet_len) return 0; - while (facilities_len > 0) { - if (*p == 0x00) { - facilities_len--; - p++; - - switch (*p) { - case FAC_NATIONAL: /* National */ - len = rose_parse_national(p + 1, facilities, facilities_len - 1); - if (len < 0) - return 0; - facilities_len -= len + 1; - p += len + 1; - break; - - case FAC_CCITT: /* CCITT */ - len = rose_parse_ccitt(p + 1, facilities, facilities_len - 1); - if (len < 0) - return 0; - facilities_len -= len + 1; - p += len + 1; - break; - - default: - printk(KERN_DEBUG "ROSE: rose_parse_facilities - unknown facilities family %02X\n", *p); - facilities_len--; - p++; - break; - } - } else - break; /* Error in facilities format */ + while (facilities_len >= 3 && *p == 0x00) { + facilities_len--; + p++; + + switch (*p) { + case FAC_NATIONAL: /* National */ + len = rose_parse_national(p + 1, facilities, facilities_len - 1); + break; + + case FAC_CCITT: /* CCITT */ + len = rose_parse_ccitt(p + 1, facilities, facilities_len - 1); + break; + + default: + printk(KERN_DEBUG "ROSE: rose_parse_facilities - unknown facilities family %02X\n", *p); + len = 1; + break; + } + + if (len < 0) + return 0; + if (WARN_ON(len >= facilities_len)) + return 0; + facilities_len -= len + 1; + p += len + 1; } - return 1; + return facilities_len == 0; } static int rose_create_facilities(unsigned char *buffer, struct rose_sock *rose) -- cgit From 3b261ade4224852ed841ecfd13876db812846e96 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Tue, 22 Mar 2011 01:59:47 +0000 Subject: net: remove useless comments in net/core/dev.c The code itself can explain what it is doing, no need these comments. Signed-off-by: WANG Cong Signed-off-by: David S. Miller --- net/core/dev.c | 54 ------------------------------------------------------ 1 file changed, 54 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index f453370131a0..9b23dcefae6c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1140,9 +1140,6 @@ static int __dev_open(struct net_device *dev) ASSERT_RTNL(); - /* - * Is it even present? - */ if (!netif_device_present(dev)) return -ENODEV; @@ -1151,9 +1148,6 @@ static int __dev_open(struct net_device *dev) if (ret) return ret; - /* - * Call device private open method - */ set_bit(__LINK_STATE_START, &dev->state); if (ops->ndo_validate_addr) @@ -1162,31 +1156,12 @@ static int __dev_open(struct net_device *dev) if (!ret && ops->ndo_open) ret = ops->ndo_open(dev); - /* - * If it went open OK then: - */ - if (ret) clear_bit(__LINK_STATE_START, &dev->state); else { - /* - * Set the flags. - */ dev->flags |= IFF_UP; - - /* - * Enable NET_DMA - */ net_dmaengine_get(); - - /* - * Initialize multicasting status - */ dev_set_rx_mode(dev); - - /* - * Wakeup transmit queue engine - */ dev_activate(dev); } @@ -1209,22 +1184,13 @@ int dev_open(struct net_device *dev) { int ret; - /* - * Is it already up? - */ if (dev->flags & IFF_UP) return 0; - /* - * Open device - */ ret = __dev_open(dev); if (ret < 0) return ret; - /* - * ... and announce new interface. - */ rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); call_netdevice_notifiers(NETDEV_UP, dev); @@ -1240,10 +1206,6 @@ static int __dev_close_many(struct list_head *head) might_sleep(); list_for_each_entry(dev, head, unreg_list) { - /* - * Tell people we are going down, so that they can - * prepare to death, when device is still operating. - */ call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); clear_bit(__LINK_STATE_START, &dev->state); @@ -1272,15 +1234,7 @@ static int __dev_close_many(struct list_head *head) if (ops->ndo_stop) ops->ndo_stop(dev); - /* - * Device is now down. - */ - dev->flags &= ~IFF_UP; - - /* - * Shutdown NET_DMA - */ net_dmaengine_put(); } @@ -1309,9 +1263,6 @@ static int dev_close_many(struct list_head *head) __dev_close_many(head); - /* - * Tell people we are down - */ list_for_each_entry(dev, head, unreg_list) { rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); call_netdevice_notifiers(NETDEV_DOWN, dev); @@ -1371,11 +1322,6 @@ EXPORT_SYMBOL(dev_disable_lro); static int dev_boot_phase = 1; -/* - * Device change register/unregister. These are not inline or static - * as we export them to the world. - */ - /** * register_netdevice_notifier - register a network notifier block * @nb: notifier -- cgit From 53914b67993c724cec585863755c9ebc8446e83b Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 22 Mar 2011 08:27:25 +0000 Subject: can: make struct proto const can_ioctl is the only reason for struct proto to be non-const. script/check-patch.pl suggests struct proto be const. Setting the reference to the common can_ioctl() in all CAN protocols directly removes the need to make the struct proto writable in af_can.c Signed-off-by: Kurt Van Dijck Signed-off-by: Oliver Hartkopp Signed-off-by: David S. Miller --- net/can/af_can.c | 9 +++------ net/can/bcm.c | 4 ++-- net/can/raw.c | 4 ++-- 3 files changed, 7 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/can/af_can.c b/net/can/af_can.c index 702be5a2c956..733d66f1b05a 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -95,7 +95,7 @@ struct s_pstats can_pstats; /* receive list statistics */ * af_can socket functions */ -static int can_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +int can_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; @@ -108,6 +108,7 @@ static int can_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return -ENOIOCTLCMD; } } +EXPORT_SYMBOL(can_ioctl); static void can_sock_destruct(struct sock *sk) { @@ -698,13 +699,9 @@ int can_proto_register(struct can_proto *cp) printk(KERN_ERR "can: protocol %d already registered\n", proto); err = -EBUSY; - } else { + } else proto_tab[proto] = cp; - /* use generic ioctl function if not defined by module */ - if (!cp->ops->ioctl) - cp->ops->ioctl = can_ioctl; - } spin_unlock(&proto_tab_lock); if (err < 0) diff --git a/net/can/bcm.c b/net/can/bcm.c index 092dc88a7c64..871a0ad51025 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1569,7 +1569,7 @@ static int bcm_recvmsg(struct kiocb *iocb, struct socket *sock, return size; } -static struct proto_ops bcm_ops __read_mostly = { +static const struct proto_ops bcm_ops = { .family = PF_CAN, .release = bcm_release, .bind = sock_no_bind, @@ -1578,7 +1578,7 @@ static struct proto_ops bcm_ops __read_mostly = { .accept = sock_no_accept, .getname = sock_no_getname, .poll = datagram_poll, - .ioctl = NULL, /* use can_ioctl() from af_can.c */ + .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */ .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_no_setsockopt, diff --git a/net/can/raw.c b/net/can/raw.c index 883e9d74fddf..649acfa7c70a 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -742,7 +742,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct socket *sock, return size; } -static struct proto_ops raw_ops __read_mostly = { +static const struct proto_ops raw_ops = { .family = PF_CAN, .release = raw_release, .bind = raw_bind, @@ -751,7 +751,7 @@ static struct proto_ops raw_ops __read_mostly = { .accept = sock_no_accept, .getname = raw_getname, .poll = datagram_poll, - .ioctl = NULL, /* use can_ioctl() from af_can.c */ + .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */ .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = raw_setsockopt, -- cgit From 8628bd8af7c4c14f40f5183f80f5744c4e682439 Mon Sep 17 00:00:00 2001 From: Jan Luebbe Date: Thu, 24 Mar 2011 07:44:22 +0000 Subject: ipv4: Fix IP timestamp option (IPOPT_TS_PRESPEC) handling in ip_options_echo() The current handling of echoed IP timestamp options with prespecified addresses is rather broken since the 2.2.x kernels. As far as i understand it, it should behave like when originating packets. Currently it will only timestamp the next free slot if: - there is space for *two* timestamps - some random data from the echoed packet taken as an IP is *not* a local IP This first is caused by an off-by-one error. 'soffset' points to the next free slot and so we only need to have 'soffset + 7 <= optlen'. The second bug is using sptr as the start of the option, when it really is set to 'skb_network_header(skb)'. I just use dptr instead which points to the timestamp option. Finally it would only timestamp for non-local IPs, which we shouldn't do. So instead we exclude all unicast destinations, similar to what we do in ip_options_compile(). Signed-off-by: Jan Luebbe Signed-off-by: David S. Miller --- net/ipv4/ip_options.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 1906fa35860c..28a736f3442f 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -140,11 +140,11 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) } else { dopt->ts_needtime = 0; - if (soffset + 8 <= optlen) { + if (soffset + 7 <= optlen) { __be32 addr; - memcpy(&addr, sptr+soffset-1, 4); - if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_LOCAL) { + memcpy(&addr, dptr+soffset-1, 4); + if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_UNICAST) { dopt->ts_needtime = 1; soffset += 8; } -- cgit From edf947f10074fea27fdb1730524dca59355a1c40 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Thu, 24 Mar 2011 13:24:01 +0000 Subject: bridge: notify applications if address of bridge device changes The mac address of the bridge device may be changed when a new interface is added to the bridge. If this happens, then the bridge needs to call the network notifiers to tickle any other systems that care. Since bridge can be a module, this also means exporting the notifier function. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_if.c | 6 +++++- net/bridge/br_private.h | 2 +- net/bridge/br_stp_if.c | 9 ++++++--- net/core/dev.c | 1 + 4 files changed, 13 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index dce8f0009a12..718b60366dfe 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -389,6 +389,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) { struct net_bridge_port *p; int err = 0; + bool changed_addr; /* Don't allow bridging non-ethernet like devices */ if ((dev->flags & IFF_LOOPBACK) || @@ -446,7 +447,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) list_add_rcu(&p->list, &br->port_list); spin_lock_bh(&br->lock); - br_stp_recalculate_bridge_id(br); + changed_addr = br_stp_recalculate_bridge_id(br); br_features_recompute(br); if ((dev->flags & IFF_UP) && netif_carrier_ok(dev) && @@ -456,6 +457,9 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) br_ifinfo_notify(RTM_NEWLINK, p); + if (changed_addr) + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + dev_set_mtu(br->dev, br_min_mtu(br)); kobject_uevent(&p->kobj, KOBJ_ADD); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 19e2f46ed086..387013d33745 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -497,7 +497,7 @@ extern void br_stp_disable_bridge(struct net_bridge *br); extern void br_stp_set_enabled(struct net_bridge *br, unsigned long val); extern void br_stp_enable_port(struct net_bridge_port *p); extern void br_stp_disable_port(struct net_bridge_port *p); -extern void br_stp_recalculate_bridge_id(struct net_bridge *br); +extern bool br_stp_recalculate_bridge_id(struct net_bridge *br); extern void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *a); extern void br_stp_set_bridge_priority(struct net_bridge *br, u16 newprio); diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 79372d4a4055..5593f5aec942 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -204,7 +204,7 @@ void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *addr) static const unsigned short br_mac_zero_aligned[ETH_ALEN >> 1]; /* called under bridge lock */ -void br_stp_recalculate_bridge_id(struct net_bridge *br) +bool br_stp_recalculate_bridge_id(struct net_bridge *br) { const unsigned char *br_mac_zero = (const unsigned char *)br_mac_zero_aligned; @@ -222,8 +222,11 @@ void br_stp_recalculate_bridge_id(struct net_bridge *br) } - if (compare_ether_addr(br->bridge_id.addr, addr)) - br_stp_change_bridge_id(br, addr); + if (compare_ether_addr(br->bridge_id.addr, addr) == 0) + return false; /* no change */ + + br_stp_change_bridge_id(br, addr); + return true; } /* called under bridge lock */ diff --git a/net/core/dev.c b/net/core/dev.c index 9b23dcefae6c..563ddc28139d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1423,6 +1423,7 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) ASSERT_RTNL(); return raw_notifier_call_chain(&netdev_chain, val, dev); } +EXPORT_SYMBOL(call_netdevice_notifiers); /* When > 0 there are consumers of rx skb time stamps */ static atomic_t netstamp_needed = ATOMIC_INIT(0); -- cgit From 3e49e6d520401e1d25ec8d366520aad2c01adc1c Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Sat, 26 Mar 2011 05:10:30 +0000 Subject: net: use CHECKSUM_NONE instead of magic number Two places in the kernel were doing skb->ip_summed = 0. Change both to skb->ip_summed = CHECKSUM_NONE, which is more readable. Signed-off-by: Cesar Eduardo Barros Signed-off-by: David S. Miller --- net/ipv6/ip6mr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 7ff0343e05c7..29e48593bf22 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -663,7 +663,7 @@ static int pim6_rcv(struct sk_buff *skb) skb_pull(skb, (u8 *)encap - skb->data); skb_reset_network_header(skb); skb->protocol = htons(ETH_P_IPV6); - skb->ip_summed = 0; + skb->ip_summed = CHECKSUM_NONE; skb->pkt_type = PACKET_HOST; skb_tunnel_rx(skb, reg_dev); -- cgit From 673e63c688f43104c73aad8ea4237f7ad41fa14d Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 22 Mar 2011 23:54:49 +0000 Subject: net: fix ethtool->set_flags not intended -EINVAL return value After commit d5dbda23804156ae6f35025ade5307a49d1db6d7 "ethtool: Add support for vlan accleration.", drivers that have NETIF_F_HW_VLAN_TX, and/or NETIF_F_HW_VLAN_RX feature, but do not allow enable/disable vlan acceleration via ethtool set_flags, always return -EINVAL from that function. Fix by returning -EINVAL only if requested features do not match current settings and can not be changed by driver. Change any driver that define ethtool->set_flags to use ethtool_invalid_flags() to avoid similar problems in the future (also on drivers that do not have the problem). Tested with modified (to reproduce this bug) myri10ge driver. Cc: stable@kernel.org # 2.6.37+ Signed-off-by: Stanislaw Gruszka Signed-off-by: David S. Miller --- net/core/ethtool.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 24bd57493c0d..74ead9eca126 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -141,9 +141,24 @@ u32 ethtool_op_get_flags(struct net_device *dev) } EXPORT_SYMBOL(ethtool_op_get_flags); +/* Check if device can enable (or disable) particular feature coded in "data" + * argument. Flags "supported" describe features that can be toggled by device. + * If feature can not be toggled, it state (enabled or disabled) must match + * hardcoded device features state, otherwise flags are marked as invalid. + */ +bool ethtool_invalid_flags(struct net_device *dev, u32 data, u32 supported) +{ + u32 features = dev->features & flags_dup_features; + /* "data" can contain only flags_dup_features bits, + * see __ethtool_set_flags */ + + return (features & ~supported) != (data & ~supported); +} +EXPORT_SYMBOL(ethtool_invalid_flags); + int ethtool_op_set_flags(struct net_device *dev, u32 data, u32 supported) { - if (data & ~supported) + if (ethtool_invalid_flags(dev, data, supported)) return -EINVAL; dev->features = ((dev->features & ~flags_dup_features) | -- cgit From 4dc217df68a17a57f8464c74c1b4785e40bddf77 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 25 Mar 2011 15:30:38 +0100 Subject: mac80211: fix a crash in minstrel_ht in HT mode with no supported MCS rates When a client connects in HT mode but does not provide any valid MCS rates, the function that finds the next sample rate gets stuck in an infinite loop. Fix this by falling back to legacy rates if no usable MCS rates are found. Signed-off-by: Felix Fietkau Cc: stable@kernel.org Signed-off-by: John W. Linville --- net/mac80211/rc80211_minstrel_ht.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 8212a8bebf06..dbdebeda097f 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -659,18 +659,14 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, struct ieee80211_mcs_info *mcs = &sta->ht_cap.mcs; struct ieee80211_local *local = hw_to_local(mp->hw); u16 sta_cap = sta->ht_cap.cap; + int n_supported = 0; int ack_dur; int stbc; int i; /* fall back to the old minstrel for legacy stations */ - if (!sta->ht_cap.ht_supported) { - msp->is_ht = false; - memset(&msp->legacy, 0, sizeof(msp->legacy)); - msp->legacy.r = msp->ratelist; - msp->legacy.sample_table = msp->sample_table; - return mac80211_minstrel.rate_init(priv, sband, sta, &msp->legacy); - } + if (!sta->ht_cap.ht_supported) + goto use_legacy; BUILD_BUG_ON(ARRAY_SIZE(minstrel_mcs_groups) != MINSTREL_MAX_STREAMS * MINSTREL_STREAM_GROUPS); @@ -725,7 +721,22 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, mi->groups[i].supported = mcs->rx_mask[minstrel_mcs_groups[i].streams - 1]; + + if (mi->groups[i].supported) + n_supported++; } + + if (!n_supported) + goto use_legacy; + + return; + +use_legacy: + msp->is_ht = false; + memset(&msp->legacy, 0, sizeof(msp->legacy)); + msp->legacy.r = msp->ratelist; + msp->legacy.sample_table = msp->sample_table; + return mac80211_minstrel.rate_init(priv, sband, sta, &msp->legacy); } static void -- cgit From 1f951a7f8ba05192291f781ef99a892697e47d62 Mon Sep 17 00:00:00 2001 From: Petr Štetiar Date: Sun, 27 Mar 2011 13:31:26 +0200 Subject: mac80211: fix NULL pointer dereference in ieee80211_key_alloc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ieee80211_key struct can be kfree()d several times in the function, for example if some of the key setup functions fails beforehand, but there's no check if the struct is still valid before we call memcpy() and INIT_LIST_HEAD() on it. In some cases (like it was in my case), if there's missing aes-generic module it could lead to the following kernel OOPS: Unable to handle kernel NULL pointer dereference at virtual address 0000018c .... PC is at memcpy+0x80/0x29c ... Backtrace: [] (ieee80211_key_alloc+0x0/0x234 [mac80211]) from [] (ieee80211_add_key+0x70/0x12c [mac80211]) [] (ieee80211_add_key+0x0/0x12c [mac80211]) from [] (__cfg80211_set_encryption+0x2a8/0x464 [cfg80211]) Signed-off-by: Petr Štetiar Reviewed-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/key.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 8c02469b7176..09cf1f28c12b 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -342,7 +342,7 @@ struct ieee80211_key *ieee80211_key_alloc(u32 cipher, int idx, size_t key_len, if (IS_ERR(key->u.ccmp.tfm)) { err = PTR_ERR(key->u.ccmp.tfm); kfree(key); - key = ERR_PTR(err); + return ERR_PTR(err); } break; case WLAN_CIPHER_SUITE_AES_CMAC: @@ -360,7 +360,7 @@ struct ieee80211_key *ieee80211_key_alloc(u32 cipher, int idx, size_t key_len, if (IS_ERR(key->u.aes_cmac.tfm)) { err = PTR_ERR(key->u.aes_cmac.tfm); kfree(key); - key = ERR_PTR(err); + return ERR_PTR(err); } break; } -- cgit From 67aa030c0dff6095128bcb4e8043b48360f32331 Mon Sep 17 00:00:00 2001 From: Mariusz Kozlowski Date: Sat, 26 Mar 2011 18:58:51 +0100 Subject: mac80211: fix possible NULL pointer dereference This patch moves 'key' dereference after BUG_ON(!key) so that when key is NULL we will see proper trace instead of oops. Signed-off-by: Mariusz Kozlowski Acked-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/key.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 09cf1f28c12b..af3c56482c80 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -400,11 +400,12 @@ int ieee80211_key_link(struct ieee80211_key *key, { struct ieee80211_key *old_key; int idx, ret; - bool pairwise = key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE; + bool pairwise; BUG_ON(!sdata); BUG_ON(!key); + pairwise = key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE; idx = key->conf.keyidx; key->local = sdata->local; key->sdata = sdata; -- cgit From bef9bacc4ec7ea6a02876164cd6ccaa4759edce4 Mon Sep 17 00:00:00 2001 From: Mariusz Kozlowski Date: Sat, 26 Mar 2011 19:26:55 +0100 Subject: cfg80211:: fix possible NULL pointer dereference In cfg80211_inform_bss_frame() wiphy is first dereferenced on privsz initialisation and then it is checked for NULL. This patch fixes that. Signed-off-by: Mariusz Kozlowski Acked-by: Johannes Berg Signed-off-by: John W. Linville --- net/wireless/scan.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/wireless/scan.c b/net/wireless/scan.c index ea427f418f64..300c11d99997 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -585,16 +585,23 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy, struct cfg80211_internal_bss *res; size_t ielen = len - offsetof(struct ieee80211_mgmt, u.probe_resp.variable); - size_t privsz = wiphy->bss_priv_size; + size_t privsz; + + if (WARN_ON(!mgmt)) + return NULL; + + if (WARN_ON(!wiphy)) + return NULL; if (WARN_ON(wiphy->signal_type == CFG80211_SIGNAL_TYPE_UNSPEC && (signal < 0 || signal > 100))) return NULL; - if (WARN_ON(!mgmt || !wiphy || - len < offsetof(struct ieee80211_mgmt, u.probe_resp.variable))) + if (WARN_ON(len < offsetof(struct ieee80211_mgmt, u.probe_resp.variable))) return NULL; + privsz = wiphy->bss_priv_size; + res = kzalloc(sizeof(*res) + privsz + ielen, gfp); if (!res) return NULL; -- cgit From 2b78ac9bfc7483ba4bda9ad3d10dd4afcf88337c Mon Sep 17 00:00:00 2001 From: Juuso Oikarinen Date: Mon, 28 Mar 2011 14:32:32 +0300 Subject: cfg80211: fix BSS double-unlinking (continued) This patch adds to the fix "fix BSS double-unlinking" (commit 3207390a8b58bfc1335750f91cf6783c48ca19ca) by Johannes Berg. It turns out, that the double-unlinking scenario can also occur if expired BSS elements are removed whilst an interface is performing association. To work around that, replace list_del with list_del_init also in the "cfg80211_bss_expire" function, so that the check for whether the BSS still is in the list works correctly in cfg80211_unlink_bss. Signed-off-by: Juuso Oikarinen Reviewed-by: Johannes Berg Signed-off-by: John W. Linville --- net/wireless/scan.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 300c11d99997..fbf6f33ae4d0 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -123,6 +123,15 @@ void cfg80211_bss_age(struct cfg80211_registered_device *dev, } } +/* must hold dev->bss_lock! */ +static void __cfg80211_unlink_bss(struct cfg80211_registered_device *dev, + struct cfg80211_internal_bss *bss) +{ + list_del_init(&bss->list); + rb_erase(&bss->rbn, &dev->bss_tree); + kref_put(&bss->ref, bss_release); +} + /* must hold dev->bss_lock! */ void cfg80211_bss_expire(struct cfg80211_registered_device *dev) { @@ -134,9 +143,7 @@ void cfg80211_bss_expire(struct cfg80211_registered_device *dev) continue; if (!time_after(jiffies, bss->ts + IEEE80211_SCAN_RESULT_EXPIRE)) continue; - list_del(&bss->list); - rb_erase(&bss->rbn, &dev->bss_tree); - kref_put(&bss->ref, bss_release); + __cfg80211_unlink_bss(dev, bss); expired = true; } @@ -669,11 +676,8 @@ void cfg80211_unlink_bss(struct wiphy *wiphy, struct cfg80211_bss *pub) spin_lock_bh(&dev->bss_lock); if (!list_empty(&bss->list)) { - list_del_init(&bss->list); + __cfg80211_unlink_bss(dev, bss); dev->bss_generation++; - rb_erase(&bss->rbn, &dev->bss_tree); - - kref_put(&bss->ref, bss_release); } spin_unlock_bh(&dev->bss_lock); } -- cgit From 499fe9a419d43410be576bcc825658997b6ce822 Mon Sep 17 00:00:00 2001 From: Daniel Halperin Date: Thu, 24 Mar 2011 16:01:48 -0700 Subject: mac80211: fix aggregation frame release during timeout Suppose the aggregation reorder buffer looks like this: x-T-R1-y-R2, where x and y are frames that have not been received, T is a received frame that has timed out, and R1,R2 are received frames that have not yet timed out. The proper behavior in this scenario is to move the window past x (skipping it), release T and R1, and leave the window at y until y is received or R2 times out. As written, this code will instead leave the window at R1, because it has not yet timed out. Fix this by exiting the reorder loop only when the frame that has not timed out AND there are skipped frames earlier in the current valid window. Signed-off-by: Daniel Halperin Signed-off-by: John W. Linville --- net/mac80211/rx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 5c1930ba8ebe..aa5cc37b4921 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -612,7 +612,8 @@ static void ieee80211_sta_reorder_release(struct ieee80211_hw *hw, skipped++; continue; } - if (!time_after(jiffies, tid_agg_rx->reorder_time[j] + + if (skipped && + !time_after(jiffies, tid_agg_rx->reorder_time[j] + HT_RX_REORDER_BUF_TIMEOUT)) goto set_release_timer; -- cgit From 4910ac6c526d2868adcb5893e0c428473de862b5 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 28 Mar 2011 16:51:15 -0700 Subject: ipv4: Don't ip_rt_put() an error pointer in RAW sockets. Reported-by: Marc Kleine-Budde Signed-off-by: David S. Miller --- net/ipv4/raw.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index e837ffd3edc3..2d3c72e5bbbf 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -569,6 +569,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, rt = ip_route_output_flow(sock_net(sk), &fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); + rt = NULL; goto done; } } -- cgit From 36ae0148dbb6b9e15d8f067bb7523fd2b765a6af Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 28 Mar 2011 19:45:52 +0000 Subject: xfrm: Move the test on replay window size into the replay check functions As it is, the replay check is just performed if the replay window of the legacy implementation is nonzero. So we move the test on a nonzero replay window inside the replay check functions to be sure we are testing for the right implementation. Signed-off-by: Steffen Klassert Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/xfrm/xfrm_input.c | 2 +- net/xfrm/xfrm_replay.c | 17 +++++++++++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 341cd1189f8a..a026b0ef2443 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -173,7 +173,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) goto drop_unlock; } - if (x->props.replay_window && x->repl->check(x, skb, seq)) { + if (x->repl->check(x, skb, seq)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR); goto drop_unlock; } diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index 2f5be5b15740..f218385950ca 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -118,6 +118,9 @@ static int xfrm_replay_check(struct xfrm_state *x, u32 diff; u32 seq = ntohl(net_seq); + if (!x->props.replay_window) + return 0; + if (unlikely(seq == 0)) goto err; @@ -193,9 +196,14 @@ static int xfrm_replay_check_bmp(struct xfrm_state *x, { unsigned int bitnr, nr; struct xfrm_replay_state_esn *replay_esn = x->replay_esn; + u32 pos; u32 seq = ntohl(net_seq); u32 diff = replay_esn->seq - seq; - u32 pos = (replay_esn->seq - 1) % replay_esn->replay_window; + + if (!replay_esn->replay_window) + return 0; + + pos = (replay_esn->seq - 1) % replay_esn->replay_window; if (unlikely(seq == 0)) goto err; @@ -373,12 +381,17 @@ static int xfrm_replay_check_esn(struct xfrm_state *x, unsigned int bitnr, nr; u32 diff; struct xfrm_replay_state_esn *replay_esn = x->replay_esn; + u32 pos; u32 seq = ntohl(net_seq); - u32 pos = (replay_esn->seq - 1) % replay_esn->replay_window; u32 wsize = replay_esn->replay_window; u32 top = replay_esn->seq; u32 bottom = top - wsize + 1; + if (!wsize) + return 0; + + pos = (replay_esn->seq - 1) % replay_esn->replay_window; + if (unlikely(seq == 0 && replay_esn->seq_hi == 0 && (replay_esn->seq < replay_esn->replay_window - 1))) goto err; -- cgit From af2f464e326ebad57284cfdecb03f1606e89bbc7 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 28 Mar 2011 19:46:39 +0000 Subject: xfrm: Assign esn pointers when cloning a state When we clone a xfrm state we have to assign the replay_esn and the preplay_esn pointers to the state if we use the new replay detection method. To this end, we add a xfrm_replay_clone() function that allocates memory for the replay detection and takes over the necessary values from the original state. Signed-off-by: Steffen Klassert Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/xfrm/xfrm_state.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index f83a3d1da81b..dd78536d40de 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1181,6 +1181,12 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, int *errp) goto error; } + if (orig->replay_esn) { + err = xfrm_replay_clone(x, orig); + if (err) + goto error; + } + memcpy(&x->mark, &orig->mark, sizeof(x->mark)); err = xfrm_init_state(x); -- cgit From e2b19125e94124daaeda1ddcf9b85b04575ad86f Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 28 Mar 2011 19:47:30 +0000 Subject: xfrm: Check for esn buffer len in xfrm_new_ae In xfrm_new_ae() we may overwrite the allocated esn replay state buffer with a wrong size. So check that the new size matches the original allocated size and return an error if this is not the case. Signed-off-by: Steffen Klassert Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/xfrm/xfrm_user.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index fc152d28753c..ccc4c0c8ef00 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -360,6 +360,23 @@ static int attach_aead(struct xfrm_algo_aead **algpp, u8 *props, return 0; } +static inline int xfrm_replay_verify_len(struct xfrm_replay_state_esn *replay_esn, + struct nlattr *rp) +{ + struct xfrm_replay_state_esn *up; + + if (!replay_esn || !rp) + return 0; + + up = nla_data(rp); + + if (xfrm_replay_state_esn_len(replay_esn) != + xfrm_replay_state_esn_len(up)) + return -EINVAL; + + return 0; +} + static int xfrm_alloc_replay_state_esn(struct xfrm_replay_state_esn **replay_esn, struct xfrm_replay_state_esn **preplay_esn, struct nlattr *rta) @@ -1766,6 +1783,10 @@ static int xfrm_new_ae(struct sk_buff *skb, struct nlmsghdr *nlh, if (x->km.state != XFRM_STATE_VALID) goto out; + err = xfrm_replay_verify_len(x->replay_esn, rp); + if (err) + goto out; + spin_lock_bh(&x->lock); xfrm_update_ae_params(x, attrs); spin_unlock_bh(&x->lock); -- cgit From 02aadf72fe2c83f145e3437734e66be53abae481 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 28 Mar 2011 19:48:09 +0000 Subject: xfrm: Restrict extended sequence numbers to esp The IPsec extended sequence numbers are fully implemented just for esp. So restrict the usage to esp until other protocols have support too. Signed-off-by: Steffen Klassert Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/xfrm/xfrm_user.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index ccc4c0c8ef00..3d15d3e1b2c4 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -127,6 +127,9 @@ static inline int verify_replay(struct xfrm_usersa_info *p, if (!rt) return 0; + if (p->id.proto != IPPROTO_ESP) + return -EINVAL; + if (p->replay_window != 0) return -EINVAL; -- cgit From 1459a3cc51d90d78027c7b5c1790e5d22751c8eb Mon Sep 17 00:00:00 2001 From: Balaji G Date: Tue, 29 Mar 2011 06:20:04 +0000 Subject: bridge: Fix compilation warning in function br_stp_recalculate_bridge_id() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit net/bridge/br_stp_if.c: In function ‘br_stp_recalculate_bridge_id’: net/bridge/br_stp_if.c:216:3: warning: ‘return’ with no value, in function returning non-void Signed-off-by: G.Balaji Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_stp_if.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 5593f5aec942..9b61d09de9b9 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -213,7 +213,7 @@ bool br_stp_recalculate_bridge_id(struct net_bridge *br) /* user has chosen a value so keep it */ if (br->flags & BR_SET_MAC_ADDR) - return; + return false; list_for_each_entry(p, &br->port_list, list) { if (addr == br_mac_zero || -- cgit From 93ca3bb5df9bc8b2c60485e1cc6507c3d7c8e1fa Mon Sep 17 00:00:00 2001 From: Timo Teräs Date: Mon, 28 Mar 2011 22:40:53 +0000 Subject: net: gre: provide multicast mappings for ipv4 and ipv6 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit My commit 6d55cb91a0020ac0 (gre: fix hard header destination address checking) broke multicast. The reason is that ip_gre used to get ipgre_header() calls with zero destination if we have NOARP or multicast destination. Instead the actual target was decided at ipgre_tunnel_xmit() time based on per-protocol dissection. Instead of allowing the "abuse" of ->header() calls with invalid destination, this creates multicast mappings for ip_gre. This also fixes "ip neigh show nud noarp" to display the proper multicast mappings used by the gre device. Reported-by: Doug Kehn Signed-off-by: Timo Teräs Acked-by: Doug Kehn Signed-off-by: David S. Miller --- net/ipv4/arp.c | 3 +++ net/ipv6/ndisc.c | 2 ++ 2 files changed, 5 insertions(+) (limited to 'net') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 090d273d7865..1b74d3b64371 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -215,6 +215,9 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir) case ARPHRD_INFINIBAND: ip_ib_mc_map(addr, dev->broadcast, haddr); return 0; + case ARPHRD_IPGRE: + ip_ipgre_mc_map(addr, dev->broadcast, haddr); + return 0; default: if (dir) { memcpy(haddr, dev->broadcast, dev->addr_len); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 0e49c9db3c98..92f952d093db 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -341,6 +341,8 @@ int ndisc_mc_map(struct in6_addr *addr, char *buf, struct net_device *dev, int d case ARPHRD_INFINIBAND: ipv6_ib_mc_map(addr, dev->broadcast, buf); return 0; + case ARPHRD_IPGRE: + return ipv6_ipgre_mc_map(addr, dev->broadcast, buf); default: if (dir) { memcpy(buf, dev->broadcast, dev->addr_len); -- cgit From ff9a57a62afbbe2d0f3a09af321f1fd7645f38a5 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Sat, 26 Mar 2011 20:27:24 +0000 Subject: bridge: mcast snooping, fix length check of snooped MLDv1/2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "len = ntohs(ip6h->payload_len)" does not include the length of the ipv6 header itself, which the rest of this function assumes, though. This leads to a length check less restrictive as it should be in the following line for one thing. For another, it very likely leads to an integer underrun when substracting the offset and therefore to a very high new value of 'len' due to its unsignedness. This will ultimately lead to the pskb_trim_rcsum() practically never being called, even in the cases where it should. Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index f61eb2eff3fd..59660c909a7c 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1475,7 +1475,7 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, ip6h->payload_len == 0) return 0; - len = ntohs(ip6h->payload_len); + len = ntohs(ip6h->payload_len) + sizeof(*ip6h); if (skb->len < len) return -EINVAL; -- cgit From 79b569f0ec53a14c4d71e79d93a8676d9a0fda6d Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Wed, 30 Mar 2011 02:42:17 -0700 Subject: netdev: fix mtu check when TSO is enabled In case the device where is coming from the packet has TSO enabled, we should not check the mtu size value as this one could be bigger than the expected value. This is the case for the macvlan driver when the lower device has TSO enabled. The macvlan inherit this feature and forward the packets without fragmenting them. Then the packets go through dev_forward_skb and are dropped. This patch fix this by checking TSO is not enabled when we want to check the mtu size. Signed-off-by: Daniel Lezcano Acked-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/core/dev.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 563ddc28139d..3da9fb06d47a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1454,6 +1454,27 @@ static inline void net_timestamp_check(struct sk_buff *skb) __net_timestamp(skb); } +static inline bool is_skb_forwardable(struct net_device *dev, + struct sk_buff *skb) +{ + unsigned int len; + + if (!(dev->flags & IFF_UP)) + return false; + + len = dev->mtu + dev->hard_header_len + VLAN_HLEN; + if (skb->len <= len) + return true; + + /* if TSO is enabled, we don't care about the length as the packet + * could be forwarded without being segmented before + */ + if (skb_is_gso(skb)) + return true; + + return false; +} + /** * dev_forward_skb - loopback an skb to another netif * @@ -1477,8 +1498,7 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) skb_orphan(skb); nf_reset(skb); - if (unlikely(!(dev->flags & IFF_UP) || - (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) { + if (unlikely(!is_skb_forwardable(dev, skb))) { atomic_long_inc(&dev->rx_dropped); kfree_skb(skb); return NET_RX_DROP; -- cgit From c031235b395433350f25943b7580a5e343c7b7b2 Mon Sep 17 00:00:00 2001 From: "Philip A. Prindeville" Date: Wed, 30 Mar 2011 13:17:04 +0000 Subject: atm/solos-pci: Don't flap VCs when carrier state changes Don't flap VCs when carrier state changes; higher-level protocols can detect loss of connectivity and act accordingly. This is more consistent with how other network interfaces work. We no longer use release_vccs() so we can delete it. release_vccs() was duplicated from net/atm/common.c; make the corresponding function exported, since other code duplicates it and could leverage it if it were public. Signed-off-by: Philip A. Prindeville Signed-off-by: David S. Miller --- net/atm/common.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/atm/common.c b/net/atm/common.c index 1b9c52a02cd3..22b963d06a10 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -252,6 +252,7 @@ void atm_dev_release_vccs(struct atm_dev *dev) } write_unlock_irq(&vcc_sklist_lock); } +EXPORT_SYMBOL(atm_dev_release_vccs); static int adjust_tp(struct atm_trafprm *tp, unsigned char aal) { -- cgit From e2666f84958adb3a034b98e99699b55705117e01 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 30 Mar 2011 16:57:46 -0700 Subject: fib: add rtnl locking in ip_fib_net_exit Daniel J Blueman reported a lockdep splat in trie_firstleaf(), caused by RTNL being not locked before a call to fib_table_flush() Reported-by: Daniel J Blueman Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index f116ce8f1b46..451088330bbb 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1068,6 +1068,7 @@ static void ip_fib_net_exit(struct net *net) fib4_rules_exit(net); #endif + rtnl_lock(); for (i = 0; i < FIB_TABLE_HASHSZ; i++) { struct fib_table *tb; struct hlist_head *head; @@ -1080,6 +1081,7 @@ static void ip_fib_net_exit(struct net *net) fib_free_table(tb); } } + rtnl_unlock(); kfree(net->ipv4.fib_table_hash); } -- cgit From a84b50ceb7d640437d0dc28a2bef0d0de054de89 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 30 Mar 2011 17:51:36 -0700 Subject: sctp: Pass __GFP_NOWARN to hash table allocation attempts. Like DCCP and other similar pieces of code, there are mechanisms here to try allocating smaller hash tables if the allocation fails. So pass in __GFP_NOWARN like the others do instead of emitting a scary message. Reported-by: Dave Jones Signed-off-by: David S. Miller --- net/sctp/protocol.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 152976ec0b74..d5bf91d04f63 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1205,7 +1205,7 @@ SCTP_STATIC __init int sctp_init(void) if ((sctp_assoc_hashsize > (64 * 1024)) && order > 0) continue; sctp_assoc_hashtable = (struct sctp_hashbucket *) - __get_free_pages(GFP_ATOMIC, order); + __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, order); } while (!sctp_assoc_hashtable && --order > 0); if (!sctp_assoc_hashtable) { pr_err("Failed association hash alloc\n"); @@ -1238,7 +1238,7 @@ SCTP_STATIC __init int sctp_init(void) if ((sctp_port_hashsize > (64 * 1024)) && order > 0) continue; sctp_port_hashtable = (struct sctp_bind_hashbucket *) - __get_free_pages(GFP_ATOMIC, order); + __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, order); } while (!sctp_port_hashtable && --order > 0); if (!sctp_port_hashtable) { pr_err("Failed bind hash alloc\n"); -- cgit From c100c8f4c3c6f2a407bdbaaad2c4f1062e6a473a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 31 Mar 2011 18:59:10 -0700 Subject: appletalk: Fix OOPS in atalk_release(). Commit 60d9f461a20ba59219fdcdc30cbf8e3a4ad3f625 ("appletalk: remove the BKL") added a dereference of "sk" before checking for NULL in atalk_release(). Guard the code block completely, rather than partially, with the NULL check. Reported-by: Dave Jones Signed-off-by: David S. Miller --- net/appletalk/ddp.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 206e771e82d1..956a5302002a 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1051,16 +1051,17 @@ static int atalk_release(struct socket *sock) { struct sock *sk = sock->sk; - sock_hold(sk); - lock_sock(sk); if (sk) { + sock_hold(sk); + lock_sock(sk); + sock_orphan(sk); sock->sk = NULL; atalk_destroy_socket(sk); - } - release_sock(sk); - sock_put(sk); + release_sock(sk); + sock_put(sk); + } return 0; } -- cgit From 512d06b5b64fb422d90f199b1be188082729edf9 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Mon, 4 Apr 2011 15:18:45 +0200 Subject: netfilter: ipset: list:set timeout variant fixes - the timeout value was actually not set - the garbage collector was broken The variant is fixed, the tests to the ipset testsuite are added. Signed-off-by: Jozsef Kadlecsik Signed-off-by: Patrick McHardy --- net/netfilter/ipset/ip_set_list_set.c | 53 ++++++++++++++++------------------- 1 file changed, 24 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index a47c32982f06..f4a46c0d25f3 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -43,14 +43,19 @@ struct list_set { static inline struct set_elem * list_set_elem(const struct list_set *map, u32 id) { - return (struct set_elem *)((char *)map->members + id * map->dsize); + return (struct set_elem *)((void *)map->members + id * map->dsize); +} + +static inline struct set_telem * +list_set_telem(const struct list_set *map, u32 id) +{ + return (struct set_telem *)((void *)map->members + id * map->dsize); } static inline bool list_set_timeout(const struct list_set *map, u32 id) { - const struct set_telem *elem = - (const struct set_telem *) list_set_elem(map, id); + const struct set_telem *elem = list_set_telem(map, id); return ip_set_timeout_test(elem->timeout); } @@ -58,19 +63,11 @@ list_set_timeout(const struct list_set *map, u32 id) static inline bool list_set_expired(const struct list_set *map, u32 id) { - const struct set_telem *elem = - (const struct set_telem *) list_set_elem(map, id); + const struct set_telem *elem = list_set_telem(map, id); return ip_set_timeout_expired(elem->timeout); } -static inline int -list_set_exist(const struct set_telem *elem) -{ - return elem->id != IPSET_INVALID_ID && - !ip_set_timeout_expired(elem->timeout); -} - /* Set list without and with timeout */ static int @@ -146,11 +143,11 @@ list_elem_tadd(struct list_set *map, u32 i, ip_set_id_t id, struct set_telem *e; for (; i < map->size; i++) { - e = (struct set_telem *)list_set_elem(map, i); + e = list_set_telem(map, i); swap(e->id, id); + swap(e->timeout, timeout); if (e->id == IPSET_INVALID_ID) break; - swap(e->timeout, timeout); } } @@ -164,7 +161,7 @@ list_set_add(struct list_set *map, u32 i, ip_set_id_t id, /* Last element replaced: e.g. add new,before,last */ ip_set_put_byindex(e->id); if (with_timeout(map->timeout)) - list_elem_tadd(map, i, id, timeout); + list_elem_tadd(map, i, id, ip_set_timeout_set(timeout)); else list_elem_add(map, i, id); @@ -172,11 +169,11 @@ list_set_add(struct list_set *map, u32 i, ip_set_id_t id, } static int -list_set_del(struct list_set *map, ip_set_id_t id, u32 i) +list_set_del(struct list_set *map, u32 i) { struct set_elem *a = list_set_elem(map, i), *b; - ip_set_put_byindex(id); + ip_set_put_byindex(a->id); for (; i < map->size - 1; i++) { b = list_set_elem(map, i + 1); @@ -308,11 +305,11 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[], (before == 0 || (before > 0 && next_id_eq(map, i, refid)))) - ret = list_set_del(map, id, i); + ret = list_set_del(map, i); else if (before < 0 && elem->id == refid && next_id_eq(map, i, id)) - ret = list_set_del(map, id, i + 1); + ret = list_set_del(map, i + 1); } break; default: @@ -460,17 +457,15 @@ list_set_gc(unsigned long ul_set) struct list_set *map = set->data; struct set_telem *e; u32 i; - - /* We run parallel with other readers (test element) - * but adding/deleting new entries is locked out */ - read_lock_bh(&set->lock); - for (i = map->size - 1; i >= 0; i--) { - e = (struct set_telem *) list_set_elem(map, i); - if (e->id != IPSET_INVALID_ID && - list_set_expired(map, i)) - list_set_del(map, e->id, i); + + /* nfnl_lock should be called */ + write_lock_bh(&set->lock); + for (i = 0; i < map->size; i++) { + e = list_set_telem(map, i); + if (e->id != IPSET_INVALID_ID && list_set_expired(map, i)) + list_set_del(map, i); } - read_unlock_bh(&set->lock); + write_unlock_bh(&set->lock); map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; add_timer(&map->gc); -- cgit From 2f9f28b212a2bd4948c8ceaaec33ce0123632129 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Mon, 4 Apr 2011 15:19:25 +0200 Subject: netfilter: ipset: references are protected by rwlock instead of mutex The timeout variant of the list:set type must reference the member sets. However, its garbage collector runs at timer interrupt so the mutex protection of the references is a no go. Therefore the reference protection is converted to rwlock. Signed-off-by: Jozsef Kadlecsik Signed-off-by: Patrick McHardy --- net/netfilter/ipset/ip_set_bitmap_ip.c | 3 +- net/netfilter/ipset/ip_set_bitmap_ipmac.c | 3 +- net/netfilter/ipset/ip_set_bitmap_port.c | 3 +- net/netfilter/ipset/ip_set_core.c | 109 ++++++++++++++++++------------ net/netfilter/ipset/ip_set_list_set.c | 6 +- 5 files changed, 71 insertions(+), 53 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index bca96990218d..a113ff066928 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -338,8 +338,7 @@ bitmap_ip_head(struct ip_set *set, struct sk_buff *skb) NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip)); if (map->netmask != 32) NLA_PUT_U8(skb, IPSET_ATTR_NETMASK, map->netmask); - NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, - htonl(atomic_read(&set->ref) - 1)); + NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)); NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE, htonl(sizeof(*map) + map->memsize)); if (with_timeout(map->timeout)) diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 5e790172deff..00a33242e90c 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -434,8 +434,7 @@ bitmap_ipmac_head(struct ip_set *set, struct sk_buff *skb) goto nla_put_failure; NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, htonl(map->first_ip)); NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip)); - NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, - htonl(atomic_read(&set->ref) - 1)); + NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)); NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE, htonl(sizeof(*map) + (map->last_ip - map->first_ip + 1) * map->dsize)); diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index 165f09b1a9cb..6b38eb8f6ed8 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -320,8 +320,7 @@ bitmap_port_head(struct ip_set *set, struct sk_buff *skb) goto nla_put_failure; NLA_PUT_NET16(skb, IPSET_ATTR_PORT, htons(map->first_port)); NLA_PUT_NET16(skb, IPSET_ATTR_PORT_TO, htons(map->last_port)); - NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, - htonl(atomic_read(&set->ref) - 1)); + NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)); NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE, htonl(sizeof(*map) + map->memsize)); if (with_timeout(map->timeout)) diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index d6b48230a540..e88ac3c3ed07 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -26,6 +26,7 @@ static LIST_HEAD(ip_set_type_list); /* all registered set types */ static DEFINE_MUTEX(ip_set_type_mutex); /* protects ip_set_type_list */ +static DEFINE_RWLOCK(ip_set_ref_lock); /* protects the set refs */ static struct ip_set **ip_set_list; /* all individual sets */ static ip_set_id_t ip_set_max = CONFIG_IP_SET_MAX; /* max number of sets */ @@ -301,13 +302,18 @@ EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6); static inline void __ip_set_get(ip_set_id_t index) { - atomic_inc(&ip_set_list[index]->ref); + write_lock_bh(&ip_set_ref_lock); + ip_set_list[index]->ref++; + write_unlock_bh(&ip_set_ref_lock); } static inline void __ip_set_put(ip_set_id_t index) { - atomic_dec(&ip_set_list[index]->ref); + write_lock_bh(&ip_set_ref_lock); + BUG_ON(ip_set_list[index]->ref == 0); + ip_set_list[index]->ref--; + write_unlock_bh(&ip_set_ref_lock); } /* @@ -324,7 +330,7 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb, struct ip_set *set = ip_set_list[index]; int ret = 0; - BUG_ON(set == NULL || atomic_read(&set->ref) == 0); + BUG_ON(set == NULL); pr_debug("set %s, index %u\n", set->name, index); if (dim < set->type->dimension || @@ -356,7 +362,7 @@ ip_set_add(ip_set_id_t index, const struct sk_buff *skb, struct ip_set *set = ip_set_list[index]; int ret; - BUG_ON(set == NULL || atomic_read(&set->ref) == 0); + BUG_ON(set == NULL); pr_debug("set %s, index %u\n", set->name, index); if (dim < set->type->dimension || @@ -378,7 +384,7 @@ ip_set_del(ip_set_id_t index, const struct sk_buff *skb, struct ip_set *set = ip_set_list[index]; int ret = 0; - BUG_ON(set == NULL || atomic_read(&set->ref) == 0); + BUG_ON(set == NULL); pr_debug("set %s, index %u\n", set->name, index); if (dim < set->type->dimension || @@ -397,7 +403,6 @@ EXPORT_SYMBOL_GPL(ip_set_del); * Find set by name, reference it once. The reference makes sure the * thing pointed to, does not go away under our feet. * - * The nfnl mutex must already be activated. */ ip_set_id_t ip_set_get_byname(const char *name, struct ip_set **set) @@ -423,15 +428,12 @@ EXPORT_SYMBOL_GPL(ip_set_get_byname); * reference count by 1. The caller shall not assume the index * to be valid, after calling this function. * - * The nfnl mutex must already be activated. */ void ip_set_put_byindex(ip_set_id_t index) { - if (ip_set_list[index] != NULL) { - BUG_ON(atomic_read(&ip_set_list[index]->ref) == 0); + if (ip_set_list[index] != NULL) __ip_set_put(index); - } } EXPORT_SYMBOL_GPL(ip_set_put_byindex); @@ -441,7 +443,6 @@ EXPORT_SYMBOL_GPL(ip_set_put_byindex); * can't be destroyed. The set cannot be renamed due to * the referencing either. * - * The nfnl mutex must already be activated. */ const char * ip_set_name_byindex(ip_set_id_t index) @@ -449,7 +450,7 @@ ip_set_name_byindex(ip_set_id_t index) const struct ip_set *set = ip_set_list[index]; BUG_ON(set == NULL); - BUG_ON(atomic_read(&set->ref) == 0); + BUG_ON(set->ref == 0); /* Referenced, so it's safe */ return set->name; @@ -515,10 +516,7 @@ void ip_set_nfnl_put(ip_set_id_t index) { nfnl_lock(); - if (ip_set_list[index] != NULL) { - BUG_ON(atomic_read(&ip_set_list[index]->ref) == 0); - __ip_set_put(index); - } + ip_set_put_byindex(index); nfnl_unlock(); } EXPORT_SYMBOL_GPL(ip_set_nfnl_put); @@ -526,7 +524,7 @@ EXPORT_SYMBOL_GPL(ip_set_nfnl_put); /* * Communication protocol with userspace over netlink. * - * We already locked by nfnl_lock. + * The commands are serialized by the nfnl mutex. */ static inline bool @@ -657,7 +655,6 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, return -ENOMEM; rwlock_init(&set->lock); strlcpy(set->name, name, IPSET_MAXNAMELEN); - atomic_set(&set->ref, 0); set->family = family; /* @@ -690,8 +687,8 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb, /* * Here, we have a valid, constructed set and we are protected - * by nfnl_lock. Find the first free index in ip_set_list and - * check clashing. + * by the nfnl mutex. Find the first free index in ip_set_list + * and check clashing. */ if ((ret = find_free_id(set->name, &index, &clash)) != 0) { /* If this is the same set and requested, ignore error */ @@ -751,31 +748,51 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb, const struct nlattr * const attr[]) { ip_set_id_t i; + int ret = 0; if (unlikely(protocol_failed(attr))) return -IPSET_ERR_PROTOCOL; - /* References are protected by the nfnl mutex */ + /* Commands are serialized and references are + * protected by the ip_set_ref_lock. + * External systems (i.e. xt_set) must call + * ip_set_put|get_nfnl_* functions, that way we + * can safely check references here. + * + * list:set timer can only decrement the reference + * counter, so if it's already zero, we can proceed + * without holding the lock. + */ + read_lock_bh(&ip_set_ref_lock); if (!attr[IPSET_ATTR_SETNAME]) { for (i = 0; i < ip_set_max; i++) { - if (ip_set_list[i] != NULL && - (atomic_read(&ip_set_list[i]->ref))) - return -IPSET_ERR_BUSY; + if (ip_set_list[i] != NULL && ip_set_list[i]->ref) { + ret = IPSET_ERR_BUSY; + goto out; + } } + read_unlock_bh(&ip_set_ref_lock); for (i = 0; i < ip_set_max; i++) { if (ip_set_list[i] != NULL) ip_set_destroy_set(i); } } else { i = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME])); - if (i == IPSET_INVALID_ID) - return -ENOENT; - else if (atomic_read(&ip_set_list[i]->ref)) - return -IPSET_ERR_BUSY; + if (i == IPSET_INVALID_ID) { + ret = -ENOENT; + goto out; + } else if (ip_set_list[i]->ref) { + ret = -IPSET_ERR_BUSY; + goto out; + } + read_unlock_bh(&ip_set_ref_lock); ip_set_destroy_set(i); } return 0; +out: + read_unlock_bh(&ip_set_ref_lock); + return ret; } /* Flush sets */ @@ -834,6 +851,7 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set; const char *name2; ip_set_id_t i; + int ret = 0; if (unlikely(protocol_failed(attr) || attr[IPSET_ATTR_SETNAME] == NULL || @@ -843,25 +861,33 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb, set = find_set(nla_data(attr[IPSET_ATTR_SETNAME])); if (set == NULL) return -ENOENT; - if (atomic_read(&set->ref) != 0) - return -IPSET_ERR_REFERENCED; + + read_lock_bh(&ip_set_ref_lock); + if (set->ref != 0) { + ret = -IPSET_ERR_REFERENCED; + goto out; + } name2 = nla_data(attr[IPSET_ATTR_SETNAME2]); for (i = 0; i < ip_set_max; i++) { if (ip_set_list[i] != NULL && - STREQ(ip_set_list[i]->name, name2)) - return -IPSET_ERR_EXIST_SETNAME2; + STREQ(ip_set_list[i]->name, name2)) { + ret = -IPSET_ERR_EXIST_SETNAME2; + goto out; + } } strncpy(set->name, name2, IPSET_MAXNAMELEN); - return 0; +out: + read_unlock_bh(&ip_set_ref_lock); + return ret; } /* Swap two sets so that name/index points to the other. * References and set names are also swapped. * - * We are protected by the nfnl mutex and references are - * manipulated only by holding the mutex. The kernel interfaces + * The commands are serialized by the nfnl mutex and references are + * protected by the ip_set_ref_lock. The kernel interfaces * do not hold the mutex but the pointer settings are atomic * so the ip_set_list always contains valid pointers to the sets. */ @@ -874,7 +900,6 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb, struct ip_set *from, *to; ip_set_id_t from_id, to_id; char from_name[IPSET_MAXNAMELEN]; - u32 from_ref; if (unlikely(protocol_failed(attr) || attr[IPSET_ATTR_SETNAME] == NULL || @@ -899,17 +924,15 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb, from->type->family == to->type->family)) return -IPSET_ERR_TYPE_MISMATCH; - /* No magic here: ref munging protected by the nfnl_lock */ strncpy(from_name, from->name, IPSET_MAXNAMELEN); - from_ref = atomic_read(&from->ref); - strncpy(from->name, to->name, IPSET_MAXNAMELEN); - atomic_set(&from->ref, atomic_read(&to->ref)); strncpy(to->name, from_name, IPSET_MAXNAMELEN); - atomic_set(&to->ref, from_ref); + write_lock_bh(&ip_set_ref_lock); + swap(from->ref, to->ref); ip_set_list[from_id] = to; ip_set_list[to_id] = from; + write_unlock_bh(&ip_set_ref_lock); return 0; } @@ -926,7 +949,7 @@ ip_set_dump_done(struct netlink_callback *cb) { if (cb->args[2]) { pr_debug("release set %s\n", ip_set_list[cb->args[1]]->name); - __ip_set_put((ip_set_id_t) cb->args[1]); + ip_set_put_byindex((ip_set_id_t) cb->args[1]); } return 0; } @@ -1068,7 +1091,7 @@ release_refcount: /* If there was an error or set is done, release set */ if (ret || !cb->args[2]) { pr_debug("release set %s\n", ip_set_list[index]->name); - __ip_set_put(index); + ip_set_put_byindex(index); } /* If we dump all sets, continue with dumping last ones */ diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index f4a46c0d25f3..e9159e99fc4b 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -366,8 +366,7 @@ list_set_head(struct ip_set *set, struct sk_buff *skb) NLA_PUT_NET32(skb, IPSET_ATTR_SIZE, htonl(map->size)); if (with_timeout(map->timeout)) NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout)); - NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, - htonl(atomic_read(&set->ref) - 1)); + NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)); NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE, htonl(sizeof(*map) + map->size * map->dsize)); ipset_nest_end(skb, nested); @@ -457,8 +456,7 @@ list_set_gc(unsigned long ul_set) struct list_set *map = set->data; struct set_telem *e; u32 i; - - /* nfnl_lock should be called */ + write_lock_bh(&set->lock); for (i = 0; i < map->size; i++) { e = list_set_telem(map, i); -- cgit From b4232a22776aa5d063f890d21ca69870dbbe431b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 4 Apr 2011 15:21:02 +0200 Subject: netfilter: h323: bug in parsing of ASN1 SEQOF field Static analyzer of clang found a dead store which appears to be a bug in reading count of items in SEQOF field, only the lower byte of word is stored. This may lead to corrupted read and communication shutdown. The bug has been in the module since it's first inclusion into linux kernel. [Patrick: the bug is real, but without practical consequence since the largest amount of sequence-of members we parse is 30.] Signed-off-by: David Sterba Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_h323_asn1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c index 867882313e49..bcd5ed6b7130 100644 --- a/net/netfilter/nf_conntrack_h323_asn1.c +++ b/net/netfilter/nf_conntrack_h323_asn1.c @@ -631,7 +631,7 @@ static int decode_seqof(bitstr_t *bs, const struct field_t *f, CHECK_BOUND(bs, 2); count = *bs->cur++; count <<= 8; - count = *bs->cur++; + count += *bs->cur++; break; case SEMI: BYTE_ALIGN(bs); -- cgit From a09d19779f3ffac6e16821accc2c1cc4df1b643a Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Mon, 4 Apr 2011 15:25:18 +0200 Subject: IPVS: fix NULL ptr dereference in ip_vs_ctl.c ip_vs_genl_dump_daemons() ipvsadm -ln --daemon will trigger a Null pointer exception because ip_vs_genl_dump_daemons() uses skb_net() instead of skb_sknet(). To prevent others from NULL ptr a check is made in ip_vs.h skb_net(). Signed-off-by: Hans Schillstrom Signed-off-by: Simon Horman Signed-off-by: Patrick McHardy --- net/netfilter/ipvs/ip_vs_ctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 33733c8872e7..ae47090bf45f 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3120,7 +3120,7 @@ nla_put_failure: static int ip_vs_genl_dump_daemons(struct sk_buff *skb, struct netlink_callback *cb) { - struct net *net = skb_net(skb); + struct net *net = skb_sknet(skb); struct netns_ipvs *ipvs = net_ipvs(net); mutex_lock(&__ip_vs_mutex); -- cgit From 31ad3dd64e689bc79dd819f8f134b9b025240eb8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 Apr 2011 16:56:29 +0200 Subject: netfilter: af_info: add network namespace parameter to route hook This is required to eventually replace the rt6_lookup call in xt_addrtype.c with nf_afinfo->route(). Signed-off-by: Florian Westphal Acked-by: David S. Miller Signed-off-by: Patrick McHardy --- net/ipv4/netfilter.c | 5 +++-- net/ipv6/netfilter.c | 5 +++-- net/netfilter/nf_conntrack_h323_main.c | 8 ++++---- net/netfilter/xt_TCPMSS.c | 2 +- 4 files changed, 11 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index f3c0b549b8e1..f1035f056503 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -221,9 +221,10 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, return csum; } -static int nf_ip_route(struct dst_entry **dst, struct flowi *fl) +static int nf_ip_route(struct net *net, struct dst_entry **dst, + struct flowi *fl) { - struct rtable *rt = ip_route_output_key(&init_net, &fl->u.ip4); + struct rtable *rt = ip_route_output_key(net, &fl->u.ip4); if (IS_ERR(rt)) return PTR_ERR(rt); *dst = &rt->dst; diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 39aaca2b4fd2..e008b9b4a779 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -90,9 +90,10 @@ static int nf_ip6_reroute(struct sk_buff *skb, return 0; } -static int nf_ip6_route(struct dst_entry **dst, struct flowi *fl) +static int nf_ip6_route(struct net *net, struct dst_entry **dst, + struct flowi *fl) { - *dst = ip6_route_output(&init_net, NULL, &fl->u.ip6); + *dst = ip6_route_output(net, NULL, &fl->u.ip6); return (*dst)->error; } diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 533a183e6661..39a453895b4d 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -731,9 +731,9 @@ static int callforward_do_filter(const union nf_inet_addr *src, memset(&fl2, 0, sizeof(fl2)); fl2.daddr = dst->ip; - if (!afinfo->route((struct dst_entry **)&rt1, + if (!afinfo->route(&init_net, (struct dst_entry **)&rt1, flowi4_to_flowi(&fl1))) { - if (!afinfo->route((struct dst_entry **)&rt2, + if (!afinfo->route(&init_net, (struct dst_entry **)&rt2, flowi4_to_flowi(&fl2))) { if (rt1->rt_gateway == rt2->rt_gateway && rt1->dst.dev == rt2->dst.dev) @@ -755,9 +755,9 @@ static int callforward_do_filter(const union nf_inet_addr *src, memset(&fl2, 0, sizeof(fl2)); ipv6_addr_copy(&fl2.daddr, &dst->in6); - if (!afinfo->route((struct dst_entry **)&rt1, + if (!afinfo->route(&init_net, (struct dst_entry **)&rt1, flowi6_to_flowi(&fl1))) { - if (!afinfo->route((struct dst_entry **)&rt2, + if (!afinfo->route(&init_net, (struct dst_entry **)&rt2, flowi6_to_flowi(&fl2))) { if (!memcmp(&rt1->rt6i_gateway, &rt2->rt6i_gateway, sizeof(rt1->rt6i_gateway)) && diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index 6e6b46cb1db9..8690125e3b18 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -166,7 +166,7 @@ static u_int32_t tcpmss_reverse_mtu(const struct sk_buff *skb, rcu_read_lock(); ai = nf_get_afinfo(family); if (ai != NULL) - ai->route((struct dst_entry **)&rt, &fl); + ai->route(&init_net, (struct dst_entry **)&rt, &fl); rcu_read_unlock(); if (rt != NULL) { -- cgit From 0fae2e7740aca7e384c5f337f458897e7e337d58 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 Apr 2011 17:00:54 +0200 Subject: netfilter: af_info: add 'strict' parameter to limit lookup to .oif ipv6 fib lookup can set RT6_LOOKUP_F_IFACE flag to restrict search to an interface, but this flag cannot be set via struct flowi. Also, it cannot be set via ip6_route_output: this function uses the passed sock struct to determine if this flag is required (by testing for nonzero sk_bound_dev_if). Work around this by passing in an artificial struct sk in case 'strict' argument is true. This is required to replace the rt6_lookup call in xt_addrtype.c with nf_afinfo->route(). Signed-off-by: Florian Westphal Acked-by: David S. Miller Signed-off-by: Patrick McHardy --- net/ipv4/netfilter.c | 2 +- net/ipv6/netfilter.c | 12 ++++++++++-- net/netfilter/nf_conntrack_h323_main.c | 8 ++++---- net/netfilter/xt_TCPMSS.c | 2 +- 4 files changed, 16 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index f1035f056503..4614babdc45f 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -222,7 +222,7 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, } static int nf_ip_route(struct net *net, struct dst_entry **dst, - struct flowi *fl) + struct flowi *fl, bool strict __always_unused) { struct rtable *rt = ip_route_output_key(net, &fl->u.ip4); if (IS_ERR(rt)) diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index e008b9b4a779..28bc1f644b7b 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -91,9 +91,17 @@ static int nf_ip6_reroute(struct sk_buff *skb, } static int nf_ip6_route(struct net *net, struct dst_entry **dst, - struct flowi *fl) + struct flowi *fl, bool strict) { - *dst = ip6_route_output(net, NULL, &fl->u.ip6); + static const struct ipv6_pinfo fake_pinfo; + static const struct inet_sock fake_sk = { + /* makes ip6_route_output set RT6_LOOKUP_F_IFACE: */ + .sk.sk_bound_dev_if = 1, + .pinet6 = (struct ipv6_pinfo *) &fake_pinfo, + }; + const void *sk = strict ? &fake_sk : NULL; + + *dst = ip6_route_output(net, sk, &fl->u.ip6); return (*dst)->error; } diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 39a453895b4d..18b2ce5c8ced 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -732,9 +732,9 @@ static int callforward_do_filter(const union nf_inet_addr *src, memset(&fl2, 0, sizeof(fl2)); fl2.daddr = dst->ip; if (!afinfo->route(&init_net, (struct dst_entry **)&rt1, - flowi4_to_flowi(&fl1))) { + flowi4_to_flowi(&fl1), false)) { if (!afinfo->route(&init_net, (struct dst_entry **)&rt2, - flowi4_to_flowi(&fl2))) { + flowi4_to_flowi(&fl2), false)) { if (rt1->rt_gateway == rt2->rt_gateway && rt1->dst.dev == rt2->dst.dev) ret = 1; @@ -756,9 +756,9 @@ static int callforward_do_filter(const union nf_inet_addr *src, memset(&fl2, 0, sizeof(fl2)); ipv6_addr_copy(&fl2.daddr, &dst->in6); if (!afinfo->route(&init_net, (struct dst_entry **)&rt1, - flowi6_to_flowi(&fl1))) { + flowi6_to_flowi(&fl1), false)) { if (!afinfo->route(&init_net, (struct dst_entry **)&rt2, - flowi6_to_flowi(&fl2))) { + flowi6_to_flowi(&fl2), false)) { if (!memcmp(&rt1->rt6i_gateway, &rt2->rt6i_gateway, sizeof(rt1->rt6i_gateway)) && rt1->dst.dev == rt2->dst.dev) diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index 8690125e3b18..9e63b43faeed 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -166,7 +166,7 @@ static u_int32_t tcpmss_reverse_mtu(const struct sk_buff *skb, rcu_read_lock(); ai = nf_get_afinfo(family); if (ai != NULL) - ai->route(&init_net, (struct dst_entry **)&rt, &fl); + ai->route(&init_net, (struct dst_entry **)&rt, &fl, false); rcu_read_unlock(); if (rt != NULL) { -- cgit From b7225041e93f81e7e38fcdf27fc82044e7695efd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 Apr 2011 17:01:43 +0200 Subject: netfilter: xt_addrtype: replace rt6_lookup with nf_afinfo->route This avoids pulling in the ipv6 module when using (ipv4-only) iptables -m addrtype. Signed-off-by: Florian Westphal Acked-by: David S. Miller Signed-off-by: Patrick McHardy --- net/netfilter/Kconfig | 1 - net/netfilter/xt_addrtype.c | 42 ++++++++++++++++++++++++++++-------------- 2 files changed, 28 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index c3f988aa1152..32bff6d86cb2 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -652,7 +652,6 @@ comment "Xtables matches" config NETFILTER_XT_MATCH_ADDRTYPE tristate '"addrtype" address type match support' depends on NETFILTER_ADVANCED - depends on (IPV6 || IPV6=n) ---help--- This option allows you to match what routing thinks of an address, eg. UNICAST, LOCAL, BROADCAST, ... diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c index 2220b85e9519..b77d383cec78 100644 --- a/net/netfilter/xt_addrtype.c +++ b/net/netfilter/xt_addrtype.c @@ -32,11 +32,32 @@ MODULE_ALIAS("ipt_addrtype"); MODULE_ALIAS("ip6t_addrtype"); #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) -static u32 xt_addrtype_rt6_to_type(const struct rt6_info *rt) +static u32 match_lookup_rt6(struct net *net, const struct net_device *dev, + const struct in6_addr *addr) { + const struct nf_afinfo *afinfo; + struct flowi6 flow; + struct rt6_info *rt; u32 ret; + int route_err; - if (!rt) + memset(&flow, 0, sizeof(flow)); + ipv6_addr_copy(&flow.daddr, addr); + if (dev) + flow.flowi6_oif = dev->ifindex; + + rcu_read_lock(); + + afinfo = nf_get_afinfo(NFPROTO_IPV6); + if (afinfo != NULL) + route_err = afinfo->route(net, (struct dst_entry **)&rt, + flowi6_to_flowi(&flow), !!dev); + else + route_err = 1; + + rcu_read_unlock(); + + if (route_err) return XT_ADDRTYPE_UNREACHABLE; if (rt->rt6i_flags & RTF_REJECT) @@ -48,6 +69,9 @@ static u32 xt_addrtype_rt6_to_type(const struct rt6_info *rt) ret |= XT_ADDRTYPE_LOCAL; if (rt->rt6i_flags & RTF_ANYCAST) ret |= XT_ADDRTYPE_ANYCAST; + + + dst_release(&rt->dst); return ret; } @@ -65,18 +89,8 @@ static bool match_type6(struct net *net, const struct net_device *dev, return false; if ((XT_ADDRTYPE_LOCAL | XT_ADDRTYPE_ANYCAST | - XT_ADDRTYPE_UNREACHABLE) & mask) { - struct rt6_info *rt; - u32 type; - int ifindex = dev ? dev->ifindex : 0; - - rt = rt6_lookup(net, addr, NULL, ifindex, !!dev); - - type = xt_addrtype_rt6_to_type(rt); - - dst_release(&rt->dst); - return !!(mask & type); - } + XT_ADDRTYPE_UNREACHABLE) & mask) + return !!(mask & match_lookup_rt6(net, dev, addr)); return true; } -- cgit From 96120d86fe302c006259baee9061eea9e1b9e486 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 Apr 2011 17:06:21 +0200 Subject: netfilter: xt_conntrack: fix inverted conntrack direction test --ctdir ORIGINAL matches REPLY packets, and vv: userspace sets "invert_flags &= ~XT_CONNTRACK_DIRECTION" in ORIGINAL case. Thus: (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) ^ !!(info->invert_flags & XT_CONNTRACK_DIRECTION)) yields "1 ^ 0", which is true -> returns false. Reproducer: iptables -I OUTPUT 1 -p tcp --syn -m conntrack --ctdir ORIGINAL Signed-off-by: Florian Westphal Signed-off-by: Patrick McHardy --- net/netfilter/xt_conntrack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index 2c0086a4751e..481a86fdc409 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -195,7 +195,7 @@ conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par, return info->match_flags & XT_CONNTRACK_STATE; if ((info->match_flags & XT_CONNTRACK_DIRECTION) && (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) ^ - !!(info->invert_flags & XT_CONNTRACK_DIRECTION)) + !(info->invert_flags & XT_CONNTRACK_DIRECTION)) return false; if (info->match_flags & XT_CONNTRACK_ORIGSRC) -- cgit