From 6f6c7006755b667f9f6c1f3b6f08cd65f75cc471 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 17 Jan 2011 20:34:08 -0800 Subject: libceph: fix osd request queuing on osdmap updates If we send a request to osd A, and the request's pg remaps to osd B and then back to A in quick succession, we need to resend the request to A. The old code was only calling kick_requests after processing all incremental maps in a message, so it was very possible to not resend a request that needed to be resent. This would make the osd eventually time out (at least with the current default of osd timeouts enabled). The correct approach is to scan requests on every map incremental. This patch refactors the kick code in a few ways: - all requests are either on req_lru (in flight), req_unsent (ready to send), or req_notarget (currently map to no up osd) - mapping always done by map_request (previous map_osds) - if the mapping changes, we requeue. requests are resent only after all map incrementals are processed. - some osd reset code is moved out of kick_requests into a separate function - the "kick this osd" functionality is moved to kick_osd_requests, as it is unrelated to scanning for request->pg->osd mapping changes Signed-off-by: Sage Weil --- include/linux/ceph/osd_client.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index a1af29648fb5..e791b8e46353 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -74,7 +74,6 @@ struct ceph_osd_request { char r_oid[40]; /* object name */ int r_oid_len; unsigned long r_stamp; /* send OR check time */ - bool r_resend; /* msg send failed, needs retry */ struct ceph_file_layout r_file_layout; struct ceph_snap_context *r_snapc; /* snap context for writes */ @@ -104,7 +103,9 @@ struct ceph_osd_client { u64 timeout_tid; /* tid of timeout triggering rq */ u64 last_tid; /* tid of last request */ struct rb_root requests; /* pending requests */ - struct list_head req_lru; /* pending requests lru */ + struct list_head req_lru; /* in-flight lru */ + struct list_head req_unsent; /* unsent/need-resend queue */ + struct list_head req_notarget; /* map to no osd */ int num_requests; struct delayed_work timeout_work; struct delayed_work osds_timeout_work; -- cgit From 483fac71485e5063ff4033b6dc7d91567f1b6ff1 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Thu, 20 Jan 2011 16:36:06 -0800 Subject: ceph: update common header files This updates the common header files used by the different ceph related modules. Specifically it adds definitions required by the rbd watch/notify feature. Signed-off-by: Yehuda Sadeh --- include/linux/ceph/ceph_fs.h | 19 +++++++++++++++---- include/linux/ceph/rados.h | 39 ++++++++++++++++++++++++++++++--------- 2 files changed, 45 insertions(+), 13 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 09dcc0c2ffd5..b8e995fbd867 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -136,9 +136,18 @@ struct ceph_dir_layout { /* osd */ -#define CEPH_MSG_OSD_MAP 41 -#define CEPH_MSG_OSD_OP 42 -#define CEPH_MSG_OSD_OPREPLY 43 +#define CEPH_MSG_OSD_MAP 41 +#define CEPH_MSG_OSD_OP 42 +#define CEPH_MSG_OSD_OPREPLY 43 +#define CEPH_MSG_WATCH_NOTIFY 44 + + +/* watch-notify operations */ +enum { + WATCH_NOTIFY = 1, /* notifying watcher */ + WATCH_NOTIFY_COMPLETE = 2, /* notifier notified when done */ +}; + /* pool operations */ enum { @@ -213,8 +222,10 @@ struct ceph_client_mount { struct ceph_mon_request_header monhdr; } __attribute__ ((packed)); +#define CEPH_SUBSCRIBE_ONETIME 1 /* i want only 1 update after have */ + struct ceph_mon_subscribe_item { - __le64 have_version; __le64 have; + __le64 have_version; __le64 have; __u8 onetime; } __attribute__ ((packed)); diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 6d5247f2e81b..0a99099801a4 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -12,9 +12,9 @@ * osdmap encoding versions */ #define CEPH_OSDMAP_INC_VERSION 5 -#define CEPH_OSDMAP_INC_VERSION_EXT 5 +#define CEPH_OSDMAP_INC_VERSION_EXT 6 #define CEPH_OSDMAP_VERSION 5 -#define CEPH_OSDMAP_VERSION_EXT 5 +#define CEPH_OSDMAP_VERSION_EXT 6 /* * fs id @@ -181,9 +181,17 @@ enum { /* read */ CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1, CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2, + CEPH_OSD_OP_MAPEXT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 3, /* fancy read */ - CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4, + CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4, + CEPH_OSD_OP_SPARSE_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 5, + + CEPH_OSD_OP_NOTIFY = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 6, + CEPH_OSD_OP_NOTIFY_ACK = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 7, + + /* versioning */ + CEPH_OSD_OP_ASSERT_VER = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 8, /* write */ CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1, @@ -205,6 +213,8 @@ enum { CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13, CEPH_OSD_OP_ROLLBACK= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 14, + CEPH_OSD_OP_WATCH = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 15, + /** attrs **/ /* read */ CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, @@ -218,11 +228,14 @@ enum { CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4, /** subop **/ - CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1, - CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2, - CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3, - CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4, - CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5, + CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1, + CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2, + CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3, + CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4, + CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5, + CEPH_OSD_OP_SCRUB_RESERVE = CEPH_OSD_OP_MODE_SUB | 6, + CEPH_OSD_OP_SCRUB_UNRESERVE = CEPH_OSD_OP_MODE_SUB | 7, + CEPH_OSD_OP_SCRUB_STOP = CEPH_OSD_OP_MODE_SUB | 8, /** lock **/ CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1, @@ -328,6 +341,8 @@ enum { CEPH_OSD_CMPXATTR_MODE_U64 = 2 }; +#define RADOS_NOTIFY_VER 1 + /* * an individual object operation. each may be accompanied by some data * payload @@ -359,7 +374,12 @@ struct ceph_osd_op { struct { __le64 snapid; } __attribute__ ((packed)) snap; - }; + struct { + __le64 cookie; + __le64 ver; + __u8 flag; /* 0 = unwatch, 1 = watch */ + } __attribute__ ((packed)) watch; +}; __le32 payload_len; } __attribute__ ((packed)); @@ -402,4 +422,5 @@ struct ceph_osd_reply_head { } __attribute__ ((packed)); + #endif -- cgit From 80456f8672f7e69d05c01627da03587dc1ea1603 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 10 Mar 2011 13:33:26 -0800 Subject: ceph: move readahead default to fs/ceph from libceph Signed-off-by: Sage Weil --- include/linux/ceph/libceph.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 72c72bfccb88..0d2e0fffb470 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -71,7 +71,6 @@ struct ceph_options { #define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */ #define CEPH_OSD_KEEPALIVE_DEFAULT 5 #define CEPH_OSD_IDLE_TTL_DEFAULT 60 -#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */ #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) #define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) -- cgit From a40c4f10e3fb96030358e49abd010c1f08446fa3 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Mon, 21 Mar 2011 15:07:16 -0700 Subject: libceph: add lingering request and watch/notify event framework Lingering requests are requests that are sent to the OSD normally but tracked also after we get a successful request. This keeps the OSD connection open and resends the original request if the object moves to another OSD. The OSD can then send notification messages back to us if another client initiates a notify. This framework will be used by RBD so that the client gets notification when a snapshot is created by another node or tool. Signed-off-by: Yehuda Sadeh Signed-off-by: Sage Weil --- include/linux/ceph/osd_client.h | 52 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index e791b8e46353..f88eacb111d4 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -32,6 +32,7 @@ struct ceph_osd { struct rb_node o_node; struct ceph_connection o_con; struct list_head o_requests; + struct list_head o_linger_requests; struct list_head o_osd_lru; struct ceph_authorizer *o_authorizer; void *o_authorizer_buf, *o_authorizer_reply_buf; @@ -47,6 +48,8 @@ struct ceph_osd_request { struct rb_node r_node; struct list_head r_req_lru_item; struct list_head r_osd_item; + struct list_head r_linger_item; + struct list_head r_linger_osd; struct ceph_osd *r_osd; struct ceph_pg r_pgid; int r_pg_osds[CEPH_PG_MAX_SIZE]; @@ -59,6 +62,7 @@ struct ceph_osd_request { int r_flags; /* any additional flags for the osd */ u32 r_sent; /* >0 if r_request is sending/sent */ int r_got_reply; + int r_linger; struct ceph_osd_client *r_osdc; struct kref r_kref; @@ -89,6 +93,26 @@ struct ceph_osd_request { struct ceph_pagelist *r_trail; /* trailing part of the data */ }; +struct ceph_osd_event { + u64 cookie; + int one_shot; + struct ceph_osd_client *osdc; + void (*cb)(u64, u64, u8, void *); + void *data; + struct rb_node node; + struct list_head osd_node; + struct kref kref; + struct completion completion; +}; + +struct ceph_osd_event_work { + struct work_struct work; + struct ceph_osd_event *event; + u64 ver; + u64 notify_id; + u8 opcode; +}; + struct ceph_osd_client { struct ceph_client *client; @@ -106,6 +130,7 @@ struct ceph_osd_client { struct list_head req_lru; /* in-flight lru */ struct list_head req_unsent; /* unsent/need-resend queue */ struct list_head req_notarget; /* map to no osd */ + struct list_head req_linger; /* lingering requests */ int num_requests; struct delayed_work timeout_work; struct delayed_work osds_timeout_work; @@ -117,6 +142,12 @@ struct ceph_osd_client { struct ceph_msgpool msgpool_op; struct ceph_msgpool msgpool_op_reply; + + spinlock_t event_lock; + struct rb_root event_tree; + u64 event_count; + + struct workqueue_struct *notify_wq; }; struct ceph_osd_req_op { @@ -151,6 +182,13 @@ struct ceph_osd_req_op { struct { u64 snapid; } snap; + struct { + u64 cookie; + u64 ver; + __u8 flag; + u32 prot_ver; + u32 timeout; + } watch; }; u32 payload_len; }; @@ -199,6 +237,11 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, bool use_mempool, int num_reply, int page_align); +extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, + struct ceph_osd_request *req); +extern void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req); + static inline void ceph_osdc_get_request(struct ceph_osd_request *req) { kref_get(&req->r_kref); @@ -234,5 +277,14 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct page **pages, int nr_pages, int flags, int do_sync, bool nofail); +/* watch/notify events */ +extern int ceph_osdc_create_event(struct ceph_osd_client *osdc, + void (*event_cb)(u64, u64, u8, void *), + int one_shot, void *data, + struct ceph_osd_event **pevent); +extern void ceph_osdc_cancel_event(struct ceph_osd_event *event); +extern int ceph_osdc_wait_event(struct ceph_osd_event *event, + unsigned long timeout); +extern void ceph_osdc_put_event(struct ceph_osd_event *event); #endif -- cgit