 -rw-r--r--  Documentation/md-cluster.txt     | 176
 -rw-r--r--  drivers/md/Kconfig               |  16
 -rw-r--r--  drivers/md/Makefile              |   1
 -rw-r--r--  drivers/md/bitmap.c              | 189
 -rw-r--r--  drivers/md/bitmap.h              |  10
 -rw-r--r--  drivers/md/md-cluster.c          | 965
 -rw-r--r--  drivers/md/md-cluster.h          |  29
 -rw-r--r--  drivers/md/md.c                  | 353
 -rw-r--r--  drivers/md/md.h                  |  24
 -rw-r--r--  drivers/md/raid1.c               |  20
 -rw-r--r--  include/uapi/linux/raid/md_p.h   |   7
 -rw-r--r--  include/uapi/linux/raid/md_u.h   |   1
12 files changed, 1709 insertions(+), 82 deletions(-)
diff --git a/Documentation/md-cluster.txt b/Documentation/md-cluster.txt
new file mode 100644
index 000000000000..de1af7db3355
--- /dev/null
+++ b/Documentation/md-cluster.txt
@@ -0,0 +1,176 @@
+The cluster MD is a shared-device RAID for a cluster.
+
+
+1. On-disk format
+
+A separate write-intent bitmap is used for each cluster node.
+The bitmaps record all writes that may have been started on that node,
+and may not yet have finished. The on-disk layout is:
+
+0                    4k                     8k                    12k
+-------------------------------------------------------------------
+| idle                | md super            | bm super [0] + bits |
+| bm bits[0, contd]   | bm super[1] + bits  | bm bits[1, contd]   |
+| bm super[2] + bits  | bm bits [2, contd]  | bm super[3] + bits  |
+| bm bits [3, contd]  |                     |                     |
+
+During "normal" functioning we assume the filesystem ensures that only one
+node writes to any given block at a time, so a write
+request will
+ - set the appropriate bit (if not already set)
+ - commit the write to all mirrors
+ - schedule the bit to be cleared after a timeout.
+
+Reads are just handled normally. It is up to the filesystem to
+ensure one node doesn't read from a location where another node (or the same
+node) is writing.
+
+
+2. DLM Locks for management
+
+The following lock resources are used for managing the device:
+
+2.1 Bitmap lock resource (bm_lockres)
+
+ The bm_lockres protects individual node bitmaps. They are named in the
+ form bitmap001 for node 1, bitmap002 for node 2, and so on. When a node
+ joins the cluster, it acquires the lock in PW mode and holds it for as
+ long as the node is part of the cluster. The lock resource
+ number is based on the slot number returned by the DLM subsystem. Since
+ DLM starts the node count from one and bitmap slots start from zero, one is
+ subtracted from the DLM slot number to arrive at the bitmap slot number.
+
+3. Communication
+
+Each node has to communicate with the other nodes when starting or ending
+a resync, and when updating the metadata superblock.
+
+3.1 Message Types
+
+ There are three types of messages which are passed:
+
+ 3.1.1 METADATA_UPDATED: informs other nodes that the metadata has been
+   updated, and the node must re-read the md superblock. This is performed
+   synchronously.
+
+ 3.1.2 RESYNC: informs other nodes that a resync is initiated or ended
+   so that each node may suspend or resume the region.
+
+ 3.1.3 NEWDISK: informs other nodes that a device is being added to the
+   array, carrying the device uuid and the slot number (see section 5).
+
+3.2 Communication mechanism
+
+ The DLM LVB is used to communicate between the nodes of the cluster. There
+ are three resources used for the purpose:
+
+  3.2.1 Token: The resource which protects the entire communication
+   system. The node holding the token resource is allowed to
+   communicate.
+
+  3.2.2 Message: The lock resource which carries the data to
+   communicate.
+
+  3.2.3 Ack: The resource which, when acquired, means the message has been
+   acknowledged by all nodes in the cluster. The BAST of the resource
+   is used to inform the receiving node that a node wants to communicate.
+
+The algorithm is:
+
+ 1. receive status
+
+   sender                         receiver                   receiver
+   ACK:CR                          ACK:CR                     ACK:CR
+
+ 2. sender gets EX of TOKEN
+    sender gets EX of MESSAGE
+    sender                        receiver                 receiver
+    TOKEN:EX                       ACK:CR                   ACK:CR
+    MESSAGE:EX
+    ACK:CR
+
+    Sender checks that it still needs to send a message. Messages received
+    or other events that happened while waiting for the TOKEN may have made
+    this message inappropriate or redundant.
+
+ 3. sender writes LVB.
+    sender down-converts MESSAGE from EX to CR
+    sender tries to get EX of ACK
+    [ wait until all receivers have *processed* the MESSAGE ]
+
+                                     [ triggered by bast of ACK ]
+                                     receiver gets CR of MESSAGE
+                                     receiver reads LVB
+                                     receiver processes the message
+                                     [ wait finish ]
+                                     receiver releases ACK
+
+   sender                         receiver                   receiver
+   TOKEN:EX                       MESSAGE:CR                 MESSAGE:CR
+   MESSAGE:CR
+   ACK:EX
+
+ 4. triggered by grant of EX on ACK (indicating all receivers have processed
+    the message)
+    sender down-converts ACK from EX to CR
+    sender releases MESSAGE
+    sender releases TOKEN
+                               receiver up-converts MESSAGE to EX
+                               receiver gets CR of ACK
+                               receiver releases MESSAGE
+
+   sender                      receiver                   receiver
+   ACK:CR                       ACK:CR                     ACK:CR
+
+4. Handling Failures
+
+4.1 Node Failure
+ When a node fails, the DLM informs the cluster with the slot number. The
+ node starts a cluster recovery thread. The cluster recovery thread:
+	- acquires the bitmap<number> lock of the failed node
+	- opens the bitmap
+	- reads the bitmap of the failed node
+	- copies the set bits to the local node's bitmap
+	- cleans the bitmap of the failed node
+	- releases the bitmap<number> lock of the failed node
+	- initiates resync of the bitmap on the current node
+
+ The resync process is the regular md resync. However, in a clustered
+ environment, when a resync is performed, the resyncing node needs to tell
+ the other nodes of the areas which are suspended. Before a resync starts,
+ the node sends out RESYNC_START with the (lo,hi) range of the area which
+ needs to be suspended. Each node maintains a suspend_list, which contains
+ the list of ranges which are currently suspended. On receiving
+ RESYNC_START, the node adds the range to the suspend_list. Similarly,
+ when the node performing the resync finishes, it sends RESYNC_FINISHED
+ to the other nodes, and the other nodes remove the corresponding entry
+ from the suspend_list.
+
+ A helper function, should_suspend(), can be used to check whether a
+ particular I/O range should be suspended or not; a sketch of that check
+ appears below.
+
+4.2 Device Failure
+ Device failures are handled and communicated with the metadata update
+ routine.
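As a concrete sketch of the should_suspend() check from 4.1, assuming the suspend_list entries carry the (lo,hi) ranges described above, a simple overlap test against the list is all that is needed (md-cluster.c in this patch implements the equivalent check as area_resyncing()):

	/* Illustrative sketch only: returns 1 if I/O to [lo, hi) must wait
	 * because it overlaps a range some node is currently resyncing.
	 */
	static int should_suspend(struct md_cluster_info *cinfo,
				  sector_t lo, sector_t hi)
	{
		struct suspend_info *s;
		int ret = 0;

		spin_lock_irq(&cinfo->suspend_lock);
		list_for_each_entry(s, &cinfo->suspend_list, list)
			if (hi > s->lo && lo < s->hi) {	/* ranges overlap */
				ret = 1;
				break;
			}
		spin_unlock_irq(&cinfo->suspend_lock);
		return ret;
	}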
+
+5. Adding a new Device
+For adding a new device, it is necessary that all nodes "see" the new device
+to be added. For this, the following algorithm is used:
+
+    1. Node 1 issues mdadm --manage /dev/mdX --add /dev/sdYY which issues
+       ioctl(ADD_NEW_DISK with disc.state set to MD_DISK_CLUSTER_ADD)
+    2. Node 1 sends NEWDISK with uuid and slot number
+    3. Other nodes issue kobject_uevent_env with uuid and slot number
+       (Steps 4,5 could be a udev rule)
+    4. In userspace, the node searches for the disk, perhaps
+       using blkid -t SUB_UUID=""
+    5. Other nodes issue either of the following depending on whether the disk
+       was found:
+       ioctl(ADD_NEW_DISK with disc.state set to MD_DISK_CANDIDATE and
+                disc.number set to slot number)
+       ioctl(CLUSTERED_DISK_NACK)
+    6. Other nodes drop the lock on no-new-dev (CR) if the device is found
+    7. Node 1 attempts an EX lock on no-new-dev
+    8. If node 1 gets the lock, it sends METADATA_UPDATED after unmarking the
+       disk as SpareLocal
+    9. If it cannot get the no-new-dev lock, it fails the operation and sends
+       METADATA_UPDATED
+    10. Other nodes learn whether the disk was added or not from the
+	following METADATA_UPDATED
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 63e05e32b462..eed1fec2d97b 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -175,6 +175,22 @@ config MD_FAULTY
 
 	  In unsure, say N.
 
+
+config MD_CLUSTER
+	tristate "Cluster Support for MD (EXPERIMENTAL)"
+	depends on BLK_DEV_MD
+	depends on DLM
+	default n
+	---help---
+	Clustering support for MD devices. This enables locking and
+	synchronization across multiple systems on the cluster, so all
+	nodes in the cluster can access the MD devices simultaneously.
+
+	This brings the redundancy (and uptime) of RAID levels across the
+	nodes of the cluster.
+
+	If unsure, say N.
+
 source "drivers/md/bcache/Kconfig"
 
 config BLK_DEV_DM_BUILTIN
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index a2da532b1c2b..7ed86876f3b7 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_MD_RAID10)		+= raid10.o
 obj-$(CONFIG_MD_RAID456)	+= raid456.o
 obj-$(CONFIG_MD_MULTIPATH)	+= multipath.o
 obj-$(CONFIG_MD_FAULTY)		+= faulty.o
+obj-$(CONFIG_MD_CLUSTER)	+= md-cluster.o
 obj-$(CONFIG_BCACHE)		+= bcache/
 obj-$(CONFIG_BLK_DEV_MD)	+= md-mod.o
 obj-$(CONFIG_BLK_DEV_DM)	+= dm-mod.o
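A good share of the bitmap.c changes below compute where a given slot's bitmap lives in the on-disk layout from section 1 of the documentation. As a rough sketch of that arithmetic (slot_bitmap_offset() is a name invented here; bitmap_read_sb() below performs the same computation inline):

	/* Each slot's superblock + bits are rounded up to whole 4K blocks,
	 * so slot N starts N * (bm_blocks << 3) sectors past the first
	 * bitmap (one 4K block = 8 sectors of 512 bytes).
	 */
	static sector_t slot_bitmap_offset(sector_t base, int slot,
					   sector_t sync_size,
					   unsigned long chunksize)
	{
		sector_t bm_blocks = sync_size;

		/* number of bitmap chunks needed to cover the device */
		sector_div(bm_blocks, chunksize >> 9);
		/* bits to bytes, plus the per-slot bitmap superblock */
		bm_blocks = ((bm_blocks + 7) >> 3) + sizeof(bitmap_super_t);
		/* bytes to 4K blocks */
		bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096);
		return base + slot * (bm_blocks << 3);
	}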
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 3a5767968ba0..2bc56e2a3526 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -205,6 +205,10 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 	struct block_device *bdev;
 	struct mddev *mddev = bitmap->mddev;
 	struct bitmap_storage *store = &bitmap->storage;
+	int node_offset = 0;
+
+	if (mddev_is_clustered(bitmap->mddev))
+		node_offset = bitmap->cluster_slot * store->file_pages;
 
 	while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
 		int size = PAGE_SIZE;
@@ -433,6 +437,7 @@ void bitmap_update_sb(struct bitmap *bitmap)
 	/* This might have been changed by a reshape */
 	sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
 	sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize);
+	sb->nodes = cpu_to_le32(bitmap->mddev->bitmap_info.nodes);
 	sb->sectors_reserved = cpu_to_le32(bitmap->mddev->
 					   bitmap_info.space);
 	kunmap_atomic(sb);
@@ -544,6 +549,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	bitmap_super_t *sb;
 	unsigned long chunksize, daemon_sleep, write_behind;
 	unsigned long long events;
+	int nodes = 0;
 	unsigned long sectors_reserved = 0;
 	int err = -EINVAL;
 	struct page *sb_page;
@@ -562,6 +568,22 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 		return -ENOMEM;
 	bitmap->storage.sb_page = sb_page;
 
+re_read:
+	/* If cluster_slot is set, the cluster is set up */
+	if (bitmap->cluster_slot >= 0) {
+		sector_t bm_blocks = bitmap->mddev->resync_max_sectors;
+
+		sector_div(bm_blocks,
+			   bitmap->mddev->bitmap_info.chunksize >> 9);
+		/* bits to bytes */
+		bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t);
+		/* to 4k blocks */
+		bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096);
+		bitmap->mddev->bitmap_info.offset += bitmap->cluster_slot * (bm_blocks << 3);
+		pr_info("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__,
+			bitmap->cluster_slot, (unsigned long long)bitmap->mddev->bitmap_info.offset);
+	}
+
 	if (bitmap->storage.file) {
 		loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host);
 		int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize;
@@ -577,12 +599,15 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	if (err)
 		return err;
 
+	err = -EINVAL;
 	sb = kmap_atomic(sb_page);
 
 	chunksize = le32_to_cpu(sb->chunksize);
 	daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
 	write_behind = le32_to_cpu(sb->write_behind);
 	sectors_reserved = le32_to_cpu(sb->sectors_reserved);
+	nodes = le32_to_cpu(sb->nodes);
+	strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64);
 
 	/* verify that the bitmap-specific fields are valid */
 	if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
@@ -619,7 +644,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 			goto out;
 		}
 		events = le64_to_cpu(sb->events);
-		if (events < bitmap->mddev->events) {
+		if (!nodes && (events < bitmap->mddev->events)) {
 			printk(KERN_INFO
 			       "%s: bitmap file is out of date (%llu < %llu) "
 			       "-- forcing full recovery\n",
@@ -634,20 +659,40 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
 		set_bit(BITMAP_HOSTENDIAN, &bitmap->flags);
 	bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
+	strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64);
 	err = 0;
+
 out:
 	kunmap_atomic(sb);
+	/* Assigning chunksize is required for "re_read" */
+	bitmap->mddev->bitmap_info.chunksize = chunksize;
+	if (nodes && (bitmap->cluster_slot < 0)) {
+		err = md_setup_cluster(bitmap->mddev, nodes);
+		if (err) {
+			pr_err("%s: Could not setup cluster service (%d)\n",
+					bmname(bitmap), err);
+			goto out_no_sb;
+		}
+		bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev);
+		goto re_read;
+	}
+
+
 out_no_sb:
 	if (test_bit(BITMAP_STALE, &bitmap->flags))
 		bitmap->events_cleared = bitmap->mddev->events;
 	bitmap->mddev->bitmap_info.chunksize = chunksize;
 	bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
 	bitmap->mddev->bitmap_info.max_write_behind = write_behind;
+	bitmap->mddev->bitmap_info.nodes = nodes;
 	if (bitmap->mddev->bitmap_info.space == 0 ||
 	    bitmap->mddev->bitmap_info.space > sectors_reserved)
 		bitmap->mddev->bitmap_info.space = sectors_reserved;
-	if (err)
+	if (err) {
 		bitmap_print_sb(bitmap);
+		if (bitmap->cluster_slot < 0)
+			md_cluster_stop(bitmap->mddev);
+	}
 	return err;
 }
 
@@ -692,9 +737,10 @@ static inline struct page *filemap_get_page(struct bitmap_storage *store,
 }
 
 static int bitmap_storage_alloc(struct bitmap_storage *store,
-				unsigned long chunks, int with_super)
+				unsigned long chunks, int with_super,
+				int slot_number)
 {
-	int pnum;
+	int pnum, offset = 0;
 	unsigned long num_pages;
 	unsigned long bytes;
 
@@ -703,6 +749,7 @@ static int bitmap_storage_alloc(struct bitmap_storage *store,
 		bytes += sizeof(bitmap_super_t);
 
 	num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE);
+	offset = slot_number * (num_pages - 1);
 
 	store->filemap = kmalloc(sizeof(struct page *)
 				 * num_pages, GFP_KERNEL);
@@ -713,20 +760,22 @@ static int bitmap_storage_alloc(struct bitmap_storage *store,
 		store->sb_page = 
alloc_page(GFP_KERNEL|__GFP_ZERO);  		if (store->sb_page == NULL)  			return -ENOMEM; -		store->sb_page->index = 0;  	} +  	pnum = 0;  	if (store->sb_page) {  		store->filemap[0] = store->sb_page;  		pnum = 1; +		store->sb_page->index = offset;  	} +  	for ( ; pnum < num_pages; pnum++) {  		store->filemap[pnum] = alloc_page(GFP_KERNEL|__GFP_ZERO);  		if (!store->filemap[pnum]) {  			store->file_pages = pnum;  			return -ENOMEM;  		} -		store->filemap[pnum]->index = pnum; +		store->filemap[pnum]->index = pnum + offset;  	}  	store->file_pages = pnum; @@ -885,6 +934,28 @@ static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)  	}  } +static int bitmap_file_test_bit(struct bitmap *bitmap, sector_t block) +{ +	unsigned long bit; +	struct page *page; +	void *paddr; +	unsigned long chunk = block >> bitmap->counts.chunkshift; +	int set = 0; + +	page = filemap_get_page(&bitmap->storage, chunk); +	if (!page) +		return -EINVAL; +	bit = file_page_offset(&bitmap->storage, chunk); +	paddr = kmap_atomic(page); +	if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) +		set = test_bit(bit, paddr); +	else +		set = test_bit_le(bit, paddr); +	kunmap_atomic(paddr); +	return set; +} + +  /* this gets called when the md device is ready to unplug its underlying   * (slave) device queues -- before we let any writes go down, we need to   * sync the dirty pages of the bitmap file to disk */ @@ -935,7 +1006,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n   */  static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)  { -	unsigned long i, chunks, index, oldindex, bit; +	unsigned long i, chunks, index, oldindex, bit, node_offset = 0;  	struct page *page = NULL;  	unsigned long bit_cnt = 0;  	struct file *file; @@ -981,6 +1052,9 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)  	if (!bitmap->mddev->bitmap_info.external)  		offset = sizeof(bitmap_super_t); +	if (mddev_is_clustered(bitmap->mddev)) +		node_offset = bitmap->cluster_slot * (DIV_ROUND_UP(store->bytes, PAGE_SIZE)); +  	for (i = 0; i < chunks; i++) {  		int b;  		index = file_page_index(&bitmap->storage, i); @@ -1001,7 +1075,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)  					bitmap->mddev,  					bitmap->mddev->bitmap_info.offset,  					page, -					index, count); +					index + node_offset, count);  			if (ret)  				goto err; @@ -1207,7 +1281,6 @@ void bitmap_daemon_work(struct mddev *mddev)  	     j < bitmap->storage.file_pages  		     && !test_bit(BITMAP_STALE, &bitmap->flags);  	     j++) { -  		if (test_page_attr(bitmap, j,  				   BITMAP_PAGE_DIRTY))  			/* bitmap_unplug will handle the rest */ @@ -1530,11 +1603,13 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n  		return;  	}  	if (!*bmc) { -		*bmc = 2 | (needed ? NEEDED_MASK : 0); +		*bmc = 2;  		bitmap_count_page(&bitmap->counts, offset, 1);  		bitmap_set_pending(&bitmap->counts, offset);  		bitmap->allclean = 0;  	} +	if (needed) +		*bmc |= NEEDED_MASK;  	spin_unlock_irq(&bitmap->counts.lock);  } @@ -1591,6 +1666,10 @@ static void bitmap_free(struct bitmap *bitmap)  	if (!bitmap) /* there was no bitmap */  		return; +	if (mddev_is_clustered(bitmap->mddev) && bitmap->mddev->cluster_info && +		bitmap->cluster_slot == md_cluster_ops->slot_number(bitmap->mddev)) +		md_cluster_stop(bitmap->mddev); +  	/* Shouldn't be needed - but just in case.... 
*/  	wait_event(bitmap->write_wait,  		   atomic_read(&bitmap->pending_writes) == 0); @@ -1636,7 +1715,7 @@ void bitmap_destroy(struct mddev *mddev)   * initialize the bitmap structure   * if this returns an error, bitmap_destroy must be called to do clean up   */ -int bitmap_create(struct mddev *mddev) +struct bitmap *bitmap_create(struct mddev *mddev, int slot)  {  	struct bitmap *bitmap;  	sector_t blocks = mddev->resync_max_sectors; @@ -1650,7 +1729,7 @@ int bitmap_create(struct mddev *mddev)  	bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);  	if (!bitmap) -		return -ENOMEM; +		return ERR_PTR(-ENOMEM);  	spin_lock_init(&bitmap->counts.lock);  	atomic_set(&bitmap->pending_writes, 0); @@ -1659,6 +1738,7 @@ int bitmap_create(struct mddev *mddev)  	init_waitqueue_head(&bitmap->behind_wait);  	bitmap->mddev = mddev; +	bitmap->cluster_slot = slot;  	if (mddev->kobj.sd)  		bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap"); @@ -1706,12 +1786,14 @@ int bitmap_create(struct mddev *mddev)  	printk(KERN_INFO "created bitmap (%lu pages) for device %s\n",  	       bitmap->counts.pages, bmname(bitmap)); -	mddev->bitmap = bitmap; -	return test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0; +	err = test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0; +	if (err) +		goto error; +	return bitmap;   error:  	bitmap_free(bitmap); -	return err; +	return ERR_PTR(err);  }  int bitmap_load(struct mddev *mddev) @@ -1765,6 +1847,60 @@ out:  }  EXPORT_SYMBOL_GPL(bitmap_load); +/* Loads the bitmap associated with slot and copies the resync information + * to our bitmap + */ +int bitmap_copy_from_slot(struct mddev *mddev, int slot, +		sector_t *low, sector_t *high, bool clear_bits) +{ +	int rv = 0, i, j; +	sector_t block, lo = 0, hi = 0; +	struct bitmap_counts *counts; +	struct bitmap *bitmap = bitmap_create(mddev, slot); + +	if (IS_ERR(bitmap)) +		return PTR_ERR(bitmap); + +	rv = bitmap_read_sb(bitmap); +	if (rv) +		goto err; + +	rv = bitmap_init_from_disk(bitmap, 0); +	if (rv) +		goto err; + +	counts = &bitmap->counts; +	for (j = 0; j < counts->chunks; j++) { +		block = (sector_t)j << counts->chunkshift; +		if (bitmap_file_test_bit(bitmap, block)) { +			if (!lo) +				lo = block; +			hi = block; +			bitmap_file_clear_bit(bitmap, block); +			bitmap_set_memory_bits(mddev->bitmap, block, 1); +			bitmap_file_set_bit(mddev->bitmap, block); +		} +	} + +	if (clear_bits) { +		bitmap_update_sb(bitmap); +		/* Setting this for the ev_page should be enough. 
+		 * And we do not require both write_all and BITMAP_PAGE_DIRTY either
+		 */
+		for (i = 0; i < bitmap->storage.file_pages; i++)
+			set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
+		bitmap_write_all(bitmap);
+		bitmap_unplug(bitmap);
+	}
+	*low = lo;
+	*high = hi;
+err:
+	bitmap_free(bitmap);
+	return rv;
+}
+EXPORT_SYMBOL_GPL(bitmap_copy_from_slot);
+
+
 void bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
 {
 	unsigned long chunk_kb;
@@ -1849,7 +1985,8 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
 	memset(&store, 0, sizeof(store));
 	if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file)
 		ret = bitmap_storage_alloc(&store, chunks,
-					   !bitmap->mddev->bitmap_info.external);
+					   !bitmap->mddev->bitmap_info.external,
+					   bitmap->cluster_slot);
 	if (ret)
 		goto err;
@@ -2021,13 +2158,18 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
 				return -EINVAL;
 			mddev->bitmap_info.offset = offset;
 			if (mddev->pers) {
+				struct bitmap *bitmap;
 				mddev->pers->quiesce(mddev, 1);
-				rv = bitmap_create(mddev);
-				if (!rv)
+				bitmap = bitmap_create(mddev, -1);
+				if (IS_ERR(bitmap))
+					rv = PTR_ERR(bitmap);
+				else {
+					mddev->bitmap = bitmap;
 					rv = bitmap_load(mddev);
-				if (rv) {
-					bitmap_destroy(mddev);
-					mddev->bitmap_info.offset = 0;
+					if (rv) {
+						bitmap_destroy(mddev);
+						mddev->bitmap_info.offset = 0;
+					}
 				}
 				mddev->pers->quiesce(mddev, 0);
 				if (rv)
@@ -2186,6 +2328,8 @@ __ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store);
 
 static ssize_t metadata_show(struct mddev *mddev, char *page)
 {
+	if (mddev_is_clustered(mddev))
+		return sprintf(page, "clustered\n");
 	return sprintf(page, "%s\n", (mddev->bitmap_info.external
 				      ? "external" : "internal"));
 }
@@ -2198,7 +2342,8 @@ static ssize_t metadata_store(struct mddev *mddev, const char *buf, size_t len)
 		return -EBUSY;
 	if (strncmp(buf, "external", 8) == 0)
 		mddev->bitmap_info.external = 1;
-	else if (strncmp(buf, "internal", 8) == 0)
+	else if ((strncmp(buf, "internal", 8) == 0) ||
+			(strncmp(buf, "clustered", 9) == 0))
 		mddev->bitmap_info.external = 0;
 	else
 		return -EINVAL;
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index 30210b9c4ef9..f1f4dd01090d 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -130,8 +130,9 @@ typedef struct bitmap_super_s {
 	__le32 write_behind; /* 60  number of outstanding write-behind writes */
 	__le32 sectors_reserved; /* 64 number of 512-byte sectors that are
 				  * reserved for the bitmap. */
-
-	__u8  pad[256 - 68]; /* set to zero */
+	__le32 nodes;        /* 68 the maximum number of nodes in cluster.
*/ +	__u8 cluster_name[64]; /* 72 cluster name to which this md belongs */ +	__u8  pad[256 - 136]; /* set to zero */  } bitmap_super_t;  /* notes: @@ -226,12 +227,13 @@ struct bitmap {  	wait_queue_head_t behind_wait;  	struct kernfs_node *sysfs_can_clear; +	int cluster_slot;		/* Slot offset for clustered env */  };  /* the bitmap API */  /* these are used only by md/bitmap */ -int  bitmap_create(struct mddev *mddev); +struct bitmap *bitmap_create(struct mddev *mddev, int slot);  int bitmap_load(struct mddev *mddev);  void bitmap_flush(struct mddev *mddev);  void bitmap_destroy(struct mddev *mddev); @@ -260,6 +262,8 @@ void bitmap_daemon_work(struct mddev *mddev);  int bitmap_resize(struct bitmap *bitmap, sector_t blocks,  		  int chunksize, int init); +int bitmap_copy_from_slot(struct mddev *mddev, int slot, +				sector_t *lo, sector_t *hi, bool clear_bits);  #endif  #endif diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c new file mode 100644 index 000000000000..fcfc4b9b2672 --- /dev/null +++ b/drivers/md/md-cluster.c @@ -0,0 +1,965 @@ +/* + * Copyright (C) 2015, SUSE + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + */ + + +#include <linux/module.h> +#include <linux/dlm.h> +#include <linux/sched.h> +#include <linux/raid/md_p.h> +#include "md.h" +#include "bitmap.h" +#include "md-cluster.h" + +#define LVB_SIZE	64 +#define NEW_DEV_TIMEOUT 5000 + +struct dlm_lock_resource { +	dlm_lockspace_t *ls; +	struct dlm_lksb lksb; +	char *name; /* lock name. */ +	uint32_t flags; /* flags to pass to dlm_lock() */ +	struct completion completion; /* completion for synchronized locking */ +	void (*bast)(void *arg, int mode); /* blocking AST function pointer*/ +	struct mddev *mddev; /* pointing back to mddev. */ +}; + +struct suspend_info { +	int slot; +	sector_t lo; +	sector_t hi; +	struct list_head list; +}; + +struct resync_info { +	__le64 lo; +	__le64 hi; +}; + +/* md_cluster_info flags */ +#define		MD_CLUSTER_WAITING_FOR_NEWDISK		1 + + +struct md_cluster_info { +	/* dlm lock space and resources for clustered raid. 
*/
+	dlm_lockspace_t *lockspace;
+	int slot_number;
+	struct completion completion;
+	struct dlm_lock_resource *sb_lock;
+	struct mutex sb_mutex;
+	struct dlm_lock_resource *bitmap_lockres;
+	struct list_head suspend_list;
+	spinlock_t suspend_lock;
+	struct md_thread *recovery_thread;
+	unsigned long recovery_map;
+	/* communication lock resources */
+	struct dlm_lock_resource *ack_lockres;
+	struct dlm_lock_resource *message_lockres;
+	struct dlm_lock_resource *token_lockres;
+	struct dlm_lock_resource *no_new_dev_lockres;
+	struct md_thread *recv_thread;
+	struct completion newdisk_completion;
+	unsigned long state;
+};
+
+enum msg_type {
+	METADATA_UPDATED = 0,
+	RESYNCING,
+	NEWDISK,
+	REMOVE,
+	RE_ADD,
+};
+
+struct cluster_msg {
+	int type;
+	int slot;
+	/* TODO: Unionize this for smaller footprint */
+	sector_t low;
+	sector_t high;
+	char uuid[16];
+	int raid_slot;
+};
+
+static void sync_ast(void *arg)
+{
+	struct dlm_lock_resource *res;
+
+	res = (struct dlm_lock_resource *) arg;
+	complete(&res->completion);
+}
+
+static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
+{
+	int ret = 0;
+
+	init_completion(&res->completion);
+	ret = dlm_lock(res->ls, mode, &res->lksb,
+			res->flags, res->name, strlen(res->name),
+			0, sync_ast, res, res->bast);
+	if (ret)
+		return ret;
+	wait_for_completion(&res->completion);
+	return res->lksb.sb_status;
+}
+
+static int dlm_unlock_sync(struct dlm_lock_resource *res)
+{
+	return dlm_lock_sync(res, DLM_LOCK_NL);
+}
+
+static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
+		char *name, void (*bastfn)(void *arg, int mode), int with_lvb)
+{
+	struct dlm_lock_resource *res = NULL;
+	int ret, namelen;
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+
+	res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
+	if (!res)
+		return NULL;
+	res->ls = cinfo->lockspace;
+	res->mddev = mddev;
+	namelen = strlen(name);
+	res->name = kzalloc(namelen + 1, GFP_KERNEL);
+	if (!res->name) {
+		pr_err("md-cluster: Unable to allocate resource name for resource %s\n", name);
+		goto out_err;
+	}
+	strlcpy(res->name, name, namelen + 1);
+	if (with_lvb) {
+		res->lksb.sb_lvbptr = kzalloc(LVB_SIZE, GFP_KERNEL);
+		if (!res->lksb.sb_lvbptr) {
+			pr_err("md-cluster: Unable to allocate LVB for resource %s\n", name);
+			goto out_err;
+		}
+		res->flags = DLM_LKF_VALBLK;
+	}
+
+	if (bastfn)
+		res->bast = bastfn;
+
+	res->flags |= DLM_LKF_EXPEDITE;
+
+	ret = dlm_lock_sync(res, DLM_LOCK_NL);
+	if (ret) {
+		pr_err("md-cluster: Unable to lock NL on new lock resource %s\n", name);
+		goto out_err;
+	}
+	res->flags &= ~DLM_LKF_EXPEDITE;
+	res->flags |= DLM_LKF_CONVERT;
+
+	return res;
+out_err:
+	kfree(res->lksb.sb_lvbptr);
+	kfree(res->name);
+	kfree(res);
+	return NULL;
+}
+
+static void lockres_free(struct dlm_lock_resource *res)
+{
+	if (!res)
+		return;
+
+	init_completion(&res->completion);
+	dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res);
+	wait_for_completion(&res->completion);
+
+	kfree(res->name);
+	kfree(res->lksb.sb_lvbptr);
+	kfree(res);
+}
+
+static char *pretty_uuid(char *dest, char *src)
+{
+	int i, len = 0;
+
+	for (i = 0; i < 16; i++) {
+		if (i == 4 || i == 6 || i == 8 || i == 10)
+			len += sprintf(dest + len, "-");
+		len += sprintf(dest + len, "%02x", (__u8)src[i]);
+	}
+	return dest;
+}
+
+static void add_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres,
+		sector_t lo, sector_t hi)
+{
+	struct resync_info *ri;
+
+	ri = (struct resync_info *)lockres->lksb.sb_lvbptr;
+	ri->lo = 
cpu_to_le64(lo); +	ri->hi = cpu_to_le64(hi); +} + +static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres) +{ +	struct resync_info ri; +	struct suspend_info *s = NULL; +	sector_t hi = 0; + +	dlm_lock_sync(lockres, DLM_LOCK_CR); +	memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info)); +	hi = le64_to_cpu(ri.hi); +	if (ri.hi > 0) { +		s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL); +		if (!s) +			goto out; +		s->hi = hi; +		s->lo = le64_to_cpu(ri.lo); +	} +	dlm_unlock_sync(lockres); +out: +	return s; +} + +static void recover_bitmaps(struct md_thread *thread) +{ +	struct mddev *mddev = thread->mddev; +	struct md_cluster_info *cinfo = mddev->cluster_info; +	struct dlm_lock_resource *bm_lockres; +	char str[64]; +	int slot, ret; +	struct suspend_info *s, *tmp; +	sector_t lo, hi; + +	while (cinfo->recovery_map) { +		slot = fls64((u64)cinfo->recovery_map) - 1; + +		/* Clear suspend_area associated with the bitmap */ +		spin_lock_irq(&cinfo->suspend_lock); +		list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list) +			if (slot == s->slot) { +				list_del(&s->list); +				kfree(s); +			} +		spin_unlock_irq(&cinfo->suspend_lock); + +		snprintf(str, 64, "bitmap%04d", slot); +		bm_lockres = lockres_init(mddev, str, NULL, 1); +		if (!bm_lockres) { +			pr_err("md-cluster: Cannot initialize bitmaps\n"); +			goto clear_bit; +		} + +		ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); +		if (ret) { +			pr_err("md-cluster: Could not DLM lock %s: %d\n", +					str, ret); +			goto clear_bit; +		} +		ret = bitmap_copy_from_slot(mddev, slot, &lo, &hi, true); +		if (ret) { +			pr_err("md-cluster: Could not copy data from bitmap %d\n", slot); +			goto dlm_unlock; +		} +		if (hi > 0) { +			/* TODO:Wait for current resync to get over */ +			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); +			if (lo < mddev->recovery_cp) +				mddev->recovery_cp = lo; +			md_check_recovery(mddev); +		} +dlm_unlock: +		dlm_unlock_sync(bm_lockres); +clear_bit: +		clear_bit(slot, &cinfo->recovery_map); +	} +} + +static void recover_prep(void *arg) +{ +} + +static void recover_slot(void *arg, struct dlm_slot *slot) +{ +	struct mddev *mddev = arg; +	struct md_cluster_info *cinfo = mddev->cluster_info; + +	pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n", +			mddev->bitmap_info.cluster_name, +			slot->nodeid, slot->slot, +			cinfo->slot_number); +	set_bit(slot->slot - 1, &cinfo->recovery_map); +	if (!cinfo->recovery_thread) { +		cinfo->recovery_thread = md_register_thread(recover_bitmaps, +				mddev, "recover"); +		if (!cinfo->recovery_thread) { +			pr_warn("md-cluster: Could not create recovery thread\n"); +			return; +		} +	} +	md_wakeup_thread(cinfo->recovery_thread); +} + +static void recover_done(void *arg, struct dlm_slot *slots, +		int num_slots, int our_slot, +		uint32_t generation) +{ +	struct mddev *mddev = arg; +	struct md_cluster_info *cinfo = mddev->cluster_info; + +	cinfo->slot_number = our_slot; +	complete(&cinfo->completion); +} + +static const struct dlm_lockspace_ops md_ls_ops = { +	.recover_prep = recover_prep, +	.recover_slot = recover_slot, +	.recover_done = recover_done, +}; + +/* + * The BAST function for the ack lock resource + * This function wakes up the receive thread in + * order to receive and process the message. 
+ */ +static void ack_bast(void *arg, int mode) +{ +	struct dlm_lock_resource *res = (struct dlm_lock_resource *)arg; +	struct md_cluster_info *cinfo = res->mddev->cluster_info; + +	if (mode == DLM_LOCK_EX) +		md_wakeup_thread(cinfo->recv_thread); +} + +static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot) +{ +	struct suspend_info *s, *tmp; + +	list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list) +		if (slot == s->slot) { +			pr_info("%s:%d Deleting suspend_info: %d\n", +					__func__, __LINE__, slot); +			list_del(&s->list); +			kfree(s); +			break; +		} +} + +static void remove_suspend_info(struct md_cluster_info *cinfo, int slot) +{ +	spin_lock_irq(&cinfo->suspend_lock); +	__remove_suspend_info(cinfo, slot); +	spin_unlock_irq(&cinfo->suspend_lock); +} + + +static void process_suspend_info(struct md_cluster_info *cinfo, +		int slot, sector_t lo, sector_t hi) +{ +	struct suspend_info *s; + +	if (!hi) { +		remove_suspend_info(cinfo, slot); +		return; +	} +	s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL); +	if (!s) +		return; +	s->slot = slot; +	s->lo = lo; +	s->hi = hi; +	spin_lock_irq(&cinfo->suspend_lock); +	/* Remove existing entry (if exists) before adding */ +	__remove_suspend_info(cinfo, slot); +	list_add(&s->list, &cinfo->suspend_list); +	spin_unlock_irq(&cinfo->suspend_lock); +} + +static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) +{ +	char disk_uuid[64]; +	struct md_cluster_info *cinfo = mddev->cluster_info; +	char event_name[] = "EVENT=ADD_DEVICE"; +	char raid_slot[16]; +	char *envp[] = {event_name, disk_uuid, raid_slot, NULL}; +	int len; + +	len = snprintf(disk_uuid, 64, "DEVICE_UUID="); +	pretty_uuid(disk_uuid + len, cmsg->uuid); +	snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot); +	pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot); +	init_completion(&cinfo->newdisk_completion); +	set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state); +	kobject_uevent_env(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE, envp); +	wait_for_completion_timeout(&cinfo->newdisk_completion, +			NEW_DEV_TIMEOUT); +	clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state); +} + + +static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg) +{ +	struct md_cluster_info *cinfo = mddev->cluster_info; + +	md_reload_sb(mddev); +	dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); +} + +static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg) +{ +	struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot); + +	if (rdev) +		md_kick_rdev_from_array(rdev); +	else +		pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", __func__, __LINE__, msg->raid_slot); +} + +static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg) +{ +	struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot); + +	if (rdev && test_bit(Faulty, &rdev->flags)) +		clear_bit(Faulty, &rdev->flags); +	else +		pr_warn("%s: %d Could not find disk(%d) which is faulty", __func__, __LINE__, msg->raid_slot); +} + +static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) +{ +	switch (msg->type) { +	case METADATA_UPDATED: +		pr_info("%s: %d Received message: METADATA_UPDATE from %d\n", +			__func__, __LINE__, msg->slot); +		process_metadata_update(mddev, msg); +		break; +	case RESYNCING: +		pr_info("%s: %d Received message: RESYNCING from %d\n", +			__func__, __LINE__, msg->slot); +		process_suspend_info(mddev->cluster_info, msg->slot, +				
msg->low, msg->high); +		break; +	case NEWDISK: +		pr_info("%s: %d Received message: NEWDISK from %d\n", +			__func__, __LINE__, msg->slot); +		process_add_new_disk(mddev, msg); +		break; +	case REMOVE: +		pr_info("%s: %d Received REMOVE from %d\n", +			__func__, __LINE__, msg->slot); +		process_remove_disk(mddev, msg); +		break; +	case RE_ADD: +		pr_info("%s: %d Received RE_ADD from %d\n", +			__func__, __LINE__, msg->slot); +		process_readd_disk(mddev, msg); +		break; +	default: +		pr_warn("%s:%d Received unknown message from %d\n", +			__func__, __LINE__, msg->slot); +	} +} + +/* + * thread for receiving message + */ +static void recv_daemon(struct md_thread *thread) +{ +	struct md_cluster_info *cinfo = thread->mddev->cluster_info; +	struct dlm_lock_resource *ack_lockres = cinfo->ack_lockres; +	struct dlm_lock_resource *message_lockres = cinfo->message_lockres; +	struct cluster_msg msg; + +	/*get CR on Message*/ +	if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) { +		pr_err("md/raid1:failed to get CR on MESSAGE\n"); +		return; +	} + +	/* read lvb and wake up thread to process this message_lockres */ +	memcpy(&msg, message_lockres->lksb.sb_lvbptr, sizeof(struct cluster_msg)); +	process_recvd_msg(thread->mddev, &msg); + +	/*release CR on ack_lockres*/ +	dlm_unlock_sync(ack_lockres); +	/*up-convert to EX on message_lockres*/ +	dlm_lock_sync(message_lockres, DLM_LOCK_EX); +	/*get CR on ack_lockres again*/ +	dlm_lock_sync(ack_lockres, DLM_LOCK_CR); +	/*release CR on message_lockres*/ +	dlm_unlock_sync(message_lockres); +} + +/* lock_comm() + * Takes the lock on the TOKEN lock resource so no other + * node can communicate while the operation is underway. + */ +static int lock_comm(struct md_cluster_info *cinfo) +{ +	int error; + +	error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); +	if (error) +		pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n", +				__func__, __LINE__, error); +	return error; +} + +static void unlock_comm(struct md_cluster_info *cinfo) +{ +	dlm_unlock_sync(cinfo->token_lockres); +} + +/* __sendmsg() + * This function performs the actual sending of the message. This function is + * usually called after performing the encompassing operation + * The function: + * 1. Grabs the message lockresource in EX mode + * 2. Copies the message to the message LVB + * 3. Downconverts message lockresource to CR + * 4. Upconverts ack lock resource from CR to EX. This forces the BAST on other nodes + *    and the other nodes read the message. The thread will wait here until all other + *    nodes have released ack lock resource. + * 5. 
Downconvert ack lockresource to CR + */ +static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg) +{ +	int error; +	int slot = cinfo->slot_number - 1; + +	cmsg->slot = cpu_to_le32(slot); +	/*get EX on Message*/ +	error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_EX); +	if (error) { +		pr_err("md-cluster: failed to get EX on MESSAGE (%d)\n", error); +		goto failed_message; +	} + +	memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg, +			sizeof(struct cluster_msg)); +	/*down-convert EX to CR on Message*/ +	error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CR); +	if (error) { +		pr_err("md-cluster: failed to convert EX to CR on MESSAGE(%d)\n", +				error); +		goto failed_message; +	} + +	/*up-convert CR to EX on Ack*/ +	error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_EX); +	if (error) { +		pr_err("md-cluster: failed to convert CR to EX on ACK(%d)\n", +				error); +		goto failed_ack; +	} + +	/*down-convert EX to CR on Ack*/ +	error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR); +	if (error) { +		pr_err("md-cluster: failed to convert EX to CR on ACK(%d)\n", +				error); +		goto failed_ack; +	} + +failed_ack: +	dlm_unlock_sync(cinfo->message_lockres); +failed_message: +	return error; +} + +static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg) +{ +	int ret; + +	lock_comm(cinfo); +	ret = __sendmsg(cinfo, cmsg); +	unlock_comm(cinfo); +	return ret; +} + +static int gather_all_resync_info(struct mddev *mddev, int total_slots) +{ +	struct md_cluster_info *cinfo = mddev->cluster_info; +	int i, ret = 0; +	struct dlm_lock_resource *bm_lockres; +	struct suspend_info *s; +	char str[64]; + + +	for (i = 0; i < total_slots; i++) { +		memset(str, '\0', 64); +		snprintf(str, 64, "bitmap%04d", i); +		bm_lockres = lockres_init(mddev, str, NULL, 1); +		if (!bm_lockres) +			return -ENOMEM; +		if (i == (cinfo->slot_number - 1)) +			continue; + +		bm_lockres->flags |= DLM_LKF_NOQUEUE; +		ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); +		if (ret == -EAGAIN) { +			memset(bm_lockres->lksb.sb_lvbptr, '\0', LVB_SIZE); +			s = read_resync_info(mddev, bm_lockres); +			if (s) { +				pr_info("%s:%d Resync[%llu..%llu] in progress on %d\n", +						__func__, __LINE__, +						(unsigned long long) s->lo, +						(unsigned long long) s->hi, i); +				spin_lock_irq(&cinfo->suspend_lock); +				s->slot = i; +				list_add(&s->list, &cinfo->suspend_list); +				spin_unlock_irq(&cinfo->suspend_lock); +			} +			ret = 0; +			lockres_free(bm_lockres); +			continue; +		} +		if (ret) +			goto out; +		/* TODO: Read the disk bitmap sb and check if it needs recovery */ +		dlm_unlock_sync(bm_lockres); +		lockres_free(bm_lockres); +	} +out: +	return ret; +} + +static int join(struct mddev *mddev, int nodes) +{ +	struct md_cluster_info *cinfo; +	int ret, ops_rv; +	char str[64]; + +	if (!try_module_get(THIS_MODULE)) +		return -ENOENT; + +	cinfo = kzalloc(sizeof(struct md_cluster_info), GFP_KERNEL); +	if (!cinfo) +		return -ENOMEM; + +	init_completion(&cinfo->completion); + +	mutex_init(&cinfo->sb_mutex); +	mddev->cluster_info = cinfo; + +	memset(str, 0, 64); +	pretty_uuid(str, mddev->uuid); +	ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name, +				DLM_LSFL_FS, LVB_SIZE, +				&md_ls_ops, mddev, &ops_rv, &cinfo->lockspace); +	if (ret) +		goto err; +	wait_for_completion(&cinfo->completion); +	if (nodes < cinfo->slot_number) { +		pr_err("md-cluster: Slot allotted(%d) is greater than available slots(%d).", +			cinfo->slot_number, nodes); +		ret = -ERANGE; +		goto err; +	} +	
cinfo->sb_lock = lockres_init(mddev, "cmd-super",
+					NULL, 0);
+	if (!cinfo->sb_lock) {
+		ret = -ENOMEM;
+		goto err;
+	}
+	/* Initiate the communication resources */
+	ret = -ENOMEM;
+	cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv");
+	if (!cinfo->recv_thread) {
+		pr_err("md-cluster: cannot allocate memory for recv_thread!\n");
+		goto err;
+	}
+	cinfo->message_lockres = lockres_init(mddev, "message", NULL, 1);
+	if (!cinfo->message_lockres)
+		goto err;
+	cinfo->token_lockres = lockres_init(mddev, "token", NULL, 0);
+	if (!cinfo->token_lockres)
+		goto err;
+	cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0);
+	if (!cinfo->ack_lockres)
+		goto err;
+	cinfo->no_new_dev_lockres = lockres_init(mddev, "no-new-dev", NULL, 0);
+	if (!cinfo->no_new_dev_lockres)
+		goto err;
+
+	/* get sync CR lock on ACK. */
+	if (dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR))
+		pr_err("md-cluster: failed to get a sync CR lock on ACK!(%d)\n",
+				ret);
+	/* get sync CR lock on no-new-dev. */
+	if (dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR))
+		pr_err("md-cluster: failed to get a sync CR lock on no-new-dev!(%d)\n", ret);
+
+
+	pr_info("md-cluster: Joined cluster %s slot %d\n", str, cinfo->slot_number);
+	snprintf(str, 64, "bitmap%04d", cinfo->slot_number - 1);
+	cinfo->bitmap_lockres = lockres_init(mddev, str, NULL, 1);
+	if (!cinfo->bitmap_lockres)
+		goto err;
+	if (dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW)) {
+		pr_err("Failed to get bitmap lock\n");
+		ret = -EINVAL;
+		goto err;
+	}
+
+	INIT_LIST_HEAD(&cinfo->suspend_list);
+	spin_lock_init(&cinfo->suspend_lock);
+
+	ret = gather_all_resync_info(mddev, nodes);
+	if (ret)
+		goto err;
+
+	return 0;
+err:
+	lockres_free(cinfo->message_lockres);
+	lockres_free(cinfo->token_lockres);
+	lockres_free(cinfo->ack_lockres);
+	lockres_free(cinfo->no_new_dev_lockres);
+	lockres_free(cinfo->bitmap_lockres);
+	lockres_free(cinfo->sb_lock);
+	if (cinfo->lockspace)
+		dlm_release_lockspace(cinfo->lockspace, 2);
+	mddev->cluster_info = NULL;
+	kfree(cinfo);
+	module_put(THIS_MODULE);
+	return ret;
+}
+
+static int leave(struct mddev *mddev)
+{
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+
+	if (!cinfo)
+		return 0;
+	md_unregister_thread(&cinfo->recovery_thread);
+	md_unregister_thread(&cinfo->recv_thread);
+	lockres_free(cinfo->message_lockres);
+	lockres_free(cinfo->token_lockres);
+	lockres_free(cinfo->ack_lockres);
+	lockres_free(cinfo->no_new_dev_lockres);
+	lockres_free(cinfo->sb_lock);
+	lockres_free(cinfo->bitmap_lockres);
+	dlm_release_lockspace(cinfo->lockspace, 2);
+	return 0;
+}
+
+/* slot_number(): Returns the MD slot number to use
+ * DLM starts the slot numbers from 1, whereas cluster-md
+ * wants the number to be from zero, so we deduct one
+ */
+static int slot_number(struct mddev *mddev)
+{
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+
+	return cinfo->slot_number - 1;
+}
+
+static void resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
+{
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+
+	add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi);
+	/* Re-acquire the lock to refresh LVB */
+	dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
+}
+
+static int metadata_update_start(struct mddev *mddev)
+{
+	return lock_comm(mddev->cluster_info);
+}
+
+static int metadata_update_finish(struct mddev *mddev)
+{
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+	struct cluster_msg cmsg;
+	int ret;
+
+	memset(&cmsg, 0, sizeof(cmsg));
+	cmsg.type = 
cpu_to_le32(METADATA_UPDATED); +	ret = __sendmsg(cinfo, &cmsg); +	unlock_comm(cinfo); +	return ret; +} + +static int metadata_update_cancel(struct mddev *mddev) +{ +	struct md_cluster_info *cinfo = mddev->cluster_info; + +	return dlm_unlock_sync(cinfo->token_lockres); +} + +static int resync_send(struct mddev *mddev, enum msg_type type, +		sector_t lo, sector_t hi) +{ +	struct md_cluster_info *cinfo = mddev->cluster_info; +	struct cluster_msg cmsg; +	int slot = cinfo->slot_number - 1; + +	pr_info("%s:%d lo: %llu hi: %llu\n", __func__, __LINE__, +			(unsigned long long)lo, +			(unsigned long long)hi); +	resync_info_update(mddev, lo, hi); +	cmsg.type = cpu_to_le32(type); +	cmsg.slot = cpu_to_le32(slot); +	cmsg.low = cpu_to_le64(lo); +	cmsg.high = cpu_to_le64(hi); +	return sendmsg(cinfo, &cmsg); +} + +static int resync_start(struct mddev *mddev, sector_t lo, sector_t hi) +{ +	pr_info("%s:%d\n", __func__, __LINE__); +	return resync_send(mddev, RESYNCING, lo, hi); +} + +static void resync_finish(struct mddev *mddev) +{ +	pr_info("%s:%d\n", __func__, __LINE__); +	resync_send(mddev, RESYNCING, 0, 0); +} + +static int area_resyncing(struct mddev *mddev, sector_t lo, sector_t hi) +{ +	struct md_cluster_info *cinfo = mddev->cluster_info; +	int ret = 0; +	struct suspend_info *s; + +	spin_lock_irq(&cinfo->suspend_lock); +	if (list_empty(&cinfo->suspend_list)) +		goto out; +	list_for_each_entry(s, &cinfo->suspend_list, list) +		if (hi > s->lo && lo < s->hi) { +			ret = 1; +			break; +		} +out: +	spin_unlock_irq(&cinfo->suspend_lock); +	return ret; +} + +static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev) +{ +	struct md_cluster_info *cinfo = mddev->cluster_info; +	struct cluster_msg cmsg; +	int ret = 0; +	struct mdp_superblock_1 *sb = page_address(rdev->sb_page); +	char *uuid = sb->device_uuid; + +	memset(&cmsg, 0, sizeof(cmsg)); +	cmsg.type = cpu_to_le32(NEWDISK); +	memcpy(cmsg.uuid, uuid, 16); +	cmsg.raid_slot = rdev->desc_nr; +	lock_comm(cinfo); +	ret = __sendmsg(cinfo, &cmsg); +	if (ret) +		return ret; +	cinfo->no_new_dev_lockres->flags |= DLM_LKF_NOQUEUE; +	ret = dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_EX); +	cinfo->no_new_dev_lockres->flags &= ~DLM_LKF_NOQUEUE; +	/* Some node does not "see" the device */ +	if (ret == -EAGAIN) +		ret = -ENOENT; +	else +		dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); +	return ret; +} + +static int add_new_disk_finish(struct mddev *mddev) +{ +	struct cluster_msg cmsg; +	struct md_cluster_info *cinfo = mddev->cluster_info; +	int ret; +	/* Write sb and inform others */ +	md_update_sb(mddev, 1); +	cmsg.type = METADATA_UPDATED; +	ret = __sendmsg(cinfo, &cmsg); +	unlock_comm(cinfo); +	return ret; +} + +static int new_disk_ack(struct mddev *mddev, bool ack) +{ +	struct md_cluster_info *cinfo = mddev->cluster_info; + +	if (!test_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state)) { +		pr_warn("md-cluster(%s): Spurious cluster confirmation\n", mdname(mddev)); +		return -EINVAL; +	} + +	if (ack) +		dlm_unlock_sync(cinfo->no_new_dev_lockres); +	complete(&cinfo->newdisk_completion); +	return 0; +} + +static int remove_disk(struct mddev *mddev, struct md_rdev *rdev) +{ +	struct cluster_msg cmsg; +	struct md_cluster_info *cinfo = mddev->cluster_info; +	cmsg.type = REMOVE; +	cmsg.raid_slot = rdev->desc_nr; +	return __sendmsg(cinfo, &cmsg); +} + +static int gather_bitmaps(struct md_rdev *rdev) +{ +	int sn, err; +	sector_t lo, hi; +	struct cluster_msg cmsg; +	struct mddev *mddev = rdev->mddev; +	struct md_cluster_info *cinfo = 
mddev->cluster_info; + +	cmsg.type = RE_ADD; +	cmsg.raid_slot = rdev->desc_nr; +	err = sendmsg(cinfo, &cmsg); +	if (err) +		goto out; + +	for (sn = 0; sn < mddev->bitmap_info.nodes; sn++) { +		if (sn == (cinfo->slot_number - 1)) +			continue; +		err = bitmap_copy_from_slot(mddev, sn, &lo, &hi, false); +		if (err) { +			pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn); +			goto out; +		} +		if ((hi > 0) && (lo < mddev->recovery_cp)) +			mddev->recovery_cp = lo; +	} +out: +	return err; +} + +static struct md_cluster_operations cluster_ops = { +	.join   = join, +	.leave  = leave, +	.slot_number = slot_number, +	.resync_info_update = resync_info_update, +	.resync_start = resync_start, +	.resync_finish = resync_finish, +	.metadata_update_start = metadata_update_start, +	.metadata_update_finish = metadata_update_finish, +	.metadata_update_cancel = metadata_update_cancel, +	.area_resyncing = area_resyncing, +	.add_new_disk_start = add_new_disk_start, +	.add_new_disk_finish = add_new_disk_finish, +	.new_disk_ack = new_disk_ack, +	.remove_disk = remove_disk, +	.gather_bitmaps = gather_bitmaps, +}; + +static int __init cluster_init(void) +{ +	pr_warn("md-cluster: EXPERIMENTAL. Use with caution\n"); +	pr_info("Registering Cluster MD functions\n"); +	register_md_cluster_operations(&cluster_ops, THIS_MODULE); +	return 0; +} + +static void cluster_exit(void) +{ +	unregister_md_cluster_operations(); +} + +module_init(cluster_init); +module_exit(cluster_exit); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Clustering support for MD"); diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h new file mode 100644 index 000000000000..6817ee00e053 --- /dev/null +++ b/drivers/md/md-cluster.h @@ -0,0 +1,29 @@ + + +#ifndef _MD_CLUSTER_H +#define _MD_CLUSTER_H + +#include "md.h" + +struct mddev; +struct md_rdev; + +struct md_cluster_operations { +	int (*join)(struct mddev *mddev, int nodes); +	int (*leave)(struct mddev *mddev); +	int (*slot_number)(struct mddev *mddev); +	void (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi); +	int (*resync_start)(struct mddev *mddev, sector_t lo, sector_t hi); +	void (*resync_finish)(struct mddev *mddev); +	int (*metadata_update_start)(struct mddev *mddev); +	int (*metadata_update_finish)(struct mddev *mddev); +	int (*metadata_update_cancel)(struct mddev *mddev); +	int (*area_resyncing)(struct mddev *mddev, sector_t lo, sector_t hi); +	int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev); +	int (*add_new_disk_finish)(struct mddev *mddev); +	int (*new_disk_ack)(struct mddev *mddev, bool ack); +	int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev); +	int (*gather_bitmaps)(struct md_rdev *rdev); +}; + +#endif /* _MD_CLUSTER_H */ diff --git a/drivers/md/md.c b/drivers/md/md.c index e6178787ce3d..0d8968535976 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -53,6 +53,7 @@  #include <linux/slab.h>  #include "md.h"  #include "bitmap.h" +#include "md-cluster.h"  #ifndef MODULE  static void autostart_arrays(int part); @@ -66,6 +67,11 @@ static void autostart_arrays(int part);  static LIST_HEAD(pers_list);  static DEFINE_SPINLOCK(pers_lock); +struct md_cluster_operations *md_cluster_ops; +EXPORT_SYMBOL(md_cluster_ops); +struct module *md_cluster_mod; +EXPORT_SYMBOL(md_cluster_mod); +  static DECLARE_WAIT_QUEUE_HEAD(resync_wait);  static struct workqueue_struct *md_wq;  static struct workqueue_struct *md_misc_wq; @@ -640,7 +646,7 @@ void mddev_unlock(struct mddev *mddev)  }  EXPORT_SYMBOL_GPL(mddev_unlock); -static 
struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr) +struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)  {  	struct md_rdev *rdev; @@ -650,6 +656,7 @@ static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr)  	return NULL;  } +EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);  static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)  { @@ -2047,11 +2054,11 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)  		int choice = 0;  		if (mddev->pers)  			choice = mddev->raid_disks; -		while (find_rdev_nr_rcu(mddev, choice)) +		while (md_find_rdev_nr_rcu(mddev, choice))  			choice++;  		rdev->desc_nr = choice;  	} else { -		if (find_rdev_nr_rcu(mddev, rdev->desc_nr)) { +		if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) {  			rcu_read_unlock();  			return -EBUSY;  		} @@ -2166,11 +2173,12 @@ static void export_rdev(struct md_rdev *rdev)  	kobject_put(&rdev->kobj);  } -static void kick_rdev_from_array(struct md_rdev *rdev) +void md_kick_rdev_from_array(struct md_rdev *rdev)  {  	unbind_rdev_from_array(rdev);  	export_rdev(rdev);  } +EXPORT_SYMBOL_GPL(md_kick_rdev_from_array);  static void export_array(struct mddev *mddev)  { @@ -2179,7 +2187,7 @@ static void export_array(struct mddev *mddev)  	while (!list_empty(&mddev->disks)) {  		rdev = list_first_entry(&mddev->disks, struct md_rdev,  					same_set); -		kick_rdev_from_array(rdev); +		md_kick_rdev_from_array(rdev);  	}  	mddev->raid_disks = 0;  	mddev->major_version = 0; @@ -2208,7 +2216,7 @@ static void sync_sbs(struct mddev *mddev, int nospares)  	}  } -static void md_update_sb(struct mddev *mddev, int force_change) +void md_update_sb(struct mddev *mddev, int force_change)  {  	struct md_rdev *rdev;  	int sync_req; @@ -2369,6 +2377,37 @@ repeat:  		wake_up(&rdev->blocked_wait);  	}  } +EXPORT_SYMBOL(md_update_sb); + +static int add_bound_rdev(struct md_rdev *rdev) +{ +	struct mddev *mddev = rdev->mddev; +	int err = 0; + +	if (!mddev->pers->hot_remove_disk) { +		/* If there is hot_add_disk but no hot_remove_disk +		 * then added disks for geometry changes, +		 * and should be added immediately. +		 */ +		super_types[mddev->major_version]. +			validate_super(mddev, rdev); +		err = mddev->pers->hot_add_disk(mddev, rdev); +		if (err) { +			unbind_rdev_from_array(rdev); +			export_rdev(rdev); +			return err; +		} +	} +	sysfs_notify_dirent_safe(rdev->sysfs_state); + +	set_bit(MD_CHANGE_DEVS, &mddev->flags); +	if (mddev->degraded) +		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); +	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); +	md_new_event(mddev); +	md_wakeup_thread(mddev->thread); +	return 0; +}  /* words written to sysfs files may, or may not, be \n terminated.   * We want to accept with case. For this we use cmd_match. 
@@ -2471,10 +2510,16 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)  			err = -EBUSY;  		else {  			struct mddev *mddev = rdev->mddev; -			kick_rdev_from_array(rdev); +			if (mddev_is_clustered(mddev)) +				md_cluster_ops->remove_disk(mddev, rdev); +			md_kick_rdev_from_array(rdev); +			if (mddev_is_clustered(mddev)) +				md_cluster_ops->metadata_update_start(mddev);  			if (mddev->pers)  				md_update_sb(mddev, 1);  			md_new_event(mddev); +			if (mddev_is_clustered(mddev)) +				md_cluster_ops->metadata_update_finish(mddev);  			err = 0;  		}  	} else if (cmd_match(buf, "writemostly")) { @@ -2553,6 +2598,21 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)  			clear_bit(Replacement, &rdev->flags);  			err = 0;  		} +	} else if (cmd_match(buf, "re-add")) { +		if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1)) { +			/* clear_bit is performed _after_ all the devices +			 * have their local Faulty bit cleared. If any writes +			 * happen in the meantime in the local node, they +			 * will land in the local bitmap, which will be synced +			 * by this node eventually +			 */ +			if (!mddev_is_clustered(rdev->mddev) || +			    (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) { +				clear_bit(Faulty, &rdev->flags); +				err = add_bound_rdev(rdev); +			} +		} else +			err = -EBUSY;  	}  	if (!err)  		sysfs_notify_dirent_safe(rdev->sysfs_state); @@ -3127,7 +3187,7 @@ static void analyze_sbs(struct mddev *mddev)  				"md: fatal superblock inconsistency in %s"  				" -- removing from array\n",  				bdevname(rdev->bdev,b)); -			kick_rdev_from_array(rdev); +			md_kick_rdev_from_array(rdev);  		}  	super_types[mddev->major_version]. @@ -3142,18 +3202,27 @@ static void analyze_sbs(struct mddev *mddev)  			       "md: %s: %s: only %d devices permitted\n",  			       mdname(mddev), bdevname(rdev->bdev, b),  			       mddev->max_disks); -			kick_rdev_from_array(rdev); +			md_kick_rdev_from_array(rdev);  			continue;  		} -		if (rdev != freshest) +		if (rdev != freshest) {  			if (super_types[mddev->major_version].  			    
validate_super(mddev, rdev)) {  				printk(KERN_WARNING "md: kicking non-fresh %s"  					" from array!\n",  					bdevname(rdev->bdev,b)); -				kick_rdev_from_array(rdev); +				md_kick_rdev_from_array(rdev);  				continue;  			} +			/* No device should have a Candidate flag +			 * when reading devices +			 */ +			if (test_bit(Candidate, &rdev->flags)) { +				pr_info("md: kicking Cluster Candidate %s from array!\n", +					bdevname(rdev->bdev, b)); +				md_kick_rdev_from_array(rdev); +			} +		}  		if (mddev->level == LEVEL_MULTIPATH) {  			rdev->desc_nr = i++;  			rdev->raid_disk = rdev->desc_nr; @@ -4008,8 +4077,12 @@ size_store(struct mddev *mddev, const char *buf, size_t len)  	if (err)  		return err;  	if (mddev->pers) { +		if (mddev_is_clustered(mddev)) +			md_cluster_ops->metadata_update_start(mddev);  		err = update_size(mddev, sectors);  		md_update_sb(mddev, 1); +		if (mddev_is_clustered(mddev)) +			md_cluster_ops->metadata_update_finish(mddev);  	} else {  		if (mddev->dev_sectors == 0 ||  		    mddev->dev_sectors > sectors) @@ -5077,10 +5150,16 @@ int md_run(struct mddev *mddev)  	}  	if (err == 0 && pers->sync_request &&  	    (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { -		err = bitmap_create(mddev); -		if (err) +		struct bitmap *bitmap; + +		bitmap = bitmap_create(mddev, -1); +		if (IS_ERR(bitmap)) { +			err = PTR_ERR(bitmap);  			printk(KERN_ERR "%s: failed to create bitmap (%d)\n",  			       mdname(mddev), err); +		} else +			mddev->bitmap = bitmap; +  	}  	if (err) {  		mddev_detach(mddev); @@ -5232,6 +5311,8 @@ static void md_clean(struct mddev *mddev)  static void __md_stop_writes(struct mddev *mddev)  { +	if (mddev_is_clustered(mddev)) +		md_cluster_ops->metadata_update_start(mddev);  	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);  	flush_workqueue(md_misc_wq);  	if (mddev->sync_thread) { @@ -5250,6 +5331,8 @@ static void __md_stop_writes(struct mddev *mddev)  		mddev->in_sync = 1;  		md_update_sb(mddev, 1);  	} +	if (mddev_is_clustered(mddev)) +		md_cluster_ops->metadata_update_finish(mddev);  }  void md_stop_writes(struct mddev *mddev) @@ -5636,6 +5719,8 @@ static int get_array_info(struct mddev *mddev, void __user *arg)  		info.state = (1<<MD_SB_CLEAN);  	if (mddev->bitmap && mddev->bitmap_info.offset)  		info.state |= (1<<MD_SB_BITMAP_PRESENT); +	if (mddev_is_clustered(mddev)) +		info.state |= (1<<MD_SB_CLUSTERED);  	info.active_disks  = insync;  	info.working_disks = working;  	info.failed_disks  = failed; @@ -5691,7 +5776,7 @@ static int get_disk_info(struct mddev *mddev, void __user * arg)  		return -EFAULT;  	rcu_read_lock(); -	rdev = find_rdev_nr_rcu(mddev, info.number); +	rdev = md_find_rdev_nr_rcu(mddev, info.number);  	if (rdev) {  		info.major = MAJOR(rdev->bdev->bd_dev);  		info.minor = MINOR(rdev->bdev->bd_dev); @@ -5724,6 +5809,13 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)  	struct md_rdev *rdev;  	dev_t dev = MKDEV(info->major,info->minor); +	if (mddev_is_clustered(mddev) && +		!(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { +		pr_err("%s: Cannot add to clustered mddev.\n", +			       mdname(mddev)); +		return -EINVAL; +	} +  	if (info->major != MAJOR(dev) || info->minor != MINOR(dev))  		return -EOVERFLOW; @@ -5810,31 +5902,38 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)  		else  			clear_bit(WriteMostly, &rdev->flags); +		/* +		 * check whether the device shows up in other nodes +		 */ +		if (mddev_is_clustered(mddev)) { +			if (info->state & (1 << 
MD_DISK_CANDIDATE)) { +				/* Through --cluster-confirm */ +				set_bit(Candidate, &rdev->flags); +				err = md_cluster_ops->new_disk_ack(mddev, true); +				if (err) { +					export_rdev(rdev); +					return err; +				} +			} else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { +				/* --add initiated by this node */ +				err = md_cluster_ops->add_new_disk_start(mddev, rdev); +				if (err) { +					md_cluster_ops->add_new_disk_finish(mddev); +					export_rdev(rdev); +					return err; +				} +			} +		} +  		rdev->raid_disk = -1;  		err = bind_rdev_to_array(rdev, mddev); -		if (!err && !mddev->pers->hot_remove_disk) { -			/* If there is hot_add_disk but no hot_remove_disk -			 * then added disks for geometry changes, -			 * and should be added immediately. -			 */ -			super_types[mddev->major_version]. -				validate_super(mddev, rdev); -			err = mddev->pers->hot_add_disk(mddev, rdev); -			if (err) -				unbind_rdev_from_array(rdev); -		}  		if (err)  			export_rdev(rdev);  		else -			sysfs_notify_dirent_safe(rdev->sysfs_state); - -		set_bit(MD_CHANGE_DEVS, &mddev->flags); -		if (mddev->degraded) -			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); -		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); -		if (!err) -			md_new_event(mddev); -		md_wakeup_thread(mddev->thread); +			err = add_bound_rdev(rdev); +		if (mddev_is_clustered(mddev) && +				(info->state & (1 << MD_DISK_CLUSTER_ADD))) +			md_cluster_ops->add_new_disk_finish(mddev);  		return err;  	} @@ -5895,18 +5994,29 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)  	if (!rdev)  		return -ENXIO; +	if (mddev_is_clustered(mddev)) +		md_cluster_ops->metadata_update_start(mddev); +  	clear_bit(Blocked, &rdev->flags);  	remove_and_add_spares(mddev, rdev);  	if (rdev->raid_disk >= 0)  		goto busy; -	kick_rdev_from_array(rdev); +	if (mddev_is_clustered(mddev)) +		md_cluster_ops->remove_disk(mddev, rdev); + +	md_kick_rdev_from_array(rdev);  	md_update_sb(mddev, 1);  	md_new_event(mddev); +	if (mddev_is_clustered(mddev)) +		md_cluster_ops->metadata_update_finish(mddev); +  	return 0;  busy: +	if (mddev_is_clustered(mddev)) +		md_cluster_ops->metadata_update_cancel(mddev);  	printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",  		bdevname(rdev->bdev,b), mdname(mddev));  	return -EBUSY; @@ -5956,12 +6066,15 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)  		err = -EINVAL;  		goto abort_export;  	} + +	if (mddev_is_clustered(mddev)) +		md_cluster_ops->metadata_update_start(mddev);  	clear_bit(In_sync, &rdev->flags);  	rdev->desc_nr = -1;  	rdev->saved_raid_disk = -1;  	err = bind_rdev_to_array(rdev, mddev);  	if (err) -		goto abort_export; +		goto abort_clustered;  	/*  	 * The rest should better be atomic, we can have disk failures @@ -5972,6 +6085,8 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)  	md_update_sb(mddev, 1); +	if (mddev_is_clustered(mddev)) +		md_cluster_ops->metadata_update_finish(mddev);  	/*  	 * Kick recovery, maybe this spare has to be added to the  	 * array immediately. 
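The hot_remove_disk() and hot_add_disk() hunks above repeat one idiom: on a clustered array every superblock write is bracketed by metadata_update_start() and metadata_update_finish(), with metadata_update_cancel() unwinding the error path (the abort_clustered label in the next hunk). Condensed into a standalone sketch, with do_work() as a placeholder for the guarded operation rather than a function in this patch:

	/* The bracketing idiom used by the clustered ioctl paths, shown
	 * in isolation. do_work() stands in for whatever change the
	 * md_update_sb() call publishes to the other nodes.
	 */
	static int example_clustered_sb_update(struct mddev *mddev)
	{
		int err;

		if (mddev_is_clustered(mddev))
			md_cluster_ops->metadata_update_start(mddev);

		err = do_work(mddev);	/* placeholder operation */
		if (err) {
			if (mddev_is_clustered(mddev))
				md_cluster_ops->metadata_update_cancel(mddev);
			return err;
		}

		md_update_sb(mddev, 1);
		if (mddev_is_clustered(mddev))
			md_cluster_ops->metadata_update_finish(mddev);
		return 0;
	}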
@@ -5981,6 +6096,9 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)  	md_new_event(mddev);  	return 0; +abort_clustered: +	if (mddev_is_clustered(mddev)) +		md_cluster_ops->metadata_update_cancel(mddev);  abort_export:  	export_rdev(rdev);  	return err; @@ -6038,9 +6156,14 @@ static int set_bitmap_file(struct mddev *mddev, int fd)  	if (mddev->pers) {  		mddev->pers->quiesce(mddev, 1);  		if (fd >= 0) { -			err = bitmap_create(mddev); -			if (!err) +			struct bitmap *bitmap; + +			bitmap = bitmap_create(mddev, -1); +			if (!IS_ERR(bitmap)) { +				mddev->bitmap = bitmap;  				err = bitmap_load(mddev); +			} else +				err = PTR_ERR(bitmap);  		}  		if (fd < 0 || err) {  			bitmap_destroy(mddev); @@ -6293,6 +6416,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)  			return rv;  		}  	} +	if (mddev_is_clustered(mddev)) +		md_cluster_ops->metadata_update_start(mddev);  	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)  		rv = update_size(mddev, (sector_t)info->size * 2); @@ -6300,33 +6425,49 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)  		rv = update_raid_disks(mddev, info->raid_disks);  	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { -		if (mddev->pers->quiesce == NULL || mddev->thread == NULL) -			return -EINVAL; -		if (mddev->recovery || mddev->sync_thread) -			return -EBUSY; +		if (mddev->pers->quiesce == NULL || mddev->thread == NULL) { +			rv = -EINVAL; +			goto err; +		} +		if (mddev->recovery || mddev->sync_thread) { +			rv = -EBUSY; +			goto err; +		}  		if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { +			struct bitmap *bitmap;  			/* add the bitmap */ -			if (mddev->bitmap) -				return -EEXIST; -			if (mddev->bitmap_info.default_offset == 0) -				return -EINVAL; +			if (mddev->bitmap) { +				rv = -EEXIST; +				goto err; +			} +			if (mddev->bitmap_info.default_offset == 0) { +				rv = -EINVAL; +				goto err; +			}  			mddev->bitmap_info.offset =  				mddev->bitmap_info.default_offset;  			mddev->bitmap_info.space =  				mddev->bitmap_info.default_space;  			mddev->pers->quiesce(mddev, 1); -			rv = bitmap_create(mddev); -			if (!rv) +			bitmap = bitmap_create(mddev, -1); +			if (!IS_ERR(bitmap)) { +				mddev->bitmap = bitmap;  				rv = bitmap_load(mddev); +			} else +				rv = PTR_ERR(bitmap);  			if (rv)  				bitmap_destroy(mddev);  			mddev->pers->quiesce(mddev, 0);  		} else {  			/* remove the bitmap */ -			if (!mddev->bitmap) -				return -ENOENT; -			if (mddev->bitmap->storage.file) -				return -EINVAL; +			if (!mddev->bitmap) { +				rv = -ENOENT; +				goto err; +			} +			if (mddev->bitmap->storage.file) { +				rv = -EINVAL; +				goto err; +			}  			mddev->pers->quiesce(mddev, 1);  			bitmap_destroy(mddev);  			mddev->pers->quiesce(mddev, 0); @@ -6334,6 +6475,12 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)  		}  	}  	md_update_sb(mddev, 1); +	if (mddev_is_clustered(mddev)) +		md_cluster_ops->metadata_update_finish(mddev); +	return rv; +err: +	if (mddev_is_clustered(mddev)) +		md_cluster_ops->metadata_update_cancel(mddev);  	return rv;  } @@ -6393,6 +6540,7 @@ static inline bool md_ioctl_valid(unsigned int cmd)  	case SET_DISK_FAULTY:  	case STOP_ARRAY:  	case STOP_ARRAY_RO: +	case CLUSTERED_DISK_NACK:  		return true;  	default:  		return false; @@ -6665,6 +6813,13 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,  		goto unlock;  	} +	case CLUSTERED_DISK_NACK: +		if (mddev_is_clustered(mddev)) +			md_cluster_ops->new_disk_ack(mddev, false); +		
else +			err = -EINVAL; +		goto unlock; +  	case HOT_ADD_DISK:  		err = hot_add_disk(mddev, new_decode_dev(arg));  		goto unlock; @@ -7238,6 +7393,55 @@ int unregister_md_personality(struct md_personality *p)  }  EXPORT_SYMBOL(unregister_md_personality); +int register_md_cluster_operations(struct md_cluster_operations *ops, struct module *module) +{ +	if (md_cluster_ops != NULL) +		return -EALREADY; +	spin_lock(&pers_lock); +	md_cluster_ops = ops; +	md_cluster_mod = module; +	spin_unlock(&pers_lock); +	return 0; +} +EXPORT_SYMBOL(register_md_cluster_operations); + +int unregister_md_cluster_operations(void) +{ +	spin_lock(&pers_lock); +	md_cluster_ops = NULL; +	spin_unlock(&pers_lock); +	return 0; +} +EXPORT_SYMBOL(unregister_md_cluster_operations); + +int md_setup_cluster(struct mddev *mddev, int nodes) +{ +	int err; + +	err = request_module("md-cluster"); +	if (err) { +		pr_err("md-cluster module not found.\n"); +		return err; +	} + +	spin_lock(&pers_lock); +	if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { +		spin_unlock(&pers_lock); +		return -ENOENT; +	} +	spin_unlock(&pers_lock); + +	return md_cluster_ops->join(mddev, nodes); +} + +void md_cluster_stop(struct mddev *mddev) +{ +	if (!md_cluster_ops) +		return; +	md_cluster_ops->leave(mddev); +	module_put(md_cluster_mod); +} +  static int is_mddev_idle(struct mddev *mddev, int init)  {  	struct md_rdev *rdev; @@ -7375,7 +7579,11 @@ int md_allow_write(struct mddev *mddev)  		    mddev->safemode == 0)  			mddev->safemode = 1;  		spin_unlock(&mddev->lock); +		if (mddev_is_clustered(mddev)) +			md_cluster_ops->metadata_update_start(mddev);  		md_update_sb(mddev, 0); +		if (mddev_is_clustered(mddev)) +			md_cluster_ops->metadata_update_finish(mddev);  		sysfs_notify_dirent_safe(mddev->sysfs_state);  	} else  		spin_unlock(&mddev->lock); @@ -7576,6 +7784,9 @@ void md_do_sync(struct md_thread *thread)  	md_new_event(mddev);  	update_time = jiffies; +	if (mddev_is_clustered(mddev)) +		md_cluster_ops->resync_start(mddev, j, max_sectors); +  	blk_start_plug(&plug);  	while (j < max_sectors) {  		sector_t sectors; @@ -7636,6 +7847,8 @@ void md_do_sync(struct md_thread *thread)  		j += sectors;  		if (j > 2)  			mddev->curr_resync = j; +		if (mddev_is_clustered(mddev)) +			md_cluster_ops->resync_info_update(mddev, j, max_sectors);  		mddev->curr_mark_cnt = io_sectors;  		if (last_check == 0)  			/* this is the earliest that rebuild will be @@ -7696,6 +7909,9 @@ void md_do_sync(struct md_thread *thread)  	/* tell personality that we are finished */  	mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); +	if (mddev_is_clustered(mddev)) +		md_cluster_ops->resync_finish(mddev); +  	if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&  	    mddev->curr_resync > 2) {  		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { @@ -7925,8 +8141,13 @@ void md_check_recovery(struct mddev *mddev)  				sysfs_notify_dirent_safe(mddev->sysfs_state);  		} -		if (mddev->flags & MD_UPDATE_SB_FLAGS) +		if (mddev->flags & MD_UPDATE_SB_FLAGS) { +			if (mddev_is_clustered(mddev)) +				md_cluster_ops->metadata_update_start(mddev);  			md_update_sb(mddev, 0); +			if (mddev_is_clustered(mddev)) +				md_cluster_ops->metadata_update_finish(mddev); +		}  		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&  		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { @@ -8024,6 +8245,8 @@ void md_reap_sync_thread(struct mddev *mddev)  			set_bit(MD_CHANGE_DEVS, &mddev->flags);  		}  	} +	if (mddev_is_clustered(mddev)) +		
md_cluster_ops->metadata_update_start(mddev);  	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&  	    mddev->pers->finish_reshape)  		mddev->pers->finish_reshape(mddev); @@ -8036,6 +8259,8 @@ void md_reap_sync_thread(struct mddev *mddev)  			rdev->saved_raid_disk = -1;  	md_update_sb(mddev, 1); +	if (mddev_is_clustered(mddev)) +		md_cluster_ops->metadata_update_finish(mddev);  	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);  	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);  	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); @@ -8656,6 +8881,28 @@ err_wq:  	return ret;  } +void md_reload_sb(struct mddev *mddev) +{ +	struct md_rdev *rdev, *tmp; + +	rdev_for_each_safe(rdev, tmp, mddev) { +		rdev->sb_loaded = 0; +		ClearPageUptodate(rdev->sb_page); +	} +	mddev->raid_disks = 0; +	analyze_sbs(mddev); +	rdev_for_each_safe(rdev, tmp, mddev) { +		struct mdp_superblock_1 *sb = page_address(rdev->sb_page); +		/* since we don't write to faulty devices, we figure out if the +		 *  disk is faulty by comparing events +		 */ +		if (mddev->events > sb->events) +			set_bit(Faulty, &rdev->flags); +	} + +} +EXPORT_SYMBOL(md_reload_sb); +  #ifndef MODULE  /* diff --git a/drivers/md/md.h b/drivers/md/md.h index 318ca8fd430f..ecdce36ec6b8 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -23,6 +23,7 @@  #include <linux/timer.h>  #include <linux/wait.h>  #include <linux/workqueue.h> +#include "md-cluster.h"  #define MaxSector (~(sector_t)0) @@ -170,6 +171,10 @@ enum flag_bits {  				 * a want_replacement device with same  				 * raid_disk number.  				 */ +	Candidate,		/* For clustered environments only: +				 * This device is seen locally but not +				 * by the whole cluster +				 */  };  #define BB_LEN_MASK	(0x00000000000001FFULL) @@ -202,6 +207,8 @@ extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,  				int is_new);  extern void md_ack_all_badblocks(struct badblocks *bb); +struct md_cluster_info; +  struct mddev {  	void				*private;  	struct md_personality		*pers; @@ -430,6 +437,8 @@ struct mddev {  		unsigned long		daemon_sleep; /* how many jiffies between updates? 
*/  		unsigned long		max_write_behind; /* write-behind mode */  		int			external; +		int			nodes; /* Maximum number of nodes in the cluster */ +		char                    cluster_name[64]; /* Name of the cluster */  	} bitmap_info;  	atomic_t			max_corr_read_errors; /* max read retries */ @@ -448,6 +457,7 @@ struct mddev {  	struct work_struct flush_work;  	struct work_struct event_work;	/* used by dm to report failure event */  	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); +	struct md_cluster_info		*cluster_info;  };  static inline int __must_check mddev_lock(struct mddev *mddev) @@ -608,6 +618,11 @@ static inline void safe_put_page(struct page *p)  extern int register_md_personality(struct md_personality *p);  extern int unregister_md_personality(struct md_personality *p); +extern int register_md_cluster_operations(struct md_cluster_operations *ops, +		struct module *module); +extern int unregister_md_cluster_operations(void); +extern int md_setup_cluster(struct mddev *mddev, int nodes); +extern void md_cluster_stop(struct mddev *mddev);  extern struct md_thread *md_register_thread(  	void (*run)(struct md_thread *thread),  	struct mddev *mddev, @@ -654,6 +669,10 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,  				   struct mddev *mddev);  extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule); +extern void md_reload_sb(struct mddev *mddev); +extern void md_update_sb(struct mddev *mddev, int force); +extern void md_kick_rdev_from_array(struct md_rdev * rdev); +struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);  static inline int mddev_check_plugged(struct mddev *mddev)  {  	return !!blk_check_plugged(md_unplug, mddev, @@ -669,4 +688,9 @@ static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)  	}  } +extern struct md_cluster_operations *md_cluster_ops; +static inline int mddev_is_clustered(struct mddev *mddev) +{ +	return mddev->cluster_info && mddev->bitmap_info.nodes > 1; +}  #endif /* _MD_MD_H */ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index d34e238afa54..4efa50186a2a 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -539,7 +539,13 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect  	has_nonrot_disk = 0;  	choose_next_idle = 0; -	choose_first = (conf->mddev->recovery_cp < this_sector + sectors); +	if ((conf->mddev->recovery_cp < this_sector + sectors) || +	    (mddev_is_clustered(conf->mddev) && +	    md_cluster_ops->area_resyncing(conf->mddev, this_sector, +		    this_sector + sectors))) +		choose_first = 1; +	else +		choose_first = 0;  	for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {  		sector_t dist; @@ -1102,8 +1108,10 @@ static void make_request(struct mddev *mddev, struct bio * bio)  	md_write_start(mddev, bio); /* wait on superblock update early */  	if (bio_data_dir(bio) == WRITE && -	    bio_end_sector(bio) > mddev->suspend_lo && -	    bio->bi_iter.bi_sector < mddev->suspend_hi) { +	    ((bio_end_sector(bio) > mddev->suspend_lo && +	    bio->bi_iter.bi_sector < mddev->suspend_hi) || +	    (mddev_is_clustered(mddev) && +	     md_cluster_ops->area_resyncing(mddev, bio->bi_iter.bi_sector, bio_end_sector(bio))))) {  		/* As the suspend_* range is controlled by  		 * userspace, we want an interruptible  		 * wait. 
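The read_balance() and make_request() hunks above (the latter continues in the next hunk) all encode the same predicate: on a clustered array, I/O that overlaps a region some other node is resyncing is treated exactly like I/O into the local suspend_lo/suspend_hi window. Pulled out as a standalone helper the test would look roughly like this; the helper itself is illustrative, not part of the patch:

	/* Illustrative only: the condition the raid1 hunks add. A write
	 * must wait if it overlaps the local suspend window or, on a
	 * clustered array, a range another node is currently resyncing.
	 */
	static bool example_write_must_wait(struct mddev *mddev, struct bio *bio)
	{
		if (bio_end_sector(bio) > mddev->suspend_lo &&
		    bio->bi_iter.bi_sector < mddev->suspend_hi)
			return true;

		return mddev_is_clustered(mddev) &&
		       md_cluster_ops->area_resyncing(mddev,
						      bio->bi_iter.bi_sector,
						      bio_end_sector(bio));
	}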
@@ -1114,7 +1122,10 @@ static void make_request(struct mddev *mddev, struct bio * bio)  			prepare_to_wait(&conf->wait_barrier,  					&w, TASK_INTERRUPTIBLE);  			if (bio_end_sector(bio) <= mddev->suspend_lo || -			    bio->bi_iter.bi_sector >= mddev->suspend_hi) +			    bio->bi_iter.bi_sector >= mddev->suspend_hi || +			    (mddev_is_clustered(mddev) && +			     !md_cluster_ops->area_resyncing(mddev, +				     bio->bi_iter.bi_sector, bio_end_sector(bio))))  				break;  			schedule();  		} @@ -1561,6 +1572,7 @@ static int raid1_spare_active(struct mddev *mddev)  		struct md_rdev *rdev = conf->mirrors[i].rdev;  		struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev;  		if (repl +		    && !test_bit(Candidate, &repl->flags)  		    && repl->recovery_offset == MaxSector  		    && !test_bit(Faulty, &repl->flags)  		    && !test_and_set_bit(In_sync, &repl->flags)) { diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index 49f4210d4394..2ae6131e69a5 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -78,6 +78,12 @@  #define MD_DISK_ACTIVE		1 /* disk is running or spare disk */  #define MD_DISK_SYNC		2 /* disk is in sync with the raid set */  #define MD_DISK_REMOVED		3 /* disk has been removed from the raid set */ +#define MD_DISK_CLUSTER_ADD     4 /* Initiate a disk add across the cluster +				   * For clustered environments only. +				   */ +#define MD_DISK_CANDIDATE	5 /* disk is added as spare (local) until confirmed +				   * For clustered environments only. +				   */  #define	MD_DISK_WRITEMOSTLY	9 /* disk is "write-mostly" in RAID1 config.  				   * read requests will only be sent here in @@ -101,6 +107,7 @@ typedef struct mdp_device_descriptor_s {  #define MD_SB_CLEAN		0  #define MD_SB_ERRORS		1 +#define	MD_SB_CLUSTERED		5 /* MD is clustered */  #define	MD_SB_BITMAP_PRESENT	8 /* bitmap may be present nearby */  /* diff --git a/include/uapi/linux/raid/md_u.h b/include/uapi/linux/raid/md_u.h index 74e7c60c4716..1cb8aa6850b5 100644 --- a/include/uapi/linux/raid/md_u.h +++ b/include/uapi/linux/raid/md_u.h @@ -62,6 +62,7 @@  #define STOP_ARRAY		_IO (MD_MAJOR, 0x32)  #define STOP_ARRAY_RO		_IO (MD_MAJOR, 0x33)  #define RESTART_ARRAY_RW	_IO (MD_MAJOR, 0x34) +#define CLUSTERED_DISK_NACK	_IO (MD_MAJOR, 0x35)  /* 63 partitions with the alternate major number (mdp) */  #define MdpMinorShift 6
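From userspace the new interface is driven through the existing ADD_NEW_DISK ioctl plus the new CLUSTERED_DISK_NACK ioctl: a cluster-aware tool sets MD_DISK_CLUSTER_ADD in the state mask so add_new_disk() takes the clustered path, and a node that cannot validate a Candidate device announced by another node rejects it with a NACK. A hypothetical sketch of such tool code, not taken from mdadm:

	/* Hypothetical userspace sketch; function names are illustrative. */
	#include <sys/ioctl.h>
	#include <linux/raid/md_u.h>
	#include <linux/raid/md_p.h>

	static int cluster_add_disk(int md_fd, int dev_major, int dev_minor)
	{
		mdu_disk_info_t info = {
			.major = dev_major,
			.minor = dev_minor,
			.state = 1 << MD_DISK_CLUSTER_ADD,	/* clustered add path */
		};

		return ioctl(md_fd, ADD_NEW_DISK, &info);
	}

	static int cluster_nack_disk(int md_fd)
	{
		/* Reject a Candidate device proposed by another node. */
		return ioctl(md_fd, CLUSTERED_DISK_NACK);
	}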
