diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 28167c9baa55..047ec887456b 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -97,6 +97,7 @@ static const char *const blk_queue_flag_name[] = {
 	QUEUE_FLAG_NAME(NO_ELV_SWITCH),
 	QUEUE_FLAG_NAME(QOS_ENABLED),
 	QUEUE_FLAG_NAME(BIO_ISSUE_TIME),
+	QUEUE_FLAG_NAME(ZONED_QD1_WRITES),
 };
 #undef QUEUE_FLAG_NAME
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 55a1bbfef7d4..ca8033e6d699 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -390,6 +390,36 @@ static ssize_t queue_nr_zones_show(struct gendisk *disk, char *page)
 	return queue_var_show(disk_nr_zones(disk), page);
 }
 
+static ssize_t queue_zoned_qd1_writes_show(struct gendisk *disk, char *page)
+{
+	return queue_var_show(!!blk_queue_zoned_qd1_writes(disk->queue),
+			      page);
+}
+
+static ssize_t queue_zoned_qd1_writes_store(struct gendisk *disk,
+					    const char *page, size_t count)
+{
+	struct request_queue *q = disk->queue;
+	unsigned long qd1_writes;
+	unsigned int memflags;
+	ssize_t ret;
+
+	ret = queue_var_store(&qd1_writes, page, count);
+	if (ret < 0)
+		return ret;
+
+	memflags = blk_mq_freeze_queue(q);
+	blk_mq_quiesce_queue(q);
+	if (qd1_writes)
+		blk_queue_flag_set(QUEUE_FLAG_ZONED_QD1_WRITES, q);
+	else
+		blk_queue_flag_clear(QUEUE_FLAG_ZONED_QD1_WRITES, q);
+	blk_mq_unquiesce_queue(q);
+	blk_mq_unfreeze_queue(q, memflags);
+
+	return count;
+}
+
 static ssize_t queue_iostats_passthrough_show(struct gendisk *disk, char *page)
 {
 	return queue_var_show(!!blk_queue_passthrough_stat(disk->queue), page);
@@ -617,6 +647,7 @@
 QUEUE_LIM_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes");
 QUEUE_LIM_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");
 QUEUE_LIM_RO_ENTRY(queue_zoned, "zoned");
+QUEUE_RW_ENTRY(queue_zoned_qd1_writes, "zoned_qd1_writes");
 QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones");
 QUEUE_LIM_RO_ENTRY(queue_max_open_zones, "max_open_zones");
 QUEUE_LIM_RO_ENTRY(queue_max_active_zones, "max_active_zones");
@@ -754,6 +785,7 @@ static struct attribute *queue_attrs[] = {
 	&queue_nomerges_entry.attr,
 	&queue_poll_entry.attr,
 	&queue_poll_delay_entry.attr,
+	&queue_zoned_qd1_writes_entry.attr,
 	NULL,
 };
 
@@ -786,7 +818,8 @@ static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr,
 	struct request_queue *q = disk->queue;
 
 	if ((attr == &queue_max_open_zones_entry.attr ||
-	     attr == &queue_max_active_zones_entry.attr) &&
+	     attr == &queue_max_active_zones_entry.attr ||
+	     attr == &queue_zoned_qd1_writes_entry.attr) &&
 	    !blk_queue_is_zoned(q))
 		return 0;
 
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 78810e726222..e1a23c8b676d 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -16,6 +16,8 @@
 #include
 #include
 #include
+#include <linux/kthread.h>
+#include <linux/freezer.h>
 #include
 
@@ -40,6 +42,8 @@ static const char *const zone_cond_name[] = {
 /*
  * Per-zone write plug.
  * @node: hlist_node structure for managing the plug using a hash table.
+ * @entry: list_head structure for listing the plug in the disk list of active
+ *         zone write plugs.
  * @bio_list: The list of BIOs that are currently plugged.
  * @bio_work: Work struct to handle issuing of plugged BIOs
  * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
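
For reference, a minimal userspace sketch of how the zoned_qd1_writes attribute added in blk-sysfs.c above can be toggled once the patch is applied. This is not part of the patch, and the device name "sda" is an assumption; any zoned block device path works:

    /*
     * Hypothetical usage sketch: enable QD=1 zoned writes by writing "1"
     * to the new sysfs attribute (queue_zoned_qd1_writes_store() parses
     * the value with queue_var_store(), so "0" disables it again).
     */
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        const char *attr = "/sys/block/sda/queue/zoned_qd1_writes";
        FILE *f = fopen(attr, "w");

        if (!f) {
            perror(attr);
            return EXIT_FAILURE;
        }
        if (fputs("1\n", f) == EOF || fclose(f) == EOF) {
            perror(attr);
            return EXIT_FAILURE;
        }
        return EXIT_SUCCESS;
    }
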
@@ -62,6 +66,7 @@ static const char *const zone_cond_name[] = {
  */
 struct blk_zone_wplug {
 	struct hlist_node node;
+	struct list_head entry;
 	struct bio_list bio_list;
 	struct work_struct bio_work;
 	struct rcu_head rcu_head;
@@ -623,7 +628,19 @@ static void disk_mark_zone_wplug_dead(struct blk_zone_wplug *zwplug)
 	}
 }
 
-static void blk_zone_wplug_bio_work(struct work_struct *work);
+static bool disk_zone_wplug_submit_bio(struct gendisk *disk,
+				       struct blk_zone_wplug *zwplug);
+
+static void blk_zone_wplug_bio_work(struct work_struct *work)
+{
+	struct blk_zone_wplug *zwplug =
+		container_of(work, struct blk_zone_wplug, bio_work);
+
+	disk_zone_wplug_submit_bio(zwplug->disk, zwplug);
+
+	/* Drop the reference we took in disk_zone_wplug_schedule_work(). */
+	disk_put_zone_wplug(zwplug);
+}
 
 /*
  * Get a zone write plug for the zone containing @sector.
@@ -658,6 +675,7 @@ static struct blk_zone_wplug *disk_get_or_alloc_zone_wplug(struct gendisk *disk,
 	zwplug->wp_offset = bdev_offset_from_zone_start(disk->part0, sector);
 	bio_list_init(&zwplug->bio_list);
 	INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
+	INIT_LIST_HEAD(&zwplug->entry);
 	zwplug->disk = disk;
 
 	/*
@@ -690,6 +708,7 @@ static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug,
  */
 static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
 {
+	struct gendisk *disk = zwplug->disk;
 	struct bio *bio;
 
 	lockdep_assert_held(&zwplug->lock);
@@ -703,6 +722,20 @@ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
 		blk_zone_wplug_bio_io_error(zwplug, bio);
 
 	zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
+
+	/*
+	 * If we are using the per disk zone write plugs worker thread, remove
+	 * the zone write plug from the work list and drop the reference we
+	 * took when the zone write plug was added to that list.
+	 */
+	if (blk_queue_zoned_qd1_writes(disk->queue)) {
+		spin_lock(&disk->zone_wplugs_list_lock);
+		if (!list_empty(&zwplug->entry)) {
+			list_del_init(&zwplug->entry);
+			disk_put_zone_wplug(zwplug);
+		}
+		spin_unlock(&disk->zone_wplugs_list_lock);
+	}
 }
 
 /*
@@ -1137,8 +1170,8 @@ void blk_zone_mgmt_bio_endio(struct bio *bio)
 	}
 }
 
-static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk,
-					      struct blk_zone_wplug *zwplug)
+static void disk_zone_wplug_schedule_work(struct gendisk *disk,
+					  struct blk_zone_wplug *zwplug)
 {
 	lockdep_assert_held(&zwplug->lock);
 
@@ -1151,6 +1184,7 @@ static void disk_zone_wplug_schedule_work(struct gendisk *disk,
 	 * and we also drop this reference if the work is already scheduled.
 	 */
 	WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
+	WARN_ON_ONCE(blk_queue_zoned_qd1_writes(disk->queue));
 	refcount_inc(&zwplug->ref);
 	if (!queue_work(disk->zone_wplugs_wq, &zwplug->bio_work))
 		disk_put_zone_wplug(zwplug);
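
A standalone sketch of the refcounting rule the abort path above relies on. A simplified userspace doubly linked list stands in for the kernel's list_head, and the plug and helper names are made up: membership on the disk work list always accounts for one extra reference, list_empty() on the entry doubles as the "is it queued?" test, and deletion re-initializes the entry (list_del_init()) so that test stays valid:

    #include <assert.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct entry { struct entry *prev, *next; };

    struct plug {
        struct entry link;
        int ref;
    };

    static void entry_init(struct entry *e) { e->prev = e->next = e; }
    static bool entry_empty(const struct entry *e) { return e->next == e; }

    static void list_add_tail_(struct entry *e, struct entry *head)
    {
        e->prev = head->prev;
        e->next = head;
        head->prev->next = e;
        head->prev = e;
    }

    static void list_del_init_(struct entry *e)
    {
        e->prev->next = e->next;
        e->next->prev = e->prev;
        entry_init(e);
    }

    int main(void)
    {
        struct entry work_list;
        struct plug p = { .ref = 1 };

        entry_init(&work_list);
        entry_init(&p.link);

        /* Queue the plug: list membership implies one extra reference. */
        if (entry_empty(&p.link)) {
            list_add_tail_(&p.link, &work_list);
            p.ref++;
        }
        assert(p.ref == 2);

        /* Dequeue (as the worker or the abort path would) and drop it. */
        if (!entry_empty(&p.link)) {
            list_del_init_(&p.link);
            p.ref--;
        }
        assert(p.ref == 1 && entry_empty(&p.link));
        puts("membership/refcount invariant holds");
        return 0;
    }
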
@@ -1190,6 +1224,22 @@ static inline void disk_zone_wplug_add_bio(struct gendisk *disk,
 	bio_list_add(&zwplug->bio_list, bio);
 	trace_disk_zone_wplug_add_bio(zwplug->disk->queue, zwplug->zone_no,
 				      bio->bi_iter.bi_sector, bio_sectors(bio));
+
+	/*
+	 * If we are using the disk zone write plugs worker instead of the per
+	 * zone write plug BIO work, add the zone write plug to the work list
+	 * if it is not already there. Make sure to also get an extra reference
+	 * on the zone write plug so that it does not go away until it is
+	 * removed from the work list.
+	 */
+	if (blk_queue_zoned_qd1_writes(disk->queue)) {
+		spin_lock(&disk->zone_wplugs_list_lock);
+		if (list_empty(&zwplug->entry)) {
+			list_add_tail(&zwplug->entry, &disk->zone_wplugs_list);
+			refcount_inc(&zwplug->ref);
+		}
+		spin_unlock(&disk->zone_wplugs_list_lock);
+	}
 }
 
 /*
@@ -1423,6 +1473,13 @@
 		goto queue_bio;
 	}
 
+	/*
+	 * For devices limited to QD=1 zoned writes (typically rotational
+	 * devices), we use the gendisk zone write plugs worker instead of
+	 * the per zone write plug BIO work, so queue the BIO.
+	 */
+	if (blk_queue_zoned_qd1_writes(disk->queue))
+		goto queue_bio;
+
 	/* If the zone is already plugged, add the BIO to the BIO plug list. */
 	if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)
 		goto queue_bio;
@@ -1445,7 +1502,10 @@
 
 	if (!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)) {
 		zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
-		disk_zone_wplug_schedule_bio_work(disk, zwplug);
+		if (blk_queue_zoned_qd1_writes(disk->queue))
+			wake_up_process(disk->zone_wplugs_worker);
+		else
+			disk_zone_wplug_schedule_work(disk, zwplug);
 	}
 
 	spin_unlock_irqrestore(&zwplug->lock, flags);
@@ -1586,16 +1646,22 @@
 
 	spin_lock_irqsave(&zwplug->lock, flags);
 
-	/* Schedule submission of the next plugged BIO if we have one. */
-	if (!bio_list_empty(&zwplug->bio_list)) {
-		disk_zone_wplug_schedule_bio_work(disk, zwplug);
-		spin_unlock_irqrestore(&zwplug->lock, flags);
-		return;
-	}
+	/*
+	 * When QD=1 zoned writes are enforced, signal the BIO completion to
+	 * the gendisk zone write plugs worker. Otherwise, schedule submission
+	 * of the next plugged BIO if we have one.
+	 */
+	if (bio_list_empty(&zwplug->bio_list))
+		zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
+
+	if (blk_queue_zoned_qd1_writes(disk->queue))
+		complete(&disk->zone_wplugs_worker_bio_done);
+	else if (!bio_list_empty(&zwplug->bio_list))
+		disk_zone_wplug_schedule_work(disk, zwplug);
 
-	zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
 	if (!zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug))
 		disk_mark_zone_wplug_dead(zwplug);
+
 	spin_unlock_irqrestore(&zwplug->lock, flags);
 }
 
@@ -1685,10 +1751,9 @@ void blk_zone_write_plug_finish_request(struct request *req)
 	disk_put_zone_wplug(zwplug);
 }
 
-static void blk_zone_wplug_bio_work(struct work_struct *work)
+static bool disk_zone_wplug_submit_bio(struct gendisk *disk,
+				       struct blk_zone_wplug *zwplug)
 {
-	struct blk_zone_wplug *zwplug =
-		container_of(work, struct blk_zone_wplug, bio_work);
 	struct block_device *bdev;
 	unsigned long flags;
 	struct bio *bio;
@@ -1704,7 +1769,7 @@
 	if (!bio) {
 		zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
 		spin_unlock_irqrestore(&zwplug->lock, flags);
-		goto put_zwplug;
+		return false;
 	}
 
 	trace_blk_zone_wplug_bio(zwplug->disk->queue, zwplug->zone_no,
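
The unplug path above pairs with the worker loop added later in this patch: the worker submits one write, then blocks until the completion fires from the BIO end_io path, which is what bounds the device-side write queue depth at one. A minimal userspace analogue, with POSIX threads and a semaphore standing in for complete()/blk_wait_io(); all names are hypothetical:

    #include <pthread.h>
    #include <semaphore.h>
    #include <stdio.h>

    static sem_t bio_done;

    /* Stand-in for the device completing a write (end_io -> complete()). */
    static void *fake_device(void *arg)
    {
        (void)arg;
        sem_post(&bio_done);
        return NULL;
    }

    static void submit_one_write(int n)
    {
        pthread_t dev;

        printf("submit write %d\n", n);
        pthread_create(&dev, NULL, fake_device, NULL);
        pthread_detach(dev);
    }

    int main(void)
    {
        sem_init(&bio_done, 0, 0);
        for (int i = 0; i < 3; i++) {
            submit_one_write(i);
            sem_wait(&bio_done);    /* QD=1: wait before the next write */
            printf("write %d done\n", i);
        }
        return 0;
    }
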
@@ -1718,14 +1783,15 @@
 		goto again;
 	}
 
-	bdev = bio->bi_bdev;
-
 	/*
 	 * blk-mq devices will reuse the extra reference on the request queue
 	 * usage counter we took when the BIO was plugged, but the submission
 	 * path for BIO-based devices will not do that. So drop this extra
 	 * reference here.
 	 */
+	if (blk_queue_zoned_qd1_writes(disk->queue))
+		reinit_completion(&disk->zone_wplugs_worker_bio_done);
+	bdev = bio->bi_bdev;
 	if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) {
 		bdev->bd_disk->fops->submit_bio(bio);
 		blk_queue_exit(bdev->bd_disk->queue);
@@ -1733,14 +1799,78 @@
 		blk_mq_submit_bio(bio);
 	}
 
-put_zwplug:
-	/* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */
-	disk_put_zone_wplug(zwplug);
+	return true;
+}
+
+static struct blk_zone_wplug *disk_get_zone_wplugs_work(struct gendisk *disk)
+{
+	struct blk_zone_wplug *zwplug;
+
+	spin_lock_irq(&disk->zone_wplugs_list_lock);
+	zwplug = list_first_entry_or_null(&disk->zone_wplugs_list,
+					  struct blk_zone_wplug, entry);
+	if (zwplug)
+		list_del_init(&zwplug->entry);
+	spin_unlock_irq(&disk->zone_wplugs_list_lock);
+
+	return zwplug;
+}
+
+static int disk_zone_wplugs_worker(void *data)
+{
+	struct gendisk *disk = data;
+	struct blk_zone_wplug *zwplug;
+	unsigned int noio_flag;
+
+	noio_flag = memalloc_noio_save();
+	set_user_nice(current, MIN_NICE);
+	set_freezable();
+
+	for (;;) {
+		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
+
+		zwplug = disk_get_zone_wplugs_work(disk);
+		if (zwplug) {
+			/*
+			 * Process all BIOs of this zone write plug and then
+			 * drop the reference we took when adding the zone write
+			 * plug to the active list.
+			 */
+			set_current_state(TASK_RUNNING);
+			while (disk_zone_wplug_submit_bio(disk, zwplug))
+				blk_wait_io(&disk->zone_wplugs_worker_bio_done);
+			disk_put_zone_wplug(zwplug);
+			continue;
+		}
+
+		/*
+		 * Only sleep if nothing set our state back to running.
+		 * Otherwise, check for zone write plug work again, as a newly
+		 * submitted BIO might have added a zone write plug to the
+		 * work list.
+		 */
+		if (get_current_state() == TASK_RUNNING) {
+			try_to_freeze();
+		} else {
+			if (kthread_should_stop()) {
+				set_current_state(TASK_RUNNING);
+				break;
+			}
+			schedule();
+		}
+	}
+
+	WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list));
+	memalloc_noio_restore(noio_flag);
+
+	return 0;
 }
 
 void disk_init_zone_resources(struct gendisk *disk)
 {
 	spin_lock_init(&disk->zone_wplugs_hash_lock);
+	spin_lock_init(&disk->zone_wplugs_list_lock);
+	INIT_LIST_HEAD(&disk->zone_wplugs_list);
+	init_completion(&disk->zone_wplugs_worker_bio_done);
 }
 
 /*
@@ -1756,6 +1886,7 @@ static int disk_alloc_zone_resources(struct gendisk *disk,
 				     unsigned int pool_size)
 {
 	unsigned int i;
+	int ret = -ENOMEM;
 
 	atomic_set(&disk->nr_zone_wplugs, 0);
 	disk->zone_wplugs_hash_bits =
@@ -1781,8 +1912,21 @@
 	if (!disk->zone_wplugs_wq)
 		goto destroy_pool;
 
+	disk->zone_wplugs_worker =
+		kthread_create(disk_zone_wplugs_worker, disk,
+			       "%s_zwplugs_worker", disk->disk_name);
+	if (IS_ERR(disk->zone_wplugs_worker)) {
+		ret = PTR_ERR(disk->zone_wplugs_worker);
+		disk->zone_wplugs_worker = NULL;
+		goto destroy_wq;
+	}
+	wake_up_process(disk->zone_wplugs_worker);
+
 	return 0;
 
+destroy_wq:
+	destroy_workqueue(disk->zone_wplugs_wq);
+	disk->zone_wplugs_wq = NULL;
 destroy_pool:
 	mempool_destroy(disk->zone_wplugs_pool);
 	disk->zone_wplugs_pool = NULL;
@@ -1790,7 +1934,7 @@ static int disk_alloc_zone_resources(struct gendisk *disk,
 	kfree(disk->zone_wplugs_hash);
 	disk->zone_wplugs_hash = NULL;
 	disk->zone_wplugs_hash_bits = 0;
-	return -ENOMEM;
+	return ret;
 }
 
 static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
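
The error unwinding in disk_alloc_zone_resources() above follows the usual staged-goto style: each failure jumps to a label that releases only what was acquired before it, and ret now carries kthread_create()'s precise error instead of a blanket -ENOMEM. A self-contained illustration of the pattern, with malloc() standing in for the mempool/workqueue/kthread resources and fail_at as a made-up test knob:

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    static int alloc_resources(int fail_at)
    {
        int ret = -ENOMEM;
        void *pool, *wq;

        pool = (fail_at == 1) ? NULL : malloc(32);
        if (!pool)
            goto out;

        wq = (fail_at == 2) ? NULL : malloc(32);
        if (!wq)
            goto destroy_pool;

        if (fail_at == 3) {
            ret = -EINTR;       /* e.g. what kthread_create() returned */
            goto destroy_wq;
        }

        /*
         * Success: real code would keep the resources; this demo frees
         * them only to stay leak-free.
         */
        free(wq);
        free(pool);
        return 0;

    destroy_wq:
        free(wq);
    destroy_pool:
        free(pool);
    out:
        return ret;
    }

    int main(void)
    {
        for (int i = 0; i <= 3; i++)
            printf("fail_at=%d -> %d\n", i, alloc_resources(i));
        return 0;
    }
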
@@ -1840,6 +1984,10 @@ static void disk_set_zones_cond_array(struct gendisk *disk, u8 *zones_cond)
 
 void disk_free_zone_resources(struct gendisk *disk)
 {
+	if (disk->zone_wplugs_worker)
+		kthread_stop(disk->zone_wplugs_worker);
+	WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list));
+
 	if (disk->zone_wplugs_wq) {
 		destroy_workqueue(disk->zone_wplugs_wq);
 		disk->zone_wplugs_wq = NULL;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a49a1e38c6e7..ef6457487d23 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -13,6 +13,7 @@
 #include
 #include
 #include
+#include <linux/completion.h>
 #include
 #include
 #include
@@ -204,6 +205,10 @@ struct gendisk {
 	struct mempool *zone_wplugs_pool;
 	struct hlist_head *zone_wplugs_hash;
 	struct workqueue_struct *zone_wplugs_wq;
+	spinlock_t zone_wplugs_list_lock;
+	struct list_head zone_wplugs_list;
+	struct task_struct *zone_wplugs_worker;
+	struct completion zone_wplugs_worker_bio_done;
 #endif /* CONFIG_BLK_DEV_ZONED */
 
 #if IS_ENABLED(CONFIG_CDROM)
@@ -668,6 +673,7 @@
 	QUEUE_FLAG_NO_ELV_SWITCH,	/* can't switch elevator any more */
 	QUEUE_FLAG_QOS_ENABLED,		/* qos is enabled */
 	QUEUE_FLAG_BIO_ISSUE_TIME,	/* record bio->issue_time_ns */
+	QUEUE_FLAG_ZONED_QD1_WRITES,	/* limit zoned device writes to QD=1 */
 	QUEUE_FLAG_MAX
 };
 
@@ -707,6 +713,8 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);
 	test_bit(QUEUE_FLAG_DISABLE_WBT_DEF, &(q)->queue_flags)
 #define blk_queue_no_elv_switch(q)	\
 	test_bit(QUEUE_FLAG_NO_ELV_SWITCH, &(q)->queue_flags)
+#define blk_queue_zoned_qd1_writes(q)	\
+	test_bit(QUEUE_FLAG_ZONED_QD1_WRITES, &(q)->queue_flags)
 
 extern void blk_set_pm_only(struct request_queue *q);
 extern void blk_clear_pm_only(struct request_queue *q);
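
Behind the new macro, blk_queue_flag_set()/blk_queue_flag_clear() and test_bit() reduce to atomic bit operations on q->queue_flags: the sysfs store path flips one bit under queue freeze, and the hot paths only ever test it. A userspace sketch of the same pattern using C11 atomics; the bit number 27 is made up, since the kernel assigns the real value via the enum above:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    enum { QUEUE_FLAG_ZONED_QD1_WRITES = 27 };  /* illustrative bit number */

    static _Atomic unsigned long queue_flags;

    static void queue_flag_set(unsigned int flag)
    {
        atomic_fetch_or(&queue_flags, 1UL << flag);
    }

    static void queue_flag_clear(unsigned int flag)
    {
        atomic_fetch_and(&queue_flags, ~(1UL << flag));
    }

    static bool queue_zoned_qd1_writes(void)
    {
        return atomic_load(&queue_flags) &
               (1UL << QUEUE_FLAG_ZONED_QD1_WRITES);
    }

    int main(void)
    {
        queue_flag_set(QUEUE_FLAG_ZONED_QD1_WRITES);
        printf("qd1 writes: %d\n", queue_zoned_qd1_writes());
        queue_flag_clear(QUEUE_FLAG_ZONED_QD1_WRITES);
        printf("qd1 writes: %d\n", queue_zoned_qd1_writes());
        return 0;
    }
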