Diffstat (limited to 'block')

-rw-r--r--  block/Kconfig.iosched    |  29
-rw-r--r--  block/Makefile           |   2
-rw-r--r--  block/blk-cgroup.c       |   5
-rw-r--r--  block/blk-ioc.c          |   4
-rw-r--r--  block/blk-settings.c     |   4
-rw-r--r--  block/cfq-iosched.c      |   5
-rw-r--r--  block/deadline-iosched.c |   4
-rw-r--r--  block/genhd.c            |  43
-rw-r--r--  block/noop-iosched.c     |   4
-rw-r--r--  block/row-iosched.c      | 694
-rw-r--r--  block/sio-iosched.c      | 405

11 files changed, 1193 insertions(+), 6 deletions(-)
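
Note: both new schedulers register with the elevator core, so on a kernel built with these options the active scheduler can be switched per device at runtime through the standard sysfs elevator interface; reading the same file back lists the available schedulers with the active one in brackets. A minimal user-space sketch (the device name mmcblk0 is an assumption for illustration, not part of this change):

	/* Select an I/O scheduler for one block device via sysfs.
	 * The path is the standard elevator interface; the device
	 * name below is illustrative only. */
	#include <stdio.h>

	static int set_iosched(const char *dev, const char *sched)
	{
		char path[128];
		FILE *f;

		snprintf(path, sizeof(path), "/sys/block/%s/queue/scheduler", dev);
		f = fopen(path, "w");
		if (!f)
			return -1;
		fprintf(f, "%s\n", sched);	/* e.g. "row", "sio", "cfq" */
		return fclose(f);
	}

	int main(void)
	{
		return set_iosched("mmcblk0", "row") ? 1 : 0;
	}
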
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 3199b76f795..a51d5ce75ca 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -43,6 +43,27 @@ config CFQ_GROUP_IOSCHED
 	---help---
 	  Enable group IO scheduling in CFQ.
 
+config IOSCHED_ROW
+	tristate "ROW I/O scheduler"
+	default y
+	---help---
+	  The ROW I/O scheduler gives priority to READ requests over
+	  WRITE requests when dispatching, without starving WRITE requests.
+	  Requests are kept in priority queues. Dispatching is done in a
+	  round-robin manner, where the dispatch quantum for each queue is
+	  set according to the queue's priority.
+	  Most suitable for mobile devices.
+
+config IOSCHED_SIO
+	tristate "Simple I/O scheduler"
+	default y
+	---help---
+	  The Simple I/O scheduler is an extremely simple scheduler,
+	  based on noop and deadline, that relies on deadlines to
+	  ensure fairness. The algorithm does no sorting, only basic
+	  merging, to keep overhead minimal. It is aimed mainly at
+	  random-access devices (e.g. flash devices).
+
 choice
 	prompt "Default I/O scheduler"
 	default DEFAULT_CFQ
@@ -56,6 +77,12 @@ choice
 	config DEFAULT_CFQ
 		bool "CFQ" if IOSCHED_CFQ=y
 
+	config DEFAULT_ROW
+		bool "ROW" if IOSCHED_ROW=y
+
+	config DEFAULT_SIO
+		bool "SIO" if IOSCHED_SIO=y
+
 	config DEFAULT_NOOP
 		bool "No-op"
 
@@ -65,6 +92,8 @@ config DEFAULT_IOSCHED
 	string
 	default "deadline" if DEFAULT_DEADLINE
 	default "cfq" if DEFAULT_CFQ
+	default "row" if DEFAULT_ROW
+	default "sio" if DEFAULT_SIO
 	default "noop" if DEFAULT_NOOP
 
 endmenu
diff --git a/block/Makefile b/block/Makefile
index 0fec4b3fab5..43f47fff211 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -13,6 +13,8 @@ obj-$(CONFIG_BLK_DEV_THROTTLING)	+= blk-throttle.o
 obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
+obj-$(CONFIG_IOSCHED_ROW)	+= row-iosched.o
+obj-$(CONFIG_IOSCHED_SIO)	+= sio-iosched.o
 
 obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
 obj-$(CONFIG_BLK_DEV_INTEGRITY)	+= blk-integrity.o
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b596e54ddd7..345843f4bb5 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1660,7 +1660,10 @@ static void __exit exit_cgroup_blkio(void)
 {
 	cgroup_unload_subsys(&blkio_subsys);
 }
-
+#ifdef CONFIG_FAST_RESUME
+beforeresume_initcall(init_cgroup_blkio);
+#else
 module_init(init_cgroup_blkio);
+#endif
 module_exit(exit_cgroup_blkio);
 MODULE_LICENSE("GPL");
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 342eae9b0d3..9be30fe6175 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -161,4 +161,8 @@ static int __init blk_ioc_init(void)
 			sizeof(struct io_context), 0, SLAB_PANIC, NULL);
 	return 0;
 }
+#ifdef CONFIG_FAST_RESUME
+beforeresume_initcall(blk_ioc_init);
+#else
 subsys_initcall(blk_ioc_init);
+#endif
diff --git a/block/blk-settings.c b/block/blk-settings.c
index fa1eb0449a0..dfd0270ec19 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -803,4 +803,8 @@ static int __init blk_settings_init(void)
 	blk_max_pfn = max_pfn - 1;
 	return 0;
 }
+#ifdef CONFIG_FAST_RESUME
+beforeresume_initcall(blk_settings_init);
+#else
 subsys_initcall(blk_settings_init);
+#endif
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 23500ac7f0f..bfe3bbe0097 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2060,6 +2060,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
 	cfq_remove_request(rq);
 	cfqq->dispatched++;
 	(RQ_CFQG(rq))->dispatched++;
+	rq->ioprio = IOPRIO_PRIO_VALUE(cfqq->ioprio_class, cfqq->ioprio);
 	elv_dispatch_sort(q, rq);
 
 	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
@@ -4289,7 +4290,11 @@ static void __exit cfq_exit(void)
 	cfq_slab_kill();
 }
 
+#ifdef CONFIG_FAST_RESUME
+beforeresume_initcall(cfq_init);
+#else
 module_init(cfq_init);
+#endif
 module_exit(cfq_exit);
 
 MODULE_AUTHOR("Jens Axboe");
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index 5139c0ea186..301a779ea8b 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -460,7 +460,11 @@ static void __exit deadline_exit(void)
 	elv_unregister(&iosched_deadline);
 }
 
+#ifdef CONFIG_FAST_RESUME
+beforeresume_initcall(deadline_init);
+#else
 module_init(deadline_init);
+#endif
 module_exit(deadline_exit);
 
 MODULE_AUTHOR("Jens Axboe");
diff --git a/block/genhd.c b/block/genhd.c
index e9a5220960b..6ece7ab1996 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -918,7 +918,11 @@ static int __init genhd_device_init(void)
 	return 0;
 }
 
+#ifdef CONFIG_FAST_RESUME
+beforeresume_initcall(genhd_device_init);
+#else
 subsys_initcall(genhd_device_init);
+#endif
 
 static ssize_t disk_range_show(struct device *dev,
 			       struct device_attribute *attr, char *buf)
@@ -1118,6 +1122,26 @@ static void disk_release(struct device *dev)
 	blk_put_queue(disk->queue);
 	kfree(disk);
 }
+
+static int disk_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+	struct disk_part_iter piter;
+	struct hd_struct *part;
+	int cnt = 0;
+
+	disk_part_iter_init(&piter, disk, 0);
+	while ((part = disk_part_iter_next(&piter)))
+		cnt++;
+	disk_part_iter_exit(&piter);
+	add_uevent_var(env, "NPARTS=%u", cnt);
+#ifdef CONFIG_USB_HOST_NOTIFY
+	if (disk->interfaces == GENHD_IF_USB)
+		add_uevent_var(env, "MEDIAPRST=%d", disk->media_present);
+#endif
+	return 0;
+}
+
 struct class block_class = {
 	.name		= "block",
 };
@@ -1136,6 +1160,7 @@ static struct device_type disk_type = {
 	.groups		= disk_attr_groups,
 	.release	= disk_release,
 	.devnode	= block_devnode,
+	.uevent		= disk_uevent,
 };
 
 #ifdef CONFIG_PROC_FS
@@ -1591,12 +1616,15 @@ static void disk_events_workfn(struct work_struct *work)
 	struct gendisk *disk = ev->disk;
 	char *envp[ARRAY_SIZE(disk_uevents) + 1] = { };
 	unsigned int clearing = ev->clearing;
-	unsigned int events;
+	unsigned int events = 0;
 	unsigned long intv;
 	int nr_events = 0, i;
 
-	/* check events */
-	events = disk->fops->check_events(disk, clearing);
+#ifdef CONFIG_USB_HOST_NOTIFY
+	if (disk->interfaces != GENHD_IF_USB)
+		/* check events */
+		events = disk->fops->check_events(disk, clearing);
+#endif
 
 	/* accumulate pending events and schedule next poll if necessary */
 	spin_lock_irq(&ev->lock);
@@ -1620,8 +1648,13 @@ static void disk_events_workfn(struct work_struct *work)
 		if (events & disk->events & (1 << i))
 			envp[nr_events++] = disk_uevents[i];
 
-	if (nr_events)
-		kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
+#ifdef CONFIG_USB_HOST_NOTIFY
+	if (disk->interfaces != GENHD_IF_USB) {
+		if (nr_events)
+			kobject_uevent_env(&disk_to_dev(disk)->kobj,
+					KOBJ_CHANGE, envp);
+	}
+#endif
 }
 
 /*
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
index 06389e9ef96..18f8bddf919 100644
--- a/block/noop-iosched.c
+++ b/block/noop-iosched.c
@@ -104,7 +104,11 @@ static void __exit noop_exit(void)
 	elv_unregister(&elevator_noop);
 }
 
+#ifdef CONFIG_FAST_RESUME
+beforeresume_initcall(noop_init);
+#else
 module_init(noop_init);
+#endif
 module_exit(noop_exit);
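
The new file below implements the scheme summarized in the Kconfig help: seven priority queues served round-robin, where each queue may dispatch up to its quantum per cycle and a wrap restarts the cycle from the highest priority. A stand-alone model of just that quantum accounting (idling and unserved-queue preemption are omitted; the quanta mirror queue_quantum[] from the patch, the backlog numbers are invented):

	#include <stdio.h>

	#define NQ 7
	static const int quantum[NQ] = { 100, 100, 2, 1, 1, 1, 1 };

	int main(void)
	{
		int pending[NQ] = { 120, 40, 0, 5, 30, 0, 3 };	/* invented backlog */
		int dispatched[NQ] = { 0 };
		int q = 0, total = 198, order = 0;

		while (total) {
			if (pending[q] && dispatched[q] < quantum[q]) {
				pending[q]--;
				dispatched[q]++;
				total--;
				order++;
				continue;
			}
			/* Quantum exhausted or queue empty: move on. Wrapping
			 * around restarts the cycle from the highest priority,
			 * analogous to row_restart_disp_cycle() below. */
			if (++q == NQ) {
				int i;
				q = 0;
				for (i = 0; i < NQ; i++)
					dispatched[i] = 0;
			}
		}
		printf("dispatched %d requests\n", order);
		return 0;
	}
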
diff --git a/block/row-iosched.c b/block/row-iosched.c
new file mode 100644
index 00000000000..e60ff4ef629
--- /dev/null
+++ b/block/row-iosched.c
@@ -0,0 +1,694 @@
+/*
+ * ROW (Read Over Write) I/O scheduler.
+ *
+ * Copyright (c) 2012, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+/* See Documentation/block/row-iosched.txt */
+
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/elevator.h>
+#include <linux/bio.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/compiler.h>
+#include <linux/blktrace_api.h>
+#include <linux/jiffies.h>
+
+/*
+ * enum row_queue_prio - Priorities of the ROW queues
+ *
+ * This enum defines the priorities (and the number of queues)
+ * the requests will be distributed to. The higher the priority,
+ * the bigger the dispatch quantum given to that queue.
+ * ROWQ_PRIO_HIGH_READ is the highest priority queue.
+ *
+ */
+enum row_queue_prio {
+	ROWQ_PRIO_HIGH_READ = 0,
+	ROWQ_PRIO_REG_READ,
+	ROWQ_PRIO_HIGH_SWRITE,
+	ROWQ_PRIO_REG_SWRITE,
+	ROWQ_PRIO_REG_WRITE,
+	ROWQ_PRIO_LOW_READ,
+	ROWQ_PRIO_LOW_SWRITE,
+	ROWQ_MAX_PRIO,
+};
+
+/* Flags indicating whether idling is enabled on the queue */
+static const bool queue_idling_enabled[] = {
+	true,	/* ROWQ_PRIO_HIGH_READ */
+	true,	/* ROWQ_PRIO_REG_READ */
+	false,	/* ROWQ_PRIO_HIGH_SWRITE */
+	false,	/* ROWQ_PRIO_REG_SWRITE */
+	false,	/* ROWQ_PRIO_REG_WRITE */
+	false,	/* ROWQ_PRIO_LOW_READ */
+	false,	/* ROWQ_PRIO_LOW_SWRITE */
+};
+
+/* Default values for row queue quantums in each dispatch cycle */
+static const int queue_quantum[] = {
+	100,	/* ROWQ_PRIO_HIGH_READ */
+	100,	/* ROWQ_PRIO_REG_READ */
+	2,	/* ROWQ_PRIO_HIGH_SWRITE */
+	1,	/* ROWQ_PRIO_REG_SWRITE */
+	1,	/* ROWQ_PRIO_REG_WRITE */
+	1,	/* ROWQ_PRIO_LOW_READ */
+	1	/* ROWQ_PRIO_LOW_SWRITE */
+};
+
+/* Default values for idling on read queues */
+#define ROW_IDLE_TIME_MSEC 5	/* msec */
+#define ROW_READ_FREQ_MSEC 20	/* msec */
+
+/**
+ * struct rowq_idling_data - parameters for idling on the queue
+ * @last_insert_time:	time the last request was inserted
+ *			into the queue
+ * @begin_idling:	flag indicating whether we should idle
+ *
+ */
+struct rowq_idling_data {
+	ktime_t			last_insert_time;
+	bool			begin_idling;
+};
+
+/**
+ * struct row_queue - requests grouping structure
+ * @rdata:		parent row_data structure
+ * @fifo:		fifo of requests
+ * @prio:		queue priority (enum row_queue_prio)
+ * @nr_dispatched:	number of requests already dispatched in
+ *			the current dispatch cycle
+ * @slice:		number of requests to dispatch in a cycle
+ * @idle_data:		data for idling on queues
+ *
+ */
+struct row_queue {
+	struct row_data		*rdata;
+	struct list_head	fifo;
+	enum row_queue_prio	prio;
+
+	unsigned int		nr_dispatched;
+	unsigned int		slice;
+
+	/* used only for READ queues */
+	struct rowq_idling_data	idle_data;
+};
+
+/**
+ * struct idling_data - data for idling on empty rqueue
+ * @idle_time:		idling duration (jiffies)
+ * @freq:		min time between two requests that
+ *			trigger idling (msec)
+ * @idle_work:		idling delayed work (runs on idle_workqueue)
+ *
+ */
+struct idling_data {
+	unsigned long		idle_time;
+	u32			freq;
+
+	struct workqueue_struct	*idle_workqueue;
+	struct delayed_work	idle_work;
+};
+
+/**
+ * struct row_data - per block device rqueue structure
+ * @dispatch_queue:	dispatch rqueue
+ * @row_queues:		array of priority request queues with
+ *			dispatch quantum per rqueue
+ * @curr_queue:		index in the row_queues array of the
+ *			currently serviced rqueue
+ * @read_idle:		data for idling after READ request
+ * @nr_reqs:		nr_reqs[0] holds the number of all READ requests in
+ *			scheduler, nr_reqs[1] holds the number of all WRITE
+ *			requests in scheduler
+ * @cycle_flags:	used for marking unserved queues
+ *
+ */
+struct row_data {
+	struct request_queue		*dispatch_queue;
+
+	struct {
+		struct row_queue	rqueue;
+		int			disp_quantum;
+	} row_queues[ROWQ_MAX_PRIO];
+
+	enum row_queue_prio		curr_queue;
+
+	struct idling_data		read_idle;
+	unsigned int			nr_reqs[2];
+
+	unsigned int			cycle_flags;
+};
+
+#define RQ_ROWQ(rq) ((struct row_queue *) ((rq)->elevator_private[0]))
+
+#define row_log(q, fmt, args...) \
+	blk_add_trace_msg(q, "%s():" fmt , __func__, ##args)
+#define row_log_rowq(rdata, rowq_id, fmt, args...)		\
+	blk_add_trace_msg(rdata->dispatch_queue, "rowq%d " fmt, \
+		rowq_id, ##args)
+
+static inline void row_mark_rowq_unserved(struct row_data *rd,
+					  enum row_queue_prio qnum)
+{
+	rd->cycle_flags |= (1 << qnum);
+}
+
+static inline void row_clear_rowq_unserved(struct row_data *rd,
+					   enum row_queue_prio qnum)
+{
+	rd->cycle_flags &= ~(1 << qnum);
+}
+
+static inline int row_rowq_unserved(struct row_data *rd,
+				    enum row_queue_prio qnum)
+{
+	return rd->cycle_flags & (1 << qnum);
+}
+
+/******************** Static helper functions ***********************/
+/*
+ * kick_queue() - Wake up device driver queue thread
+ * @work:	pointer to struct work_struct
+ *
+ * This is an idling delayed work function. Its purpose is to wake up the
+ * device driver in order for it to start fetching requests.
+ *
+ */
+static void kick_queue(struct work_struct *work)
+{
+	struct delayed_work *idle_work = to_delayed_work(work);
+	struct idling_data *read_data =
+		container_of(idle_work, struct idling_data, idle_work);
+	struct row_data *rd =
+		container_of(read_data, struct row_data, read_idle);
+
+	row_log_rowq(rd, rd->curr_queue, "Performing delayed work");
+	/* Mark idling process as done */
+	rd->row_queues[rd->curr_queue].rqueue.idle_data.begin_idling = false;
+
+	if (!(rd->nr_reqs[0] + rd->nr_reqs[1]))
+		row_log(rd->dispatch_queue, "No requests in scheduler");
+	else {
+		spin_lock_irq(rd->dispatch_queue->queue_lock);
+		__blk_run_queue(rd->dispatch_queue);
+		spin_unlock_irq(rd->dispatch_queue->queue_lock);
+	}
+}
+
+/*
+ * row_restart_disp_cycle() - Restart the dispatch cycle
+ * @rd:	pointer to struct row_data
+ *
+ * This function restarts the dispatch cycle by:
+ * - Setting the current queue to ROWQ_PRIO_HIGH_READ
+ * - For each queue: resetting the number of requests dispatched in
+ *   the cycle
+ */
+static inline void row_restart_disp_cycle(struct row_data *rd)
+{
+	int i;
+
+	for (i = 0; i < ROWQ_MAX_PRIO; i++)
+		rd->row_queues[i].rqueue.nr_dispatched = 0;
+
+	rd->curr_queue = ROWQ_PRIO_HIGH_READ;
+	row_log(rd->dispatch_queue, "Restarting cycle");
+}
+
+static inline void row_get_next_queue(struct row_data *rd)
+{
+	rd->curr_queue++;
+	if (rd->curr_queue == ROWQ_MAX_PRIO)
+		row_restart_disp_cycle(rd);
+}
+
+/******************* Elevator callback functions *********************/
+
+/*
+ * row_add_request() - Add request to the scheduler
+ * @q:	requests queue
+ * @rq:	request to add
+ *
+ */
+static void row_add_request(struct request_queue *q,
+			    struct request *rq)
+{
+	struct row_data *rd = (struct row_data *)q->elevator->elevator_data;
+	struct row_queue *rqueue = RQ_ROWQ(rq);
+
+	list_add_tail(&rq->queuelist, &rqueue->fifo);
+	rd->nr_reqs[rq_data_dir(rq)]++;
+	rq_set_fifo_time(rq, jiffies); /* for statistics */
+
+	if (queue_idling_enabled[rqueue->prio]) {
+		if (delayed_work_pending(&rd->read_idle.idle_work))
+			(void)cancel_delayed_work(
+				&rd->read_idle.idle_work);
+		if (ktime_to_ms(ktime_sub(ktime_get(),
+				rqueue->idle_data.last_insert_time)) <
+				rd->read_idle.freq) {
+			rqueue->idle_data.begin_idling = true;
+			row_log_rowq(rd, rqueue->prio, "Enable idling");
+		} else {
+			rqueue->idle_data.begin_idling = false;
+			row_log_rowq(rd, rqueue->prio, "Disable idling");
+		}
+
+		rqueue->idle_data.last_insert_time = ktime_get();
+	}
+	row_log_rowq(rd, rqueue->prio, "added request");
+}
+
+/*
+ * row_remove_request() - Remove given request from scheduler
+ * @q:	requests queue
+ * @rq:	request to remove
+ *
+ */
+static void row_remove_request(struct request_queue *q,
+			       struct request *rq)
+{
+	struct row_data *rd = (struct row_data *)q->elevator->elevator_data;
+
+	rq_fifo_clear(rq);
+	rd->nr_reqs[rq_data_dir(rq)]--;
+}
+
+/*
+ * row_dispatch_insert() - move request to dispatch queue
+ * @rd:	pointer to struct row_data
+ *
+ * This function moves the next request to dispatch from
+ * rd->curr_queue to the dispatch queue
+ *
+ */
+static void row_dispatch_insert(struct row_data *rd)
+{
+	struct request *rq;
+
+	rq = rq_entry_fifo(rd->row_queues[rd->curr_queue].rqueue.fifo.next);
+	row_remove_request(rd->dispatch_queue, rq);
+	elv_dispatch_add_tail(rd->dispatch_queue, rq);
+	rd->row_queues[rd->curr_queue].rqueue.nr_dispatched++;
+	row_clear_rowq_unserved(rd, rd->curr_queue);
+	row_log_rowq(rd, rd->curr_queue, " Dispatched request nr_disp = %d",
+		     rd->row_queues[rd->curr_queue].rqueue.nr_dispatched);
+}
+
+/*
+ * row_choose_queue() - choose the next queue to dispatch from
+ * @rd:	pointer to struct row_data
+ *
+ * Updates rd->curr_queue. Returns 1 if there are requests to
+ * dispatch, 0 if there are no requests in the scheduler
+ *
+ */
+static int row_choose_queue(struct row_data *rd)
+{
+	int prev_curr_queue = rd->curr_queue;
+
+	if (!(rd->nr_reqs[0] + rd->nr_reqs[1])) {
+		row_log(rd->dispatch_queue, "No more requests in scheduler");
+		return 0;
+	}
+
+	row_get_next_queue(rd);
+
+	/*
+	 * Loop over all queues to find the next queue that is not empty.
+	 * Stop when you get back to curr_queue
+	 */
+	while (list_empty(&rd->row_queues[rd->curr_queue].rqueue.fifo)
+	       && rd->curr_queue != prev_curr_queue) {
+		/* Mark rqueue as unserved */
+		row_mark_rowq_unserved(rd, rd->curr_queue);
+		row_get_next_queue(rd);
+	}
+
+	return 1;
+}
+
+/*
+ * row_dispatch_requests() - selects the next request to dispatch
+ * @q:		requests queue
+ * @force:	flag indicating a forced dispatch
+ *
+ * Return 0 if no requests were moved to the dispatch queue.
+ *	  1 otherwise
+ *
+ */
+static int row_dispatch_requests(struct request_queue *q, int force)
+{
+	struct row_data *rd = (struct row_data *)q->elevator->elevator_data;
+	int ret = 0, currq, i;
+
+	currq = rd->curr_queue;
+
+	/*
+	 * Find the first unserved queue (with higher priority than currq)
+	 * that is not empty
+	 */
+	for (i = 0; i < currq; i++) {
+		if (row_rowq_unserved(rd, i) &&
+		    !list_empty(&rd->row_queues[i].rqueue.fifo)) {
+			row_log_rowq(rd, currq,
+				" Preempting for unserved rowq%d", i);
+			rd->curr_queue = i;
+			row_dispatch_insert(rd);
+			ret = 1;
+			goto done;
+		}
+	}
+
+	if (rd->row_queues[currq].rqueue.nr_dispatched >=
+	    rd->row_queues[currq].disp_quantum) {
+		rd->row_queues[currq].rqueue.nr_dispatched = 0;
+		row_log_rowq(rd, currq, "Expiring rqueue");
+		ret = row_choose_queue(rd);
+		if (ret)
+			row_dispatch_insert(rd);
+		goto done;
+	}
+
+	/* Dispatch from curr_queue */
+	if (list_empty(&rd->row_queues[currq].rqueue.fifo)) {
+		/* check idling */
+		if (delayed_work_pending(&rd->read_idle.idle_work)) {
+			if (force) {
+				(void)cancel_delayed_work(
+					&rd->read_idle.idle_work);
+				row_log_rowq(rd, currq,
+					"Canceled delayed work - forced dispatch");
+			} else {
+				row_log_rowq(rd, currq,
+					"Delayed work pending. Exiting");
+				goto done;
+			}
+		}
+
+		if (!force && queue_idling_enabled[currq] &&
+		    rd->row_queues[currq].rqueue.idle_data.begin_idling) {
+			if (!queue_delayed_work(rd->read_idle.idle_workqueue,
+						&rd->read_idle.idle_work,
+						rd->read_idle.idle_time)) {
+				row_log_rowq(rd, currq,
+					"Work already on queue!");
+				pr_err("ROW_BUG: Work already on queue!\n");
+			} else
+				row_log_rowq(rd, currq,
+					"Scheduled delayed work. Exiting");
+			goto done;
+		} else {
+			row_log_rowq(rd, currq,
+				"Currq empty. Choose next queue");
+			ret = row_choose_queue(rd);
+			if (!ret)
+				goto done;
+		}
+	}
+
+	ret = 1;
+	row_dispatch_insert(rd);
+
+done:
+	return ret;
+}
+
+/*
+ * row_init_queue() - Init scheduler data structures
+ * @q:	requests queue
+ *
+ * Return pointer to struct row_data to be saved in elevator for
+ * this dispatch queue
+ *
+ */
+static void *row_init_queue(struct request_queue *q)
+{
+	struct row_data *rdata;
+	int i;
+
+	rdata = kmalloc_node(sizeof(*rdata),
+			     GFP_KERNEL | __GFP_ZERO, q->node);
+	if (!rdata)
+		return NULL;
+
+	for (i = 0; i < ROWQ_MAX_PRIO; i++) {
+		INIT_LIST_HEAD(&rdata->row_queues[i].rqueue.fifo);
+		rdata->row_queues[i].disp_quantum = queue_quantum[i];
+		rdata->row_queues[i].rqueue.rdata = rdata;
+		rdata->row_queues[i].rqueue.prio = i;
+		rdata->row_queues[i].rqueue.idle_data.begin_idling = false;
+		rdata->row_queues[i].rqueue.idle_data.last_insert_time =
+			ktime_set(0, 0);
+	}
+
+	/*
+	 * Currently idling is enabled only for READ queues. If we want to
+	 * enable it for write queues also, note that idling frequency will
+	 * be the same in both cases
+	 */
+	rdata->read_idle.idle_time = msecs_to_jiffies(ROW_IDLE_TIME_MSEC);
+	/* Maybe 0 on some platforms */
+	if (!rdata->read_idle.idle_time)
+		rdata->read_idle.idle_time = 1;
+	rdata->read_idle.freq = ROW_READ_FREQ_MSEC;
+	rdata->read_idle.idle_workqueue = alloc_workqueue("row_idle_work",
+					    WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
+	if (!rdata->read_idle.idle_workqueue)
+		panic("Failed to create idle workqueue\n");
+	INIT_DELAYED_WORK(&rdata->read_idle.idle_work, kick_queue);
+
+	rdata->curr_queue = ROWQ_PRIO_HIGH_READ;
+	rdata->dispatch_queue = q;
+
+	rdata->nr_reqs[READ] = rdata->nr_reqs[WRITE] = 0;
+
+	return rdata;
+}
+
+/*
+ * row_exit_queue() - called on unloading the ROW scheduler
+ * @e:	pointer to struct elevator_queue
+ *
+ */
+static void row_exit_queue(struct elevator_queue *e)
+{
+	struct row_data *rd = (struct row_data *)e->elevator_data;
+	int i;
+
+	for (i = 0; i < ROWQ_MAX_PRIO; i++)
+		BUG_ON(!list_empty(&rd->row_queues[i].rqueue.fifo));
+	(void)cancel_delayed_work_sync(&rd->read_idle.idle_work);
+	BUG_ON(delayed_work_pending(&rd->read_idle.idle_work));
+	destroy_workqueue(rd->read_idle.idle_workqueue);
+	kfree(rd);
+}
+
+/*
+ * row_merged_requests() - Called when 2 requests are merged
+ * @q:		requests queue
+ * @rq:		request the two requests were merged into
+ * @next:	request that was merged
+ */
+static void row_merged_requests(struct request_queue *q, struct request *rq,
+				struct request *next)
+{
+	struct row_queue *rqueue = RQ_ROWQ(next);
+
+	list_del_init(&next->queuelist);
+
+	rqueue->rdata->nr_reqs[rq_data_dir(rq)]--;
+}
+
+/*
+ * get_queue_type() - Get queue type for a given request
+ *
+ * This is a helper function whose purpose is to determine which
+ * ROW queue the given request should be added to (and
+ * dispatched from later on)
+ *
+ * TODO: Right now only 3 queues are used: REG_READ, REG_WRITE
+ * and REG_SWRITE
+ */
+static enum row_queue_prio get_queue_type(struct request *rq)
+{
+	const int data_dir = rq_data_dir(rq);
+	const bool is_sync = rq_is_sync(rq);
+
+	if (data_dir == READ)
+		return ROWQ_PRIO_REG_READ;
+	else if (is_sync)
+		return ROWQ_PRIO_REG_SWRITE;
+	else
+		return ROWQ_PRIO_REG_WRITE;
+}
+
+/*
+ * row_set_request() - Set ROW data structures associated with this request.
+ * @q:		requests queue
+ * @rq:		pointer to the request
+ * @gfp_mask:	ignored
+ *
+ */
+static int
+row_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
+{
+	struct row_data *rd = (struct row_data *)q->elevator->elevator_data;
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	rq->elevator_private[0] =
+		(void *)(&rd->row_queues[get_queue_type(rq)]);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	return 0;
+}
+
+/********** Helper sysfs functions/definitions for ROW attributes ******/
+static ssize_t row_var_show(int var, char *page)
+{
+	return snprintf(page, 100, "%d\n", var);
+}
+
+static ssize_t row_var_store(int *var, const char *page, size_t count)
+{
+	int err;
+	err = kstrtoint(page, 10, var);
+
+	return count;
+}
+
+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
+static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
+{									\
+	struct row_data *rowd = e->elevator_data;			\
+	int __data = __VAR;						\
+	if (__CONV)							\
+		__data = jiffies_to_msecs(__data);			\
+	return row_var_show(__data, (page));				\
+}
+SHOW_FUNCTION(row_hp_read_quantum_show,
+	rowd->row_queues[ROWQ_PRIO_HIGH_READ].disp_quantum, 0);
+SHOW_FUNCTION(row_rp_read_quantum_show,
+	rowd->row_queues[ROWQ_PRIO_REG_READ].disp_quantum, 0);
+SHOW_FUNCTION(row_hp_swrite_quantum_show,
+	rowd->row_queues[ROWQ_PRIO_HIGH_SWRITE].disp_quantum, 0);
+SHOW_FUNCTION(row_rp_swrite_quantum_show,
+	rowd->row_queues[ROWQ_PRIO_REG_SWRITE].disp_quantum, 0);
+SHOW_FUNCTION(row_rp_write_quantum_show,
+	rowd->row_queues[ROWQ_PRIO_REG_WRITE].disp_quantum, 0);
+SHOW_FUNCTION(row_lp_read_quantum_show,
+	rowd->row_queues[ROWQ_PRIO_LOW_READ].disp_quantum, 0);
+SHOW_FUNCTION(row_lp_swrite_quantum_show,
+	rowd->row_queues[ROWQ_PRIO_LOW_SWRITE].disp_quantum, 0);
+SHOW_FUNCTION(row_read_idle_show, rowd->read_idle.idle_time, 1);
+SHOW_FUNCTION(row_read_idle_freq_show, rowd->read_idle.freq, 0);
+#undef SHOW_FUNCTION
+
+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
+static ssize_t __FUNC(struct elevator_queue *e,				\
+		const char *page, size_t count)				\
+{									\
+	struct row_data *rowd = e->elevator_data;			\
+	int __data;							\
+	int ret = row_var_store(&__data, (page), count);		\
+	if (__CONV)							\
+		__data = (int)msecs_to_jiffies(__data);			\
+	if (__data < (MIN))						\
+		__data = (MIN);						\
+	else if (__data > (MAX))					\
+		__data = (MAX);						\
+	*(__PTR) = __data;						\
+	return ret;							\
+}
+STORE_FUNCTION(row_hp_read_quantum_store,
+	&rowd->row_queues[ROWQ_PRIO_HIGH_READ].disp_quantum, 1, INT_MAX, 0);
+STORE_FUNCTION(row_rp_read_quantum_store,
+	&rowd->row_queues[ROWQ_PRIO_REG_READ].disp_quantum,
+	1, INT_MAX, 0);
+STORE_FUNCTION(row_hp_swrite_quantum_store,
+	&rowd->row_queues[ROWQ_PRIO_HIGH_SWRITE].disp_quantum,
+	1, INT_MAX, 0);
+STORE_FUNCTION(row_rp_swrite_quantum_store,
+	&rowd->row_queues[ROWQ_PRIO_REG_SWRITE].disp_quantum,
+	1, INT_MAX, 0);
+STORE_FUNCTION(row_rp_write_quantum_store,
+	&rowd->row_queues[ROWQ_PRIO_REG_WRITE].disp_quantum,
+	1, INT_MAX, 0);
+STORE_FUNCTION(row_lp_read_quantum_store,
+	&rowd->row_queues[ROWQ_PRIO_LOW_READ].disp_quantum,
+	1, INT_MAX, 0);
+STORE_FUNCTION(row_lp_swrite_quantum_store,
+	&rowd->row_queues[ROWQ_PRIO_LOW_SWRITE].disp_quantum,
+	1, INT_MAX, 0);
+STORE_FUNCTION(row_read_idle_store, &rowd->read_idle.idle_time, 1, INT_MAX, 1);
+STORE_FUNCTION(row_read_idle_freq_store, &rowd->read_idle.freq, 1, INT_MAX, 0);
+
+#undef STORE_FUNCTION
+
+#define ROW_ATTR(name) \
+	__ATTR(name, S_IRUGO|S_IWUSR, row_##name##_show, \
+				      row_##name##_store)
+
+static struct elv_fs_entry row_attrs[] = {
+	ROW_ATTR(hp_read_quantum),
+	ROW_ATTR(rp_read_quantum),
+	ROW_ATTR(hp_swrite_quantum),
+	ROW_ATTR(rp_swrite_quantum),
+	ROW_ATTR(rp_write_quantum),
+	ROW_ATTR(lp_read_quantum),
+	ROW_ATTR(lp_swrite_quantum),
+	ROW_ATTR(read_idle),
+	ROW_ATTR(read_idle_freq),
+	__ATTR_NULL
+};
+
+static struct elevator_type iosched_row = {
+	.ops = {
+		.elevator_merge_req_fn		= row_merged_requests,
+		.elevator_dispatch_fn		= row_dispatch_requests,
+		.elevator_add_req_fn		= row_add_request,
+		.elevator_former_req_fn		= elv_rb_former_request,
+		.elevator_latter_req_fn		= elv_rb_latter_request,
+		.elevator_set_req_fn		= row_set_request,
+		.elevator_init_fn		= row_init_queue,
+		.elevator_exit_fn		= row_exit_queue,
+	},
+
+	.elevator_attrs = row_attrs,
+	.elevator_name = "row",
+	.elevator_owner = THIS_MODULE,
+};
+
+static int __init row_init(void)
+{
+	elv_register(&iosched_row);
+	return 0;
+}
+
+static void __exit row_exit(void)
+{
+	elv_unregister(&iosched_row);
+}
+
+module_init(row_init);
+module_exit(row_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Read Over Write IO scheduler");
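
The second new file keeps four FIFOs indexed [sync][data_dir] and stamps every request with jiffies plus a fixed per-FIFO expiry on insertion. Since the increment is constant per FIFO, deadlines within a FIFO are monotonic, which is why sio_expired_request() below only ever examines the head. A user-space model of that check (types simplified, values invented):

	#include <stdio.h>

	struct req {
		unsigned long deadline;	/* jiffies + fifo_expire at insert time */
		struct req *next;	/* toward the FIFO tail */
	};

	/* Mirrors sio_expired_request(): the head request carries the
	 * earliest deadline in its FIFO, so testing it alone suffices. */
	static struct req *expired_head(struct req *head, unsigned long now)
	{
		if (head && now > head->deadline)
			return head;
		return NULL;
	}

	int main(void)
	{
		struct req b = { 130, NULL };	/* invented deadlines; "now" = 125 */
		struct req a = { 110, &b };

		printf("expired: %s\n", expired_head(&a, 125) ? "yes" : "no");
		return 0;
	}
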
diff --git a/block/sio-iosched.c b/block/sio-iosched.c
new file mode 100644
index 00000000000..3661a9a9207
--- /dev/null
+++ b/block/sio-iosched.c
@@ -0,0 +1,405 @@
+/*
+ * Simple IO scheduler
+ * Based on Noop, Deadline and V(R) IO schedulers.
+ *
+ * Copyright (C) 2012 Miguel Boton <mboton@gmail.com>
+ *
+ *
+ * This algorithm does not do any kind of sorting, as it is aimed at
+ * random-access devices, but it does some basic merging. We try to
+ * keep minimum overhead to achieve low latency.
+ *
+ * Asynchronous and synchronous requests are not treated separately, but
+ * we rely on deadlines to ensure fairness.
+ *
+ */
+#include <linux/blkdev.h>
+#include <linux/elevator.h>
+#include <linux/bio.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/version.h>
+
+enum { ASYNC, SYNC };
+
+/* Tunables */
+static const int sync_read_expire  = HZ / 2;	/* max time before a sync read is submitted. */
+static const int sync_write_expire = 2 * HZ;	/* max time before a sync write is submitted. */
+
+static const int async_read_expire  = 4 * HZ;	/* ditto for async, these limits are SOFT! */
+static const int async_write_expire = 16 * HZ;	/* ditto for async, these limits are SOFT! */
+
+static const int writes_starved = 2;	/* max times reads can starve a write */
+static const int fifo_batch     = 8;	/* # of sequential requests treated as one
+					   by the above parameters. For throughput. */
+
+/* Elevator data */
+struct sio_data {
+	/* Request queues */
+	struct list_head fifo_list[2][2];
+
+	/* Attributes */
+	unsigned int batched;
+	unsigned int starved;
+
+	/* Settings */
+	int fifo_expire[2][2];
+	int fifo_batch;
+	int writes_starved;
+};
+
+static void
+sio_merged_requests(struct request_queue *q, struct request *rq,
+		    struct request *next)
+{
+	/*
+	 * If next expires before rq, assign its expire time to rq
+	 * and move into next's position (next will be deleted) in the fifo.
+	 */
+	if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist)) {
+		if (time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
+			list_move(&rq->queuelist, &next->queuelist);
+			rq_set_fifo_time(rq, rq_fifo_time(next));
+		}
+	}
+
+	/* Delete next request */
+	rq_fifo_clear(next);
+}
+
+static void
+sio_add_request(struct request_queue *q, struct request *rq)
+{
+	struct sio_data *sd = q->elevator->elevator_data;
+	const int sync = rq_is_sync(rq);
+	const int data_dir = rq_data_dir(rq);
+
+	/*
+	 * Add request to the proper fifo list and set its
+	 * expire time.
+	 */
+	rq_set_fifo_time(rq, jiffies + sd->fifo_expire[sync][data_dir]);
+	list_add_tail(&rq->queuelist, &sd->fifo_list[sync][data_dir]);
+}
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,38)
+static int
+sio_queue_empty(struct request_queue *q)
+{
+	struct sio_data *sd = q->elevator->elevator_data;
+
+	/* Check if fifo lists are empty */
+	return list_empty(&sd->fifo_list[SYNC][READ]) && list_empty(&sd->fifo_list[SYNC][WRITE]) &&
+	       list_empty(&sd->fifo_list[ASYNC][READ]) && list_empty(&sd->fifo_list[ASYNC][WRITE]);
+}
+#endif
+
+static struct request *
+sio_expired_request(struct sio_data *sd, int sync, int data_dir)
+{
+	struct list_head *list = &sd->fifo_list[sync][data_dir];
+	struct request *rq;
+
+	if (list_empty(list))
+		return NULL;
+
+	/* Retrieve request */
+	rq = rq_entry_fifo(list->next);
+
+	/* Request has expired */
+	if (time_after(jiffies, rq_fifo_time(rq)))
+		return rq;
+
+	return NULL;
+}
+
+static struct request *
+sio_choose_expired_request(struct sio_data *sd)
+{
+	struct request *rq;
+
+	/*
+	 * Check expired requests.
+	 * Asynchronous requests have priority over synchronous.
+	 * Write requests have priority over read.
+	 */
+	rq = sio_expired_request(sd, ASYNC, WRITE);
+	if (rq)
+		return rq;
+	rq = sio_expired_request(sd, ASYNC, READ);
+	if (rq)
+		return rq;
+
+	rq = sio_expired_request(sd, SYNC, WRITE);
+	if (rq)
+		return rq;
+	rq = sio_expired_request(sd, SYNC, READ);
+	if (rq)
+		return rq;
+
+	return NULL;
+}
+
+static struct request *
+sio_choose_request(struct sio_data *sd, int data_dir)
+{
+	struct list_head *sync = sd->fifo_list[SYNC];
+	struct list_head *async = sd->fifo_list[ASYNC];
+
+	/*
+	 * Retrieve request from available fifo list.
+	 * Synchronous requests have priority over asynchronous.
+	 * Read requests have priority over write.
+	 */
+	if (!list_empty(&sync[data_dir]))
+		return rq_entry_fifo(sync[data_dir].next);
+	if (!list_empty(&async[data_dir]))
+		return rq_entry_fifo(async[data_dir].next);
+
+	if (!list_empty(&sync[!data_dir]))
+		return rq_entry_fifo(sync[!data_dir].next);
+	if (!list_empty(&async[!data_dir]))
+		return rq_entry_fifo(async[!data_dir].next);
+
+	return NULL;
+}
+
+static inline void
+sio_dispatch_request(struct sio_data *sd, struct request *rq)
+{
+	/*
+	 * Remove the request from the fifo list
+	 * and dispatch it.
+	 */
+	rq_fifo_clear(rq);
+	elv_dispatch_add_tail(rq->q, rq);
+
+	sd->batched++;
+
+	if (rq_data_dir(rq))
+		sd->starved = 0;
+	else
+		sd->starved++;
+}
+
+static int
+sio_dispatch_requests(struct request_queue *q, int force)
+{
+	struct sio_data *sd = q->elevator->elevator_data;
+	struct request *rq = NULL;
+	int data_dir = READ;
+
+	/*
+	 * Retrieve any expired request after a batch of
+	 * sequential requests.
+	 */
+	if (sd->batched > sd->fifo_batch) {
+		sd->batched = 0;
+		rq = sio_choose_expired_request(sd);
+	}
+
+	/* Retrieve request */
+	if (!rq) {
+		if (sd->starved > sd->writes_starved)
+			data_dir = WRITE;
+
+		rq = sio_choose_request(sd, data_dir);
+		if (!rq)
+			return 0;
+	}
+
+	/* Dispatch request */
+	sio_dispatch_request(sd, rq);
+
+	return 1;
+}
+
+static struct request *
+sio_former_request(struct request_queue *q, struct request *rq)
+{
+	struct sio_data *sd = q->elevator->elevator_data;
+	const int sync = rq_is_sync(rq);
+	const int data_dir = rq_data_dir(rq);
+
+	if (rq->queuelist.prev == &sd->fifo_list[sync][data_dir])
+		return NULL;
+
+	/* Return former request */
+	return list_entry(rq->queuelist.prev, struct request, queuelist);
+}
+
+static struct request *
+sio_latter_request(struct request_queue *q, struct request *rq)
+{
+	struct sio_data *sd = q->elevator->elevator_data;
+	const int sync = rq_is_sync(rq);
+	const int data_dir = rq_data_dir(rq);
+
+	if (rq->queuelist.next == &sd->fifo_list[sync][data_dir])
+		return NULL;
+
+	/* Return latter request */
+	return list_entry(rq->queuelist.next, struct request, queuelist);
+}
+
+static void *
+sio_init_queue(struct request_queue *q)
+{
+	struct sio_data *sd;
+
+	/* Allocate structure */
+	sd = kmalloc_node(sizeof(*sd), GFP_KERNEL, q->node);
+	if (!sd)
+		return NULL;
+
+	/* Initialize fifo lists */
+	INIT_LIST_HEAD(&sd->fifo_list[SYNC][READ]);
+	INIT_LIST_HEAD(&sd->fifo_list[SYNC][WRITE]);
+	INIT_LIST_HEAD(&sd->fifo_list[ASYNC][READ]);
+	INIT_LIST_HEAD(&sd->fifo_list[ASYNC][WRITE]);
+
+	/* Initialize data */
+	sd->batched = 0;
+	sd->starved = 0;
+	sd->fifo_expire[SYNC][READ] = sync_read_expire;
+	sd->fifo_expire[SYNC][WRITE] = sync_write_expire;
+	sd->fifo_expire[ASYNC][READ] = async_read_expire;
+	sd->fifo_expire[ASYNC][WRITE] = async_write_expire;
+	sd->fifo_batch = fifo_batch;
+	sd->writes_starved = writes_starved;
+
+	return sd;
+}
+
+static void
+sio_exit_queue(struct elevator_queue *e)
+{
+	struct sio_data *sd = e->elevator_data;
+
+	BUG_ON(!list_empty(&sd->fifo_list[SYNC][READ]));
+	BUG_ON(!list_empty(&sd->fifo_list[SYNC][WRITE]));
+	BUG_ON(!list_empty(&sd->fifo_list[ASYNC][READ]));
+	BUG_ON(!list_empty(&sd->fifo_list[ASYNC][WRITE]));
+
+	/* Free structure */
+	kfree(sd);
+}
+
+/*
+ * sysfs code
+ */
+
+static ssize_t
+sio_var_show(int var, char *page)
+{
+	return sprintf(page, "%d\n", var);
+}
+
+static ssize_t
+sio_var_store(int *var, const char *page, size_t count)
+{
+	char *p = (char *) page;
+
+	*var = simple_strtol(p, &p, 10);
+	return count;
+}
+
+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
+static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
+{									\
+	struct sio_data *sd = e->elevator_data;				\
+	int __data = __VAR;						\
+	if (__CONV)							\
+		__data = jiffies_to_msecs(__data);			\
+	return sio_var_show(__data, (page));				\
+}
+SHOW_FUNCTION(sio_sync_read_expire_show, sd->fifo_expire[SYNC][READ], 1);
+SHOW_FUNCTION(sio_sync_write_expire_show, sd->fifo_expire[SYNC][WRITE], 1);
+SHOW_FUNCTION(sio_async_read_expire_show, sd->fifo_expire[ASYNC][READ], 1);
+SHOW_FUNCTION(sio_async_write_expire_show, sd->fifo_expire[ASYNC][WRITE], 1);
+SHOW_FUNCTION(sio_fifo_batch_show, sd->fifo_batch, 0);
+SHOW_FUNCTION(sio_writes_starved_show, sd->writes_starved, 0);
+#undef SHOW_FUNCTION
+
+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
+{									\
+	struct sio_data *sd = e->elevator_data;				\
+	int __data;							\
+	int ret = sio_var_store(&__data, (page), count);		\
+	if (__data < (MIN))						\
+		__data = (MIN);						\
+	else if (__data > (MAX))					\
+		__data = (MAX);						\
+	if (__CONV)							\
+		*(__PTR) = msecs_to_jiffies(__data);			\
+	else								\
+		*(__PTR) = __data;					\
+	return ret;							\
+}
+STORE_FUNCTION(sio_sync_read_expire_store, &sd->fifo_expire[SYNC][READ], 0, INT_MAX, 1);
+STORE_FUNCTION(sio_sync_write_expire_store, &sd->fifo_expire[SYNC][WRITE], 0, INT_MAX, 1);
+STORE_FUNCTION(sio_async_read_expire_store, &sd->fifo_expire[ASYNC][READ], 0, INT_MAX, 1);
+STORE_FUNCTION(sio_async_write_expire_store, &sd->fifo_expire[ASYNC][WRITE], 0, INT_MAX, 1);
+STORE_FUNCTION(sio_fifo_batch_store, &sd->fifo_batch, 0, INT_MAX, 0);
+STORE_FUNCTION(sio_writes_starved_store, &sd->writes_starved, 0, INT_MAX, 0);
+#undef STORE_FUNCTION
+
+#define DD_ATTR(name) \
+	__ATTR(name, S_IRUGO|S_IWUSR, sio_##name##_show, \
+				      sio_##name##_store)
+
+static struct elv_fs_entry sio_attrs[] = {
+	DD_ATTR(sync_read_expire),
+	DD_ATTR(sync_write_expire),
+	DD_ATTR(async_read_expire),
+	DD_ATTR(async_write_expire),
+	DD_ATTR(fifo_batch),
+	DD_ATTR(writes_starved),
+	__ATTR_NULL
+};
+
+static struct elevator_type iosched_sio = {
+	.ops = {
+		.elevator_merge_req_fn		= sio_merged_requests,
+		.elevator_dispatch_fn		= sio_dispatch_requests,
+		.elevator_add_req_fn		= sio_add_request,
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,38)
+		.elevator_queue_empty_fn	= sio_queue_empty,
+#endif
+		.elevator_former_req_fn		= sio_former_request,
+		.elevator_latter_req_fn		= sio_latter_request,
+		.elevator_init_fn		= sio_init_queue,
+		.elevator_exit_fn		= sio_exit_queue,
+	},
+
+	.elevator_attrs = sio_attrs,
+	.elevator_name = "sio",
+	.elevator_owner = THIS_MODULE,
+};
+
+static int __init sio_init(void)
+{
+	/* Register elevator */
+	elv_register(&iosched_sio);
+
+	return 0;
+}
+
+static void __exit sio_exit(void)
+{
+	/* Unregister elevator */
+	elv_unregister(&iosched_sio);
+}
+
+#ifdef CONFIG_FAST_RESUME
+beforeresume_initcall(sio_init);
+#else
+module_init(sio_init);
+#endif
+module_exit(sio_exit);
+
+MODULE_AUTHOR("Miguel Boton");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Simple IO scheduler");
+MODULE_VERSION("0.2");
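
Both schedulers expose their tunables under /sys/block/<dev>/queue/iosched/. One detail worth noting: the two STORE_FUNCTION macros order their steps differently. ROW converts milliseconds to jiffies before clamping, while SIO clamps the millisecond value first and converts afterwards. A stand-alone model of SIO's ordering (HZ is fixed at 100 here purely for illustration; the real value is a kernel configuration choice):

	#include <limits.h>
	#include <stdio.h>

	#define HZ 100	/* illustrative; kernel config dependent */

	static int msecs_to_jiffies(int ms)
	{
		return ms * HZ / 1000;
	}

	/* Mirrors SIO's STORE_FUNCTION body: clamp the user value in
	 * milliseconds first, then convert to jiffies when conv is set. */
	static int store_tunable(int *slot, int val, int min, int max, int conv)
	{
		if (val < min)
			val = min;
		else if (val > max)
			val = max;
		*slot = conv ? msecs_to_jiffies(val) : val;
		return 0;
	}

	int main(void)
	{
		int sync_read_expire;

		store_tunable(&sync_read_expire, 250, 0, INT_MAX, 1);
		printf("sync_read_expire = %d jiffies\n", sync_read_expire);	/* 25 */
		return 0;
	}
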