/* * ROW (Read Over Write) I/O scheduler. * * Copyright (c) 2012, The Linux Foundation. All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and * only version 2 as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ /* See Documentation/block/row-iosched.txt */ #include #include #include #include #include #include #include #include #include #include #include /* * enum row_queue_prio - Priorities of the ROW queues * * This enum defines the priorities (and the number of queues) * the requests will be disptributed to. The higher priority - * the bigger is the dispatch quantum given to that queue. * ROWQ_PRIO_HIGH_READ - is the higher priority queue. * */ enum row_queue_prio { ROWQ_PRIO_HIGH_READ = 0, ROWQ_PRIO_REG_READ, ROWQ_PRIO_HIGH_SWRITE, ROWQ_PRIO_REG_SWRITE, ROWQ_PRIO_REG_WRITE, ROWQ_PRIO_LOW_READ, ROWQ_PRIO_LOW_SWRITE, ROWQ_MAX_PRIO, }; /* Flags indicating whether idling is enabled on the queue */ static const bool queue_idling_enabled[] = { true, /* ROWQ_PRIO_HIGH_READ */ true, /* ROWQ_PRIO_REG_READ */ false, /* ROWQ_PRIO_HIGH_SWRITE */ false, /* ROWQ_PRIO_REG_SWRITE */ false, /* ROWQ_PRIO_REG_WRITE */ false, /* ROWQ_PRIO_LOW_READ */ false, /* ROWQ_PRIO_LOW_SWRITE */ }; /* Default values for row queues quantums in each dispatch cycle */ static const int queue_quantum[] = { 100, /* ROWQ_PRIO_HIGH_READ */ 100, /* ROWQ_PRIO_REG_READ */ 2, /* ROWQ_PRIO_HIGH_SWRITE */ 1, /* ROWQ_PRIO_REG_SWRITE */ 1, /* ROWQ_PRIO_REG_WRITE */ 1, /* ROWQ_PRIO_LOW_READ */ 1 /* ROWQ_PRIO_LOW_SWRITE */ }; /* Default values for idling on read queues */ #define ROW_IDLE_TIME_MSEC 5 /* msec */ #define ROW_READ_FREQ_MSEC 20 /* msec */ /** * struct rowq_idling_data - parameters for idling on the queue * @last_insert_time: time the last request was inserted * to the queue * @begin_idling: flag indicating wether we should idle * */ struct rowq_idling_data { ktime_t last_insert_time; bool begin_idling; }; /** * struct row_queue - requests grouping structure * @rdata: parent row_data structure * @fifo: fifo of requests * @prio: queue priority (enum row_queue_prio) * @nr_dispatched: number of requests already dispatched in * the current dispatch cycle * @slice: number of requests to dispatch in a cycle * @idle_data: data for idling on queues * */ struct row_queue { struct row_data *rdata; struct list_head fifo; enum row_queue_prio prio; unsigned int nr_dispatched; unsigned int slice; /* used only for READ queues */ struct rowq_idling_data idle_data; }; /** * struct idling_data - data for idling on empty rqueue * @idle_time: idling duration (jiffies) * @freq: min time between two requests that * triger idling (msec) * @idle_work: pointer to struct delayed_work * */ struct idling_data { unsigned long idle_time; u32 freq; struct workqueue_struct *idle_workqueue; struct delayed_work idle_work; }; /** * struct row_queue - Per block device rqueue structure * @dispatch_queue: dispatch rqueue * @row_queues: array of priority request queues with * dispatch quantum per rqueue * @curr_queue: index in the row_queues array of the * currently serviced rqueue * @read_idle: data for idling after READ request * @nr_reqs: nr_reqs[0] holds the number of all READ requests in * scheduler, nr_reqs[1] holds the number of all WRITE * requests in scheduler * @cycle_flags: used for marking unserved queueus * */ struct row_data { struct request_queue *dispatch_queue; struct { struct row_queue rqueue; int disp_quantum; } row_queues[ROWQ_MAX_PRIO]; enum row_queue_prio curr_queue; struct idling_data read_idle; unsigned int nr_reqs[2]; unsigned int cycle_flags; }; #define RQ_ROWQ(rq) ((struct row_queue *) ((rq)->elevator_private[0])) #define row_log(q, fmt, args...) \ blk_add_trace_msg(q, "%s():" fmt , __func__, ##args) #define row_log_rowq(rdata, rowq_id, fmt, args...) \ blk_add_trace_msg(rdata->dispatch_queue, "rowq%d " fmt, \ rowq_id, ##args) static inline void row_mark_rowq_unserved(struct row_data *rd, enum row_queue_prio qnum) { rd->cycle_flags |= (1 << qnum); } static inline void row_clear_rowq_unserved(struct row_data *rd, enum row_queue_prio qnum) { rd->cycle_flags &= ~(1 << qnum); } static inline int row_rowq_unserved(struct row_data *rd, enum row_queue_prio qnum) { return rd->cycle_flags & (1 << qnum); } /******************** Static helper functions ***********************/ /* * kick_queue() - Wake up device driver queue thread * @work: pointer to struct work_struct * * This is a idling delayed work function. It's purpose is to wake up the * device driver in order for it to start fetching requests. * */ static void kick_queue(struct work_struct *work) { struct delayed_work *idle_work = to_delayed_work(work); struct idling_data *read_data = container_of(idle_work, struct idling_data, idle_work); struct row_data *rd = container_of(read_data, struct row_data, read_idle); row_log_rowq(rd, rd->curr_queue, "Performing delayed work"); /* Mark idling process as done */ rd->row_queues[rd->curr_queue].rqueue.idle_data.begin_idling = false; if (!(rd->nr_reqs[0] + rd->nr_reqs[1])) row_log(rd->dispatch_queue, "No requests in scheduler"); else { spin_lock_irq(rd->dispatch_queue->queue_lock); __blk_run_queue(rd->dispatch_queue); spin_unlock_irq(rd->dispatch_queue->queue_lock); } } /* * row_restart_disp_cycle() - Restart the dispatch cycle * @rd: pointer to struct row_data * * This function restarts the dispatch cycle by: * - Setting current queue to ROWQ_PRIO_HIGH_READ * - For each queue: reset the number of requests dispatched in * the cycle */ static inline void row_restart_disp_cycle(struct row_data *rd) { int i; for (i = 0; i < ROWQ_MAX_PRIO; i++) rd->row_queues[i].rqueue.nr_dispatched = 0; rd->curr_queue = ROWQ_PRIO_HIGH_READ; row_log(rd->dispatch_queue, "Restarting cycle"); } static inline void row_get_next_queue(struct row_data *rd) { rd->curr_queue++; if (rd->curr_queue == ROWQ_MAX_PRIO) row_restart_disp_cycle(rd); } /******************* Elevator callback functions *********************/ /* * row_add_request() - Add request to the scheduler * @q: requests queue * @rq: request to add * */ static void row_add_request(struct request_queue *q, struct request *rq) { struct row_data *rd = (struct row_data *)q->elevator->elevator_data; struct row_queue *rqueue = RQ_ROWQ(rq); list_add_tail(&rq->queuelist, &rqueue->fifo); rd->nr_reqs[rq_data_dir(rq)]++; rq_set_fifo_time(rq, jiffies); /* for statistics*/ if (queue_idling_enabled[rqueue->prio]) { if (delayed_work_pending(&rd->read_idle.idle_work)) (void)cancel_delayed_work( &rd->read_idle.idle_work); if (ktime_to_ms(ktime_sub(ktime_get(), rqueue->idle_data.last_insert_time)) < rd->read_idle.freq) { rqueue->idle_data.begin_idling = true; row_log_rowq(rd, rqueue->prio, "Enable idling"); } else { rqueue->idle_data.begin_idling = false; row_log_rowq(rd, rqueue->prio, "Disable idling"); } rqueue->idle_data.last_insert_time = ktime_get(); } row_log_rowq(rd, rqueue->prio, "added request"); } /* * row_remove_request() - Remove given request from scheduler * @q: requests queue * @rq: request to remove * */ static void row_remove_request(struct request_queue *q, struct request *rq) { struct row_data *rd = (struct row_data *)q->elevator->elevator_data; rq_fifo_clear(rq); rd->nr_reqs[rq_data_dir(rq)]--; } /* * row_dispatch_insert() - move request to dispatch queue * @rd: pointer to struct row_data * * This function moves the next request to dispatch from * rd->curr_queue to the dispatch queue * */ static void row_dispatch_insert(struct row_data *rd) { struct request *rq; rq = rq_entry_fifo(rd->row_queues[rd->curr_queue].rqueue.fifo.next); row_remove_request(rd->dispatch_queue, rq); elv_dispatch_add_tail(rd->dispatch_queue, rq); rd->row_queues[rd->curr_queue].rqueue.nr_dispatched++; row_clear_rowq_unserved(rd, rd->curr_queue); row_log_rowq(rd, rd->curr_queue, " Dispatched request nr_disp = %d", rd->row_queues[rd->curr_queue].rqueue.nr_dispatched); } /* * row_choose_queue() - choose the next queue to dispatch from * @rd: pointer to struct row_data * * Updates rd->curr_queue. Returns 1 if there are requests to * dispatch, 0 if there are no requests in scheduler * */ static int row_choose_queue(struct row_data *rd) { int prev_curr_queue = rd->curr_queue; if (!(rd->nr_reqs[0] + rd->nr_reqs[1])) { row_log(rd->dispatch_queue, "No more requests in scheduler"); return 0; } row_get_next_queue(rd); /* * Loop over all queues to find the next queue that is not empty. * Stop when you get back to curr_queue */ while (list_empty(&rd->row_queues[rd->curr_queue].rqueue.fifo) && rd->curr_queue != prev_curr_queue) { /* Mark rqueue as unserved */ row_mark_rowq_unserved(rd, rd->curr_queue); row_get_next_queue(rd); } return 1; } /* * row_dispatch_requests() - selects the next request to dispatch * @q: requests queue * @force: ignored * * Return 0 if no requests were moved to the dispatch queue. * 1 otherwise * */ static int row_dispatch_requests(struct request_queue *q, int force) { struct row_data *rd = (struct row_data *)q->elevator->elevator_data; int ret = 0, currq, i; currq = rd->curr_queue; /* * Find the first unserved queue (with higher priority then currq) * that is not empty */ for (i = 0; i < currq; i++) { if (row_rowq_unserved(rd, i) && !list_empty(&rd->row_queues[i].rqueue.fifo)) { row_log_rowq(rd, currq, " Preemting for unserved rowq%d", i); rd->curr_queue = i; row_dispatch_insert(rd); ret = 1; goto done; } } if (rd->row_queues[currq].rqueue.nr_dispatched >= rd->row_queues[currq].disp_quantum) { rd->row_queues[currq].rqueue.nr_dispatched = 0; row_log_rowq(rd, currq, "Expiring rqueue"); ret = row_choose_queue(rd); if (ret) row_dispatch_insert(rd); goto done; } /* Dispatch from curr_queue */ if (list_empty(&rd->row_queues[currq].rqueue.fifo)) { /* check idling */ if (delayed_work_pending(&rd->read_idle.idle_work)) { if (force) { (void)cancel_delayed_work( &rd->read_idle.idle_work); row_log_rowq(rd, currq, "Canceled delayed work - forced dispatch"); } else { row_log_rowq(rd, currq, "Delayed work pending. Exiting"); goto done; } } if (!force && queue_idling_enabled[currq] && rd->row_queues[currq].rqueue.idle_data.begin_idling) { if (!queue_delayed_work(rd->read_idle.idle_workqueue, &rd->read_idle.idle_work, rd->read_idle.idle_time)) { row_log_rowq(rd, currq, "Work already on queue!"); pr_err("ROW_BUG: Work already on queue!"); } else row_log_rowq(rd, currq, "Scheduled delayed work. exiting"); goto done; } else { row_log_rowq(rd, currq, "Currq empty. Choose next queue"); ret = row_choose_queue(rd); if (!ret) goto done; } } ret = 1; row_dispatch_insert(rd); done: return ret; } /* * row_init_queue() - Init scheduler data structures * @q: requests queue * * Return pointer to struct row_data to be saved in elevator for * this dispatch queue * */ static void *row_init_queue(struct request_queue *q) { struct row_data *rdata; int i; rdata = kmalloc_node(sizeof(*rdata), GFP_KERNEL | __GFP_ZERO, q->node); if (!rdata) return NULL; for (i = 0; i < ROWQ_MAX_PRIO; i++) { INIT_LIST_HEAD(&rdata->row_queues[i].rqueue.fifo); rdata->row_queues[i].disp_quantum = queue_quantum[i]; rdata->row_queues[i].rqueue.rdata = rdata; rdata->row_queues[i].rqueue.prio = i; rdata->row_queues[i].rqueue.idle_data.begin_idling = false; rdata->row_queues[i].rqueue.idle_data.last_insert_time = ktime_set(0, 0); } /* * Currently idling is enabled only for READ queues. If we want to * enable it for write queues also, note that idling frequency will * be the same in both cases */ rdata->read_idle.idle_time = msecs_to_jiffies(ROW_IDLE_TIME_MSEC); /* Maybe 0 on some platforms */ if (!rdata->read_idle.idle_time) rdata->read_idle.idle_time = 1; rdata->read_idle.freq = ROW_READ_FREQ_MSEC; rdata->read_idle.idle_workqueue = alloc_workqueue("row_idle_work", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); if (!rdata->read_idle.idle_workqueue) panic("Failed to create idle workqueue\n"); INIT_DELAYED_WORK(&rdata->read_idle.idle_work, kick_queue); rdata->curr_queue = ROWQ_PRIO_HIGH_READ; rdata->dispatch_queue = q; rdata->nr_reqs[READ] = rdata->nr_reqs[WRITE] = 0; return rdata; } /* * row_exit_queue() - called on unloading the RAW scheduler * @e: poiner to struct elevator_queue * */ static void row_exit_queue(struct elevator_queue *e) { struct row_data *rd = (struct row_data *)e->elevator_data; int i; for (i = 0; i < ROWQ_MAX_PRIO; i++) BUG_ON(!list_empty(&rd->row_queues[i].rqueue.fifo)); (void)cancel_delayed_work_sync(&rd->read_idle.idle_work); BUG_ON(delayed_work_pending(&rd->read_idle.idle_work)); destroy_workqueue(rd->read_idle.idle_workqueue); kfree(rd); } /* * row_merged_requests() - Called when 2 requests are merged * @q: requests queue * @rq: request the two requests were merged into * @next: request that was merged */ static void row_merged_requests(struct request_queue *q, struct request *rq, struct request *next) { struct row_queue *rqueue = RQ_ROWQ(next); list_del_init(&next->queuelist); rqueue->rdata->nr_reqs[rq_data_dir(rq)]--; } /* * get_queue_type() - Get queue type for a given request * * This is a helping function which purpose is to determine what * ROW queue the given request should be added to (and * dispatched from leter on) * * TODO: Right now only 3 queues are used REG_READ, REG_WRITE * and REG_SWRITE */ static enum row_queue_prio get_queue_type(struct request *rq) { const int data_dir = rq_data_dir(rq); const bool is_sync = rq_is_sync(rq); if (data_dir == READ) return ROWQ_PRIO_REG_READ; else if (is_sync) return ROWQ_PRIO_REG_SWRITE; else return ROWQ_PRIO_REG_WRITE; } /* * row_set_request() - Set ROW data structures associated with this request. * @q: requests queue * @rq: pointer to the request * @gfp_mask: ignored * */ static int row_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) { struct row_data *rd = (struct row_data *)q->elevator->elevator_data; unsigned long flags; spin_lock_irqsave(q->queue_lock, flags); rq->elevator_private[0] = (void *)(&rd->row_queues[get_queue_type(rq)]); spin_unlock_irqrestore(q->queue_lock, flags); return 0; } /********** Helping sysfs functions/defenitions for ROW attributes ******/ static ssize_t row_var_show(int var, char *page) { return snprintf(page, 100, "%d\n", var); } static ssize_t row_var_store(int *var, const char *page, size_t count) { int err; err = kstrtoul(page, 10, (unsigned long *)var); return count; } #define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ static ssize_t __FUNC(struct elevator_queue *e, char *page) \ { \ struct row_data *rowd = e->elevator_data; \ int __data = __VAR; \ if (__CONV) \ __data = jiffies_to_msecs(__data); \ return row_var_show(__data, (page)); \ } SHOW_FUNCTION(row_hp_read_quantum_show, rowd->row_queues[ROWQ_PRIO_HIGH_READ].disp_quantum, 0); SHOW_FUNCTION(row_rp_read_quantum_show, rowd->row_queues[ROWQ_PRIO_REG_READ].disp_quantum, 0); SHOW_FUNCTION(row_hp_swrite_quantum_show, rowd->row_queues[ROWQ_PRIO_HIGH_SWRITE].disp_quantum, 0); SHOW_FUNCTION(row_rp_swrite_quantum_show, rowd->row_queues[ROWQ_PRIO_REG_SWRITE].disp_quantum, 0); SHOW_FUNCTION(row_rp_write_quantum_show, rowd->row_queues[ROWQ_PRIO_REG_WRITE].disp_quantum, 0); SHOW_FUNCTION(row_lp_read_quantum_show, rowd->row_queues[ROWQ_PRIO_LOW_READ].disp_quantum, 0); SHOW_FUNCTION(row_lp_swrite_quantum_show, rowd->row_queues[ROWQ_PRIO_LOW_SWRITE].disp_quantum, 0); SHOW_FUNCTION(row_read_idle_show, rowd->read_idle.idle_time, 1); SHOW_FUNCTION(row_read_idle_freq_show, rowd->read_idle.freq, 0); #undef SHOW_FUNCTION #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ static ssize_t __FUNC(struct elevator_queue *e, \ const char *page, size_t count) \ { \ struct row_data *rowd = e->elevator_data; \ int __data; \ int ret = row_var_store(&__data, (page), count); \ if (__CONV) \ __data = (int)msecs_to_jiffies(__data); \ if (__data < (MIN)) \ __data = (MIN); \ else if (__data > (MAX)) \ __data = (MAX); \ *(__PTR) = __data; \ return ret; \ } STORE_FUNCTION(row_hp_read_quantum_store, &rowd->row_queues[ROWQ_PRIO_HIGH_READ].disp_quantum, 1, INT_MAX, 0); STORE_FUNCTION(row_rp_read_quantum_store, &rowd->row_queues[ROWQ_PRIO_REG_READ].disp_quantum, 1, INT_MAX, 0); STORE_FUNCTION(row_hp_swrite_quantum_store, &rowd->row_queues[ROWQ_PRIO_HIGH_SWRITE].disp_quantum, 1, INT_MAX, 0); STORE_FUNCTION(row_rp_swrite_quantum_store, &rowd->row_queues[ROWQ_PRIO_REG_SWRITE].disp_quantum, 1, INT_MAX, 0); STORE_FUNCTION(row_rp_write_quantum_store, &rowd->row_queues[ROWQ_PRIO_REG_WRITE].disp_quantum, 1, INT_MAX, 0); STORE_FUNCTION(row_lp_read_quantum_store, &rowd->row_queues[ROWQ_PRIO_LOW_READ].disp_quantum, 1, INT_MAX, 0); STORE_FUNCTION(row_lp_swrite_quantum_store, &rowd->row_queues[ROWQ_PRIO_LOW_SWRITE].disp_quantum, 1, INT_MAX, 1); STORE_FUNCTION(row_read_idle_store, &rowd->read_idle.idle_time, 1, INT_MAX, 1); STORE_FUNCTION(row_read_idle_freq_store, &rowd->read_idle.freq, 1, INT_MAX, 0); #undef STORE_FUNCTION #define ROW_ATTR(name) \ __ATTR(name, S_IRUGO|S_IWUSR, row_##name##_show, \ row_##name##_store) static struct elv_fs_entry row_attrs[] = { ROW_ATTR(hp_read_quantum), ROW_ATTR(rp_read_quantum), ROW_ATTR(hp_swrite_quantum), ROW_ATTR(rp_swrite_quantum), ROW_ATTR(rp_write_quantum), ROW_ATTR(lp_read_quantum), ROW_ATTR(lp_swrite_quantum), ROW_ATTR(read_idle), ROW_ATTR(read_idle_freq), __ATTR_NULL }; static struct elevator_type iosched_row = { .ops = { .elevator_merge_req_fn = row_merged_requests, .elevator_dispatch_fn = row_dispatch_requests, .elevator_add_req_fn = row_add_request, .elevator_former_req_fn = elv_rb_former_request, .elevator_latter_req_fn = elv_rb_latter_request, .elevator_set_req_fn = row_set_request, .elevator_init_fn = row_init_queue, .elevator_exit_fn = row_exit_queue, }, .elevator_attrs = row_attrs, .elevator_name = "row", .elevator_owner = THIS_MODULE, }; static int __init row_init(void) { elv_register(&iosched_row); return 0; } static void __exit row_exit(void) { elv_unregister(&iosched_row); } module_init(row_init); module_exit(row_exit); MODULE_LICENSE("GPLv2"); MODULE_DESCRIPTION("Read Over Write IO scheduler");