aboutsummaryrefslogtreecommitdiffstats
path: root/block/bfq.h
blob: ad770af05ddabb0a5ebb93422197f1199b59c755 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
/*
 * BFQ-v7r8 for 3.3.0: data structures and common functions prototypes.
 *
 * Based on ideas and code from CFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
 */

#ifndef _BFQ_H
#define _BFQ_H

#include <linux/blktrace_api.h>
#include <linux/hrtimer.h>
#include <linux/ioprio.h>
#include <linux/rbtree.h>

#define BFQ_IOPRIO_CLASSES	3
#define BFQ_CL_IDLE_TIMEOUT	(HZ/5)

#define BFQ_MIN_WEIGHT	1
#define BFQ_MAX_WEIGHT	1000

#define BFQ_DEFAULT_QUEUE_IOPRIO	4

#define BFQ_DEFAULT_GRP_WEIGHT	10
#define BFQ_DEFAULT_GRP_IOPRIO	0
#define BFQ_DEFAULT_GRP_CLASS	IOPRIO_CLASS_BE

struct bfq_entity;

/**
 * struct bfq_service_tree - per ioprio_class service tree.
 * @active: tree for active entities (i.e., those backlogged).
 * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
 * @first_idle: idle entity with minimum F_i.
 * @last_idle: idle entity with maximum F_i.
 * @vtime: scheduler virtual time.
 * @wsum: scheduler weight sum; active and idle entities contribute to it.
 *
 * Each service tree represents a B-WF2Q+ scheduler on its own.  Each
 * ioprio_class has its own independent scheduler, and so its own
 * bfq_service_tree.  All the fields are protected by the queue lock
 * of the containing bfqd.
 */
struct bfq_service_tree {
	/* Backlogged entities, and non-backlogged ones with V <= F_i. */
	struct rb_root active;
	struct rb_root idle;

	/* Idle entities with the minimum and the maximum finish time F_i. */
	struct bfq_entity *first_idle;
	struct bfq_entity *last_idle;

	/* Virtual time, and weight sum over active and idle entities. */
	u64 vtime;
	unsigned long wsum;
};

/**
 * struct bfq_sched_data - multi-class scheduler.
 * @in_service_entity: entity in service.
 * @next_in_service: head-of-the-line entity in the scheduler.
 * @service_tree: array of service trees, one per ioprio_class.
 *
 * bfq_sched_data is the basic scheduler queue.  It supports three
 * ioprio_classes, and can be used either as a toplevel queue or as
 * an intermediate queue on a hierarchical setup.
 * @next_in_service points to the active entity of the sched_data
 * service trees that will be scheduled next.
 *
 * The supported ioprio_classes are the same as in CFQ, in descending
 * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
 * Requests from higher priority queues are served before all the
 * requests from lower priority queues; among requests of the same
 * queue requests are served according to B-WF2Q+.
 * All the fields are protected by the queue lock of the containing bfqd.
 */
struct bfq_sched_data {
	/* Entity currently in service. */
	struct bfq_entity *in_service_entity;
	/* Active entity of the service trees that will be scheduled next. */
	struct bfq_entity *next_in_service;
	/*
	 * One service tree per ioprio_class; bfq_entity_service_tree()
	 * indexes this array with ioprio_class - 1.
	 */
	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
};

/**
 * struct bfq_weight_counter - counter of the number of all active entities
 *                             with a given weight.
 * @weight: weight of the entities that this counter refers to; it has the
 *          same type as the @weight field of struct bfq_entity, so that
 *          the two can be copied and compared without any sign or range
 *          mismatch.
 * @num_active: number of active entities with this weight.
 * @weights_node: weights tree member (see bfq_data's @queue_weights_tree
 *                and @group_weights_tree).
 */
struct bfq_weight_counter {
	unsigned short weight;
	unsigned int num_active;
	struct rb_node weights_node;
};

/**
 * struct bfq_entity - schedulable entity.
 * @rb_node: service_tree member.
 * @weight_counter: pointer to the weight counter associated with this entity.
 * @on_st: flag, true if the entity is on a tree (either the active or
 *         the idle one of its service_tree).
 * @finish: B-WF2Q+ finish timestamp (aka F_i).
 * @start: B-WF2Q+ start timestamp (aka S_i).
 * @tree: tree the entity is enqueued into; %NULL if not on a tree.
 * @min_start: minimum start time of the (active) subtree rooted at
 *             this entity; used for O(log N) lookups into active trees.
 * @service: service received during the last round of service.
 * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
 * @weight: weight of the queue
 * @parent: parent entity, for hierarchical scheduling.
 * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
 *                 associated scheduler queue, %NULL on leaf nodes.
 * @sched_data: the scheduler queue this entity belongs to.
 * @ioprio: the ioprio in use.
 * @new_weight: when a weight change is requested, the new weight value.
 * @orig_weight: original weight, used to implement weight boosting
 * @new_ioprio: when an ioprio change is requested, the new ioprio value.
 * @ioprio_class: the ioprio_class in use.
 * @new_ioprio_class: when an ioprio_class change is requested, the new
 *                    ioprio_class value.
 * @ioprio_changed: flag, true when the user requested a weight, ioprio or
 *                  ioprio_class change.
 *
 * A bfq_entity is used to represent either a bfq_queue (leaf node in the
 * cgroup hierarchy) or a bfq_group into the upper level scheduler.  Each
 * entity belongs to the sched_data of the parent group in the cgroup
 * hierarchy.  Non-leaf entities have also their own sched_data, stored
 * in @my_sched_data.
 *
 * Each entity stores independently its priority values; this would
 * allow different weights on different devices, but this
 * functionality is not exported to userspace by now.  Priorities and
 * weights are updated lazily, first storing the new values into the
 * new_* fields, then setting the @ioprio_changed flag.  As soon as
 * there is a transition in the entity state that allows the priority
 * update to take place the effective and the requested priority
 * values are synchronized.
 *
 * Unless cgroups are used, the weight value is calculated from the
 * ioprio to export the same interface as CFQ.  When dealing with
 * ``well-behaved'' queues (i.e., queues that do not spend too much
 * time to consume their budget and have true sequential behavior, and
 * when there are no external factors breaking anticipation) the
 * relative weights at each level of the cgroups hierarchy should be
 * guaranteed.  All the fields are protected by the queue lock of the
 * containing bfqd.
 */
struct bfq_entity {
	/* Node linking the entity into its service tree. */
	struct rb_node rb_node;
	/* Weight counter associated with this entity. */
	struct bfq_weight_counter *weight_counter;

	/* True while the entity is on the active or on the idle tree. */
	int on_st;

	/* B-WF2Q+ finish (F_i) and start (S_i) timestamps. */
	u64 finish;
	u64 start;

	/* Tree the entity is enqueued into; NULL if not on a tree. */
	struct rb_root *tree;

	/* Minimum start time of the active subtree rooted at this entity. */
	u64 min_start;

	/* Service received in the last round; budget used to compute F_i. */
	unsigned long service, budget;
	/* Weight in use, and weight requested by a pending change. */
	unsigned short weight, new_weight;
	/* Original weight, used to implement weight boosting. */
	unsigned short orig_weight;

	/* Parent entity, for hierarchical scheduling. */
	struct bfq_entity *parent;

	/* Own scheduler queue of a non-leaf node; NULL on leaf nodes. */
	struct bfq_sched_data *my_sched_data;
	/* Scheduler queue this entity belongs to. */
	struct bfq_sched_data *sched_data;

	/* Values in use, each paired with the value of a pending change. */
	unsigned short ioprio, new_ioprio;
	unsigned short ioprio_class, new_ioprio_class;

	/* True when a weight, ioprio or ioprio_class change was requested. */
	int ioprio_changed;
};

struct bfq_group;

/**
 * struct bfq_queue - leaf schedulable entity.
 * @ref: reference counter.
 * @bfqd: parent bfq_data.
 * @new_bfqq: shared bfq_queue if queue is cooperating with
 *           one or more other queues.
 * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).
 * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).
 * @sort_list: sorted list of pending requests.
 * @next_rq: if fifo isn't expired, next request to serve.
 * @queued: nr of requests queued in @sort_list.
 * @allocated: currently allocated requests.
 * @meta_pending: pending metadata requests.
 * @fifo: fifo list of requests in sort_list.
 * @entity: entity representing this queue in the scheduler.
 * @max_budget: maximum budget allowed from the feedback mechanism.
 * @budget_timeout: budget expiration (in jiffies).
 * @dispatched: number of requests on the dispatch list or inside driver.
 * @flags: status flags.
 * @bfqq_list: node for active/idle bfqq list inside our bfqd.
 * @burst_list_node: node for the device's burst list.
 * @seek_samples: number of seeks sampled
 * @seek_total: sum of the distances of the seeks sampled
 * @seek_mean: mean seek distance
 * @last_request_pos: position of the last request enqueued
 * @requests_within_timer: number of consecutive pairs of request completion
 *                         and arrival, such that the queue becomes idle
 *                         after the completion, but the next request arrives
 *                         within an idle time slice; used only if the queue's
 *                         IO_bound has been cleared.
 * @pid: pid of the process owning the queue, used for logging purposes.
 * @last_wr_start_finish: start time of the current weight-raising period if
 *                        the @bfq_queue is being weight-raised, otherwise
 *                        finish time of the last weight-raising period
 * @wr_cur_max_time: current max raising time for this queue
 * @soft_rt_next_start: minimum time instant such that, only if a new
 *                      request is enqueued after this time instant in an
 *                      idle @bfq_queue with no outstanding requests, then
 *                      the task associated with the queue is deemed
 *                      soft real-time (see the comments to the function
 *                      bfq_bfqq_softrt_next_start())
 * @last_idle_bklogged: time of the last transition of the @bfq_queue from
 *                      idle to backlogged
 * @service_from_backlogged: cumulative service received from the @bfq_queue
 *                           since the last transition from idle to
 *                           backlogged
 * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the
 *	 queue is shared
 *
 * A bfq_queue is a leaf request queue; it can be associated with one
 * io_context, or with more than one if it is async or shared between
 * cooperating processes. @cgroup holds a reference to the cgroup, to be
 * sure that it does not disappear while a bfqq still references it
 * (mostly to avoid races between request issuing and task migration
 * followed by cgroup destruction).
 * All the fields are protected by the queue lock of the containing bfqd.
 */
struct bfq_queue {
	/* Reference counter. */
	atomic_t ref;
	/* Device data this queue belongs to. */
	struct bfq_data *bfqd;

	/* fields for cooperating queues handling */
	struct bfq_queue *new_bfqq;
	struct rb_node pos_node;
	struct rb_root *pos_root;

	/* Pending requests: sorted tree, next one to serve, counters, fifo. */
	struct rb_root sort_list;
	struct request *next_rq;
	int queued[2];
	int allocated[2];
	int meta_pending;
	struct list_head fifo;

	/* Entity representing this queue in the scheduler. */
	struct bfq_entity entity;

	/* Maximum budget allowed, and budget expiration (in jiffies). */
	unsigned long max_budget;
	unsigned long budget_timeout;

	/* Requests on the dispatch list or inside the driver. */
	int dispatched;

	/* Status flags: a bitmask of (1 << BFQ_BFQQ_FLAG_*) values. */
	unsigned int flags;

	/* Node for the active/idle bfqq list of the bfqd. */
	struct list_head bfqq_list;

	/* Node for the device's burst list. */
	struct hlist_node burst_list_node;

	/* Seek statistics and position of the last request enqueued. */
	unsigned int seek_samples;
	u64 seek_total;
	sector_t seek_mean;
	sector_t last_request_pos;

	/* Used only if the queue's IO_bound flag has been cleared. */
	unsigned int requests_within_timer;

	/* Pid of the owning process, used for logging purposes. */
	pid_t pid;
	/* bfq_io_cq owning the queue; NULL if the queue is shared. */
	struct bfq_io_cq *bic;

	/* weight-raising fields */
	unsigned long wr_cur_max_time;
	unsigned long soft_rt_next_start;
	unsigned long last_wr_start_finish;
	/*
	 * NOTE(review): not described in the kernel-doc above; presumably
	 * the factor currently applied to the queue's weight (compare
	 * bfq_data's bfq_wr_coeff) - confirm against its users.
	 */
	unsigned int wr_coeff;
	unsigned long last_idle_bklogged;
	unsigned long service_from_backlogged;
};

/**
 * struct bfq_ttime - per process thinktime stats.
 * @last_end_request: NOTE(review): undocumented in the original;
 *                    presumably the time at which the last request of
 *                    the process completed - confirm against its users.
 * @ttime_total: total process thinktime
 * @ttime_samples: number of thinktime samples
 * @ttime_mean: average process thinktime
 */
struct bfq_ttime {
	unsigned long last_end_request;

	unsigned long ttime_total;
	unsigned long ttime_samples;
	unsigned long ttime_mean;
};

/**
 * struct bfq_io_cq - per (request_queue, io_context) structure.
 * @icq: associated io_cq structure
 * @bfqq: array of two process queues, the sync and the async; it is
 *        indexed by the is_sync flag (see bic_to_bfqq() and
 *        bic_set_bfqq())
 * @ttime: associated @bfq_ttime struct
 * @ioprio: NOTE(review): undocumented in the original; presumably the
 *          ioprio value last seen for this process, examined by
 *          bfq_check_ioprio_change() - confirm against its users
 * @wr_time_left: snapshot of the time left before weight raising ends
 *                for the sync queue associated to this process; this
 *		  snapshot is taken to remember this value while the weight
 *		  raising is suspended because the queue is merged with a
 *		  shared queue, and is used to set the queue's
 *		  @wr_cur_max_time when the queue is split from the shared
 *		  queue and its weight is raised again
 * @saved_idle_window: same purpose as the previous field for the idle
 *                     window
 * @saved_IO_bound: same purpose as the previous two fields for the I/O
 *                  bound classification of a queue
 * @saved_in_large_burst: same purpose as the previous fields for the
 *                        value of the field keeping the queue's belonging
 *                        to a large burst
 * @was_in_burst_list: true if the queue belonged to a burst list
 *                     before its merge with another cooperating queue
 * @cooperations: counter of consecutive successful queue merges undergone
 *                by any of the process' @bfq_queues
 * @failed_cooperations: counter of consecutive failed queue merges of any
 *                       of the process' @bfq_queues
 */
struct bfq_io_cq {
	struct io_cq icq; /* must be the first member */
	struct bfq_queue *bfqq[2];
	struct bfq_ttime ttime;
	int ioprio;

	unsigned int wr_time_left;
	bool saved_idle_window;
	bool saved_IO_bound;

	bool saved_in_large_burst;
	bool was_in_burst_list;

	unsigned int cooperations;
	unsigned int failed_cooperations;
};

/* Device-speed classes, as stored in bfq_data's @device_speed field. */
enum bfq_device_speed {
	BFQ_BFQD_FAST = 0,
	BFQ_BFQD_SLOW = 1,
};

/**
 * struct bfq_data - per device data structure.
 * @queue: request queue for the managed device.
 * @root_group: root bfq_group for the device.
 * @rq_pos_tree: rbtree sorted by next_request position, used when
 *               determining if two or more queues have interleaving
 *               requests (see bfq_close_cooperator()).
 * @active_numerous_groups: number of bfq_groups containing more than one
 *                          active @bfq_entity.
 * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by
 *                      weight. Used to keep track of whether all @bfq_queues
 *                     have the same weight. The tree contains one counter
 *                     for each distinct weight associated to some active
 *                     and not weight-raised @bfq_queue (see the comments to
 *                      the functions bfq_weights_tree_[add|remove] for
 *                     further details).
 * @group_weights_tree: rbtree of non-queue @bfq_entity weight counters, sorted
 *                      by weight. Used to keep track of whether all
 *                     @bfq_groups have the same weight. The tree contains
 *                     one counter for each distinct weight associated to
 *                     some active @bfq_group (see the comments to the
 *                     functions bfq_weights_tree_[add|remove] for further
 *                     details).
 * @busy_queues: number of bfq_queues containing requests (including the
 *		 queue in service, even if it is idling).
 * @busy_in_flight_queues: number of @bfq_queues containing pending or
 *                         in-flight requests, plus the @bfq_queue in
 *                         service, even if idle but waiting for the
 *                         possible arrival of its next sync request. This
 *                         field is updated only if the device is rotational,
 *                         but used only if the device is also NCQ-capable.
 *                         The reason why the field is updated also for non-
 *                         NCQ-capable rotational devices is related to the
 *                         fact that the value of @hw_tag may be set also
 *                         later than when busy_in_flight_queues may need to
 *                         be incremented for the first time(s). Taking also
 *                         this possibility into account, to avoid unbalanced
 *                         increments/decrements, would imply more overhead
 *                         than just updating busy_in_flight_queues
 *                         regardless of the value of @hw_tag.
 * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues
 *                                     (that is, seeky queues that expired
 *                                     for budget timeout at least once)
 *                                     containing pending or in-flight
 *                                     requests, including the in-service
 *                                     @bfq_queue if constantly seeky. This
 *                                     field is updated only if the device
 *                                     is rotational, but used only if the
 *                                     device is also NCQ-capable (see the
 *                                     comments to @busy_in_flight_queues).
 * @wr_busy_queues: number of weight-raised busy @bfq_queues.
 * @queued: number of queued requests.
 * @rq_in_driver: number of requests dispatched and waiting for completion.
 * @sync_flight: number of sync requests in the driver.
 * @max_rq_in_driver: max number of reqs in driver in the last
 *                    @hw_tag_samples completed requests.
 * @hw_tag_samples: nr of samples used to calculate hw_tag.
 * @hw_tag: flag set to one if the driver is showing a queueing behavior.
 * @budgets_assigned: number of budgets assigned.
 * @idle_slice_timer: timer set when idling for the next sequential request
 *                    from the queue in service.
 * @unplug_work: delayed work to restart dispatching on the request queue.
 * @in_service_queue: bfq_queue in service.
 * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue.
 * @last_position: on-disk position of the last served request.
 * @last_budget_start: beginning of the last budget.
 * @last_idling_start: beginning of the last idle slice.
 * @peak_rate: peak transfer rate observed for a budget.
 * @peak_rate_samples: number of samples used to calculate @peak_rate.
 * @bfq_max_budget: maximum budget allotted to a bfq_queue before
 *                  rescheduling.
 * @group_list: list of all the bfq_groups active on the device.
 * @active_list: list of all the bfq_queues active on the device.
 * @idle_list: list of all the bfq_queues idle on the device.
 * @bfq_fifo_expire: timeout for async/sync requests; when it expires
 *                   requests are served in fifo order.
 * @bfq_back_penalty: weight of backward seeks wrt forward ones.
 * @bfq_back_max: maximum allowed backward seek.
 * @bfq_slice_idle: maximum idling time.
 * @bfq_user_max_budget: user-configured max budget value
 *                       (0 for auto-tuning).
 * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
 *                           async queues.
 * @bfq_timeout: timeout for bfq_queues to consume their budget; used to
 *               prevent seeky queues from imposing long latencies on
 *               well-behaved ones (this also implies that seeky queues
 *               cannot receive guarantees in the service domain; after a
 *               timeout they are charged for the whole allocated budget,
 *               to try to preserve a behavior reasonably fair among them,
 *               but without service-domain guarantees).
 * @bfq_coop_thresh: number of queue merges after which a @bfq_queue is
 *                   no more granted any weight-raising.
 * @bfq_failed_cooperations: number of consecutive failed cooperation
 *                           chances after which weight-raising is restored
 *                           to a queue subject to more than bfq_coop_thresh
 *                           queue merges.
 * @bfq_requests_within_timer: number of consecutive requests that must be
 *                             issued within the idle time slice to set
 *                             again idling to a queue which was marked as
 *                             non-I/O-bound (see the definition of the
 *                             IO_bound flag for further details).
 * @last_ins_in_burst: last time at which a queue entered the current
 *                     burst of queues being activated shortly after
 *                     each other; for more details about this and the
 *                     following parameters related to a burst of
 *                     activations, see the comments to the function
 *                     @bfq_handle_burst.
 * @bfq_burst_interval: reference time interval used to decide whether a
 *                      queue has been activated shortly after
 *                      @last_ins_in_burst.
 * @burst_size: number of queues in the current burst of queue activations.
 * @bfq_large_burst_thresh: maximum burst size above which the current
 * 			    queue-activation burst is deemed as 'large'.
 * @large_burst: true if a large queue-activation burst is in progress.
 * @burst_list: head of the burst list (as for the above fields, more details
 * 		in the comments to the function bfq_handle_burst).
 * @low_latency: if set to true, low-latency heuristics are enabled.
 * @bfq_wr_coeff: maximum factor by which the weight of a weight-raised
 *                queue is multiplied.
 * @bfq_wr_max_time: maximum duration of a weight-raising period (jiffies).
 * @bfq_wr_rt_max_time: maximum duration for soft real-time processes.
 * @bfq_wr_min_idle_time: minimum idle period after which weight-raising
 *			  may be reactivated for a queue (in jiffies).
 * @bfq_wr_min_inter_arr_async: minimum period between request arrivals
 *				after which weight-raising may be
 *				reactivated for an already busy queue
 *				(in jiffies).
 * @bfq_wr_max_softrt_rate: max service-rate for a soft real-time queue,
 *			    in sectors per second.
 * @RT_prod: cached value of the product R*T used for computing the maximum
 *	     duration of the weight raising automatically.
 * @device_speed: device-speed class for the low-latency heuristic.
 * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions.
 *
 * All the fields are protected by the @queue lock.
 */
struct bfq_data {
	/* Request queue of the managed device; its lock protects all fields. */
	struct request_queue *queue;

	/* Root group of the device and request-position tree of the queues. */
	struct bfq_group *root_group;
	struct rb_root rq_pos_tree;

#ifdef CONFIG_CGROUP_BFQIO
	/* Groups with more than one active entity; cgroup builds only. */
	int active_numerous_groups;
#endif

	/* Weight-counter trees for queues and for groups. */
	struct rb_root queue_weights_tree;
	struct rb_root group_weights_tree;

	/* Counters of busy queues and of queued/in-driver requests. */
	int busy_queues;
	int busy_in_flight_queues;
	int const_seeky_busy_in_flight_queues;
	int wr_busy_queues;
	int queued;
	int rq_in_driver;
	int sync_flight;

	/* Detection of a queueing behavior in the driver (hw_tag). */
	int max_rq_in_driver;
	int hw_tag_samples;
	int hw_tag;

	/* Number of budgets assigned. */
	int budgets_assigned;

	/* Idling timer and work item used to restart dispatching. */
	struct timer_list idle_slice_timer;
	struct work_struct unplug_work;

	/* Queue in service and the bfq_io_cq associated with it. */
	struct bfq_queue *in_service_queue;
	struct bfq_io_cq *in_service_bic;

	/* On-disk position of the last served request. */
	sector_t last_position;

	/* Budget/idling start times and peak-rate estimation state. */
	ktime_t last_budget_start;
	ktime_t last_idling_start;
	int peak_rate_samples;
	u64 peak_rate;
	unsigned long bfq_max_budget;

	/* Lists of the groups and of the active/idle queues of the device. */
	struct hlist_head group_list;
	struct list_head active_list;
	struct list_head idle_list;

	/* Tunables; see the kernel-doc above for each of them. */
	unsigned int bfq_fifo_expire[2];
	unsigned int bfq_back_penalty;
	unsigned int bfq_back_max;
	unsigned int bfq_slice_idle;
	/*
	 * NOTE(review): not described in the kernel-doc above; presumably
	 * the time at which the idle ioprio class was last served (compare
	 * BFQ_CL_IDLE_TIMEOUT) - confirm against its users.
	 */
	u64 bfq_class_idle_last_service;

	unsigned int bfq_user_max_budget;
	unsigned int bfq_max_budget_async_rq;
	unsigned int bfq_timeout[2];

	/* Thresholds driving queue cooperation and the IO_bound heuristic. */
	unsigned int bfq_coop_thresh;
	unsigned int bfq_failed_cooperations;
	unsigned int bfq_requests_within_timer;

	/* State and parameters of the queue-activation burst detection. */
	unsigned long last_ins_in_burst;
	unsigned long bfq_burst_interval;
	int burst_size;
	unsigned long bfq_large_burst_thresh;
	bool large_burst;
	struct hlist_head burst_list;

	/* If true, the low-latency heuristics are enabled. */
	bool low_latency;

	/* parameters of the low_latency heuristics */
	unsigned int bfq_wr_coeff;
	unsigned int bfq_wr_max_time;
	unsigned int bfq_wr_rt_max_time;
	unsigned int bfq_wr_min_idle_time;
	unsigned long bfq_wr_min_inter_arr_async;
	unsigned int bfq_wr_max_softrt_rate;
	u64 RT_prod;
	enum bfq_device_speed device_speed;

	/* Fallback dummy queue for extreme OOM conditions. */
	struct bfq_queue oom_bfqq;
};

/*
 * Bit numbers of the status flags kept in bfq_queue's @flags field; the
 * accessors generated by BFQ_BFQQ_FNS() turn each of them into a mask.
 */
enum bfqq_state_flags {
	/* has requests or is in service */
	BFQ_BFQQ_FLAG_busy		= 0,
	/* waiting for a request */
	BFQ_BFQQ_FLAG_wait_request	= 1,
	/* must be allowed rq alloc */
	BFQ_BFQQ_FLAG_must_alloc	= 2,
	/* FIFO checked in this slice */
	BFQ_BFQQ_FLAG_fifo_expire	= 3,
	/* slice idling enabled */
	BFQ_BFQQ_FLAG_idle_window	= 4,
	/* synchronous queue */
	BFQ_BFQQ_FLAG_sync		= 5,
	/* no completion with this budget */
	BFQ_BFQQ_FLAG_budget_new	= 6,
	/*
	 * bfqq has timed-out at least once having consumed at most 2/10
	 * of its budget
	 */
	BFQ_BFQQ_FLAG_IO_bound		= 7,
	/* bfqq activated in a large burst, see comments to bfq_handle_burst. */
	BFQ_BFQQ_FLAG_in_large_burst	= 8,
	/* bfqq has proved to be slow and seeky until budget timeout */
	BFQ_BFQQ_FLAG_constantly_seeky	= 9,
	/* may need softrt-next-start update */
	BFQ_BFQQ_FLAG_softrt_update	= 10,
	/* bfqq is shared */
	BFQ_BFQQ_FLAG_coop		= 11,
	/* shared bfqq will be split */
	BFQ_BFQQ_FLAG_split_coop	= 12,
	/* queue has just been split */
	BFQ_BFQQ_FLAG_just_split	= 13,
};

/*
 * Generate the three accessors of one bfq_queue status flag; @name is the
 * suffix of the matching BFQ_BFQQ_FLAG_* bit number:
 *   bfq_mark_bfqq_<name>()  - set the flag in bfqq->flags;
 *   bfq_clear_bfqq_<name>() - clear the flag;
 *   bfq_bfqq_<name>()       - return 1 if the flag is set, 0 otherwise.
 */
#define BFQ_BFQQ_FNS(name)						\
static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)		\
{									\
	bfqq->flags |= 1U << BFQ_BFQQ_FLAG_##name;			\
}									\
static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq)	\
{									\
	bfqq->flags &= ~(1U << BFQ_BFQQ_FLAG_##name);			\
}									\
static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq)		\
{									\
	return !!(bfqq->flags & (1U << BFQ_BFQQ_FLAG_##name));		\
}

/* One set of accessors per flag, in the order of enum bfqq_state_flags. */
BFQ_BFQQ_FNS(busy);
BFQ_BFQQ_FNS(wait_request);
BFQ_BFQQ_FNS(must_alloc);
BFQ_BFQQ_FNS(fifo_expire);
BFQ_BFQQ_FNS(idle_window);
BFQ_BFQQ_FNS(sync);
BFQ_BFQQ_FNS(budget_new);
BFQ_BFQQ_FNS(IO_bound);
BFQ_BFQQ_FNS(in_large_burst);
BFQ_BFQQ_FNS(constantly_seeky);
BFQ_BFQQ_FNS(softrt_update);
BFQ_BFQQ_FNS(coop);
BFQ_BFQQ_FNS(split_coop);
BFQ_BFQQ_FNS(just_split);
#undef BFQ_BFQQ_FNS

/*
 * Logging facilities: both helpers emit a message through
 * blk_add_trace_msg() on the request queue of @bfqd.
 */

/* Per-queue message, prefixed with "bfq" followed by the queue's pid. */
#define bfq_log_bfqq(bfqd, bfqq, fmt, ...) \
	blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, \
			  ##__VA_ARGS__)

/* Device-wide message, prefixed with "bfq". */
#define bfq_log(bfqd, fmt, ...) \
	blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##__VA_ARGS__)

/* Expiration reasons. */
/* Reasons why a bfq_queue may be expired. */
enum bfqq_expiration {
	/* queue has been idling for too long */
	BFQ_BFQQ_TOO_IDLE		= 0,
	/* budget took too long to be used */
	BFQ_BFQQ_BUDGET_TIMEOUT		= 1,
	/* budget consumed */
	BFQ_BFQQ_BUDGET_EXHAUSTED	= 2,
	/* the queue has no more requests */
	BFQ_BFQQ_NO_MORE_REQUESTS	= 3,
};

#ifdef CONFIG_CGROUP_BFQIO
/**
 * struct bfq_group - per (device, cgroup) data structure.
 * @entity: schedulable entity to insert into the parent group sched_data.
 * @sched_data: own sched_data, to contain child entities (they may be
 *              both bfq_queues and bfq_groups).
 * @group_node: node to be inserted into the bfqio_cgroup->group_data
 *              list of the containing cgroup's bfqio_cgroup.
 * @bfqd_node: node to be inserted into the @bfqd->group_list list
 *             of the groups active on the same device; used for cleanup.
 * @bfqd: the bfq_data for the device this group acts upon.
 * @async_bfqq: array of async queues for all the tasks belonging to
 *              the group, one queue per ioprio value per ioprio_class,
 *              except for the idle class that has only one queue.
 * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
 * @my_entity: pointer to @entity, %NULL for the toplevel group; used
 *             to avoid too many special cases during group creation/
 *             migration.
 * @active_entities: number of active entities belonging to the group;
 *                   unused for the root group. Used to know whether there
 *                   are groups with more than one active @bfq_entity
 *                   (see the comments to the function
 *                   bfq_bfqq_must_not_expire()).
 *
 * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
 * there is a set of bfq_groups, each one collecting the lower-level
 * entities belonging to the group that are acting on the same device.
 *
 * Locking works as follows:
 *    o @group_node is protected by the bfqio_cgroup lock, and is accessed
 *      via RCU from its readers.
 *    o @bfqd is protected by the queue lock, RCU is used to access it
 *      from the readers.
 *    o All the other fields are protected by the @bfqd queue lock.
 */
struct bfq_group {
	/* Entity inserted into the sched_data of the parent group. */
	struct bfq_entity entity;
	/* Own sched_data, containing the child queues and groups. */
	struct bfq_sched_data sched_data;

	/* Links into the cgroup's group_data list and bfqd's group_list. */
	struct hlist_node group_node;
	struct hlist_node bfqd_node;

	/*
	 * Device data this group acts upon.  NOTE(review): presumably kept
	 * as a void pointer so that its address can be handed to
	 * bfq_get_bfqd_locked(), which takes a void ** - confirm.
	 */
	void *bfqd;

	/*
	 * Async queues: one per ioprio value for each non-idle class, plus
	 * a single queue for the idle class.
	 */
	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
	struct bfq_queue *async_idle_bfqq;

	/* Points to @entity; NULL for the toplevel group. */
	struct bfq_entity *my_entity;

	/* Active entities of the group; unused for the root group. */
	int active_entities;
};

/**
 * struct bfqio_cgroup - bfq cgroup data structure.
 * @css: subsystem state for bfq in the containing cgroup.
 * @weight: cgroup weight.
 * @ioprio: cgroup ioprio.
 * @ioprio_class: cgroup ioprio_class.
 * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
 * @group_data: list containing the bfq_group belonging to this cgroup.
 *
 * @group_data is accessed using RCU, with @lock protecting the updates,
 * @ioprio and @ioprio_class are protected by @lock.
 */
struct bfqio_cgroup {
	/* Subsystem state for bfq in the containing cgroup. */
	struct cgroup_subsys_state css;

	/* Cgroup-wide scheduling parameters. */
	unsigned short weight, ioprio, ioprio_class;

	/* Protects ioprio, ioprio_class and updates of group_data. */
	spinlock_t lock;
	/* bfq_groups of this cgroup; readers traverse it under RCU. */
	struct hlist_head group_data;
};
#else
/*
 * Reduced bfq_group used when CONFIG_CGROUP_BFQIO is not defined: it only
 * carries a sched_data and the async queues.
 */
struct bfq_group {
	struct bfq_sched_data sched_data;

	/* Same async-queue fields as in the CONFIG_CGROUP_BFQIO variant. */
	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
	struct bfq_queue *async_idle_bfqq;
};
#endif

/**
 * bfq_entity_service_tree - service tree an entity is scheduled on.
 * @entity: the entity to look up.
 *
 * Returns the element of the service_tree array of @entity->sched_data
 * selected by the entity's ioprio_class minus one.  BUGs if that index
 * falls outside the array or if the entity has no sched_data.
 */
static inline struct bfq_service_tree *
bfq_entity_service_tree(struct bfq_entity *entity)
{
	struct bfq_sched_data *sd = entity->sched_data;
	unsigned int class_idx = entity->ioprio_class - 1;

	BUG_ON(class_idx >= BFQ_IOPRIO_CLASSES);
	BUG_ON(sd == NULL);

	return &sd->service_tree[class_idx];
}

/*
 * Return the queue stored in @bic for the requested kind: slot 1 of
 * @bic->bfqq when @is_sync is true, slot 0 otherwise.
 */
static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,
					    bool is_sync)
{
	struct bfq_queue **slot = &bic->bfqq[is_sync];

	return *slot;
}

/*
 * Store @bfqq in @bic as the queue of the given kind: slot 1 of
 * @bic->bfqq when @is_sync is true, slot 0 otherwise.
 */
static inline void bic_set_bfqq(struct bfq_io_cq *bic,
				struct bfq_queue *bfqq, bool is_sync)
{
	struct bfq_queue **slot = &bic->bfqq[is_sync];

	*slot = bfqq;
}

static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
{
	return bic->icq.q->elevator->elevator_data;
}

/**
 * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer.
 * @ptr: a pointer to a bfqd.
 * @flags: storage for the flags to be saved.
 *
 * This function allows bfqg->bfqd to be protected by the
 * queue lock of the bfqd they reference; the pointer is dereferenced
 * under RCU, so the storage for bfqd is assured to be safe as long
 * as the RCU read side critical section does not end.  After the
 * bfqd->queue->queue_lock is taken the pointer is rechecked, to be
 * sure that no other writer accessed it.  If we raced with a writer,
 * the function returns NULL, with the queue unlocked, otherwise it
 * returns the dereferenced pointer, with the queue locked.
 */
static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
						   unsigned long *flags)
{
	struct bfq_data *candidate;
	struct bfq_data *locked = NULL;

	rcu_read_lock();
	candidate = rcu_dereference(*(struct bfq_data **)ptr);

	if (candidate) {
		spin_lock_irqsave(candidate->queue->queue_lock, *flags);
		/*
		 * Keep the lock only if *ptr still designates the bfqd that
		 * was just locked; otherwise drop it and report failure.
		 */
		if (*ptr == candidate)
			locked = candidate;
		else
			spin_unlock_irqrestore(candidate->queue->queue_lock,
					       *flags);
	}

	rcu_read_unlock();
	return locked;
}

static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,
				       unsigned long *flags)
{
	spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
}

/*
 * Forward declarations.  These functions have internal linkage, so each
 * one must be defined in the translation unit that includes this header;
 * declaring them here lets code placed before their definitions call them.
 */
static void bfq_check_ioprio_change(struct io_context *ioc,
				    struct bfq_io_cq *bic);
static void bfq_put_queue(struct bfq_queue *bfqq);
static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
				       struct bfq_group *bfqg, int is_sync,
				       struct io_context *ioc, gfp_t gfp_mask);
static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
				    struct bfq_group *bfqg);
static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);

#endif /* _BFQ_H */