From 2db938bee32e7469ca8ed9bfb3a05535f28c680d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 21 Feb 2011 17:25:37 +0100 Subject: jbd: Refine commit writeout logic Currently we write out all journal buffers in WRITE_SYNC mode. This improves performance for fsync-heavy workloads but hinders performance when writes are mostly asynchronous, most noticeably it slows down readers and users complain about slow desktop response etc. So submit writes as asynchronous in the normal case and only submit writes as WRITE_SYNC if we detect someone is waiting for current transaction commit. I've gathered some numbers to back this change. The first is the read latency test. It measures time to read 1 MB after several seconds of sleeping in presence of streaming writes. Top 10 times (out of 90) in us: Before After 2131586 697473 1709932 557487 1564598 535642 1480462 347573 1478579 323153 1408496 222181 1388960 181273 1329565 181070 1252486 172832 1223265 172278 Average: 619377 82180 So the improvement in both maximum and average latency is massive. I've measured fsync throughput by: fs_mark -n 100 -t 1 -s 16384 -d /mnt/fsync/ -S 1 -L 4 in presence of streaming reader. The numbers (fsyncs/s) are: Before After 9.9 6.3 6.8 6.0 6.3 6.2 5.8 6.1 So fsync performance seems unharmed by this change. 
Signed-off-by: Jan Kara --- include/trace/events/jbd.h | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/jbd.h b/include/trace/events/jbd.h index aff64d82d713..9305e1b5edc3 100644 --- a/include/trace/events/jbd.h +++ b/include/trace/events/jbd.h @@ -36,19 +36,17 @@ DECLARE_EVENT_CLASS(jbd_commit, TP_STRUCT__entry( __field( dev_t, dev ) - __field( char, sync_commit ) __field( int, transaction ) ), TP_fast_assign( __entry->dev = journal->j_fs_dev->bd_dev; - __entry->sync_commit = commit_transaction->t_synchronous_commit; __entry->transaction = commit_transaction->t_tid; ), - TP_printk("dev %d,%d transaction %d sync %d", + TP_printk("dev %d,%d transaction %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->transaction, __entry->sync_commit) + __entry->transaction) ); DEFINE_EVENT(jbd_commit, jbd_start_commit, @@ -87,19 +85,17 @@ TRACE_EVENT(jbd_drop_transaction, TP_STRUCT__entry( __field( dev_t, dev ) - __field( char, sync_commit ) __field( int, transaction ) ), TP_fast_assign( __entry->dev = journal->j_fs_dev->bd_dev; - __entry->sync_commit = commit_transaction->t_synchronous_commit; __entry->transaction = commit_transaction->t_tid; ), - TP_printk("dev %d,%d transaction %d sync %d", + TP_printk("dev %d,%d transaction %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->transaction, __entry->sync_commit) + __entry->transaction) ); TRACE_EVENT(jbd_end_commit, @@ -109,21 +105,19 @@ TRACE_EVENT(jbd_end_commit, TP_STRUCT__entry( __field( dev_t, dev ) - __field( char, sync_commit ) __field( int, transaction ) __field( int, head ) ), TP_fast_assign( __entry->dev = journal->j_fs_dev->bd_dev; - __entry->sync_commit = commit_transaction->t_synchronous_commit; __entry->transaction = commit_transaction->t_tid; __entry->head = journal->j_tail_sequence; ), - TP_printk("dev %d,%d transaction %d sync %d head %d", + TP_printk("dev %d,%d transaction %d head %d", MAJOR(__entry->dev), 
MINOR(__entry->dev), - __entry->transaction, __entry->sync_commit, __entry->head) + __entry->transaction, __entry->head) ); TRACE_EVENT(jbd_do_submit_data, @@ -133,19 +127,17 @@ TRACE_EVENT(jbd_do_submit_data, TP_STRUCT__entry( __field( dev_t, dev ) - __field( char, sync_commit ) __field( int, transaction ) ), TP_fast_assign( __entry->dev = journal->j_fs_dev->bd_dev; - __entry->sync_commit = commit_transaction->t_synchronous_commit; __entry->transaction = commit_transaction->t_tid; ), - TP_printk("dev %d,%d transaction %d sync %d", + TP_printk("dev %d,%d transaction %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->transaction, __entry->sync_commit) + __entry->transaction) ); TRACE_EVENT(jbd_cleanup_journal_tail, -- cgit v1.2.3 From 9754e39c7bc51328f145e933bfb0df47cd67b6e9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sat, 7 Apr 2012 12:33:03 +0200 Subject: jbd: Split updating of journal superblock and marking journal empty There are three cases of updating journal superblock. In the first case, we want to mark journal as empty (setting s_sequence to 0), in the second case we want to update log tail, in the third case we want to update s_errno. Split these cases into separate functions. It makes the code slightly more straightforward and later patches will make the distinction even more important. 
Signed-off-by: Jan Kara --- include/trace/events/jbd.h | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/jbd.h b/include/trace/events/jbd.h index 9305e1b5edc3..d9658a940a39 100644 --- a/include/trace/events/jbd.h +++ b/include/trace/events/jbd.h @@ -169,24 +169,20 @@ TRACE_EVENT(jbd_cleanup_journal_tail, __entry->block_nr, __entry->freed) ); -TRACE_EVENT(jbd_update_superblock_end, - TP_PROTO(journal_t *journal, int wait), +TRACE_EVENT(journal_write_superblock, + TP_PROTO(journal_t *journal), - TP_ARGS(journal, wait), + TP_ARGS(journal), TP_STRUCT__entry( __field( dev_t, dev ) - __field( int, wait ) ), TP_fast_assign( __entry->dev = journal->j_fs_dev->bd_dev; - __entry->wait = wait; ), - TP_printk("dev %d,%d wait %d", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->wait) + TP_printk("dev %d,%d", MAJOR(__entry->dev), MINOR(__entry->dev)) ); #endif /* _TRACE_JBD_H */ -- cgit v1.2.3 From fd2cbd4dfa3db477dd6226d387d3f1911d36a6a9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sat, 7 Apr 2012 11:05:19 +0200 Subject: jbd: Write journal superblock with WRITE_FUA after checkpointing If journal superblock is written only in disk's caches and other transaction starts reusing space of the transaction cleaned from the log, it can happen that blocks of a new transaction reach the disk before journal superblock. When power failure happens in such a case, subsequent journal replay would still try to replay the old transaction but some of its blocks may be already overwritten by the new transaction. For this reason we must use WRITE_FUA when updating log tail and we must first write new log tail to disk and update in-memory information only after that. 
Signed-off-by: Jan Kara --- include/trace/events/jbd.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/jbd.h b/include/trace/events/jbd.h index d9658a940a39..da6f2591c25e 100644 --- a/include/trace/events/jbd.h +++ b/include/trace/events/jbd.h @@ -170,19 +170,22 @@ TRACE_EVENT(jbd_cleanup_journal_tail, ); TRACE_EVENT(journal_write_superblock, - TP_PROTO(journal_t *journal), + TP_PROTO(journal_t *journal, int write_op), - TP_ARGS(journal), + TP_ARGS(journal, write_op), TP_STRUCT__entry( __field( dev_t, dev ) + __field( int, write_op ) ), TP_fast_assign( __entry->dev = journal->j_fs_dev->bd_dev; + __entry->write_op = write_op; ), - TP_printk("dev %d,%d", MAJOR(__entry->dev), MINOR(__entry->dev)) + TP_printk("dev %d,%d write_op %x", MAJOR(__entry->dev), + MINOR(__entry->dev), __entry->write_op) ); #endif /* _TRACE_JBD_H */ -- cgit v1.2.3