From 9fa8cfe706f9c20067c042a064999d5825a35330 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 13 Mar 2009 10:24:59 -0400
Subject: Btrfs: don't preallocate metadata blocks during btrfs_search_slot

In order to avoid doing expensive extent management with tree locks held,
btrfs_search_slot will preallocate tree blocks for use by COW without
any tree locks held.

A later commit moves all of the extent allocation work for COW into
a delayed update mechanism, and this preallocation will no longer be
required.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs/ctree.h')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5e1d4e30e9d..3a37ba7a8d6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1838,7 +1838,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, struct extent_buffer *buf,
 		    struct extent_buffer *parent, int parent_slot,
-		    struct extent_buffer **cow_ret, u64 prealloc_dest);
+		    struct extent_buffer **cow_ret);
 int btrfs_copy_root(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      struct extent_buffer *buf,
-- 
cgit v1.2.3


From 56bec294dea971335d4466b30f2d959f28f6e36d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 13 Mar 2009 10:10:06 -0400
Subject: Btrfs: do extent allocation and reference count updates in the
 background

The extent allocation tree maintains a reference count and full
back reference information for every extent allocated in the
filesystem.  For subvolume and snapshot trees, every time
a block goes through COW, the new copy of the block adds a reference
on every block it points to.

If a btree node points to 150 leaves, then the COW code needs to go
and add backrefs on 150 different extents, which might be spread all
over the extent allocation tree.

These updates currently happen during btrfs_cow_block, and most COWs
happen during btrfs_search_slot.  btrfs_search_slot has locks held
on both the parent and the node we are COWing, and so we really want
to avoid IO during the COW if we can.

This commit adds an rbtree of pending reference count updates and extent
allocations.  The tree is ordered by byte number of the extent and byte number
of the parent for the back reference.  The tree allows us to:

1) Modify back references in something close to disk order, reducing seeks
2) Significantly reduce the number of modifications made as block pointers
are balanced around
3) Do all of the extent insertion and back reference modifications outside
of the performance critical btrfs_search_slot code.

#3 has the added benefit of greatly reducing the btrfs stack footprint.
The extent allocation tree modifications are done without the deep
(and somewhat recursive) call chains used in the past.

These delayed back reference updates must be done before the transaction
commits, and so the rbtree is tied to the transaction.  Throttling is
implemented to help keep the queue of backrefs at a reasonable size.

Since there was a similar mechanism in place for the extent tree
extents, that is removed and replaced by the delayed reference tree.

Yan Zheng <yan.zheng@oracle.com> helped review and fixup this code.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'fs/btrfs/ctree.h')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3a37ba7a8d6..ced5fd85dc3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -688,8 +688,6 @@ struct btrfs_fs_info {
 	struct rb_root block_group_cache_tree;
 
 	struct extent_io_tree pinned_extents;
-	struct extent_io_tree pending_del;
-	struct extent_io_tree extent_ins;
 
 	/* logical->physical extent mapping */
 	struct btrfs_mapping_tree mapping_tree;
@@ -717,7 +715,6 @@ struct btrfs_fs_info {
 	struct mutex tree_log_mutex;
 	struct mutex transaction_kthread_mutex;
 	struct mutex cleaner_mutex;
-	struct mutex extent_ins_mutex;
 	struct mutex pinned_mutex;
 	struct mutex chunk_mutex;
 	struct mutex drop_mutex;
@@ -1704,18 +1701,15 @@ static inline struct dentry *fdentry(struct file *file)
 }
 
 /* extent-tree.c */
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root, unsigned long count);
 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
-int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root, u64 bytenr,
-			    u64 num_bytes, u32 *refs);
 int btrfs_update_pinned_extents(struct btrfs_root *root,
 				u64 bytenr, u64 num, int pin);
 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct extent_buffer *leaf);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, u64 objectid, u64 bytenr);
-int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
-			 struct btrfs_root *root);
 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
 struct btrfs_block_group_cache *btrfs_lookup_block_group(
 						 struct btrfs_fs_info *info,
@@ -1777,7 +1771,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 			 u64 root_objectid, u64 ref_generation,
 			 u64 owner_objectid);
 int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root, u64 bytenr,
+			    struct btrfs_root *root, u64 bytenr, u64 num_bytes,
 			    u64 orig_parent, u64 parent,
 			    u64 root_objectid, u64 ref_generation,
 			    u64 owner_objectid);
-- 
cgit v1.2.3


From c3e69d58e86c3917ae4e9e31b4acf490a7cafe60 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 13 Mar 2009 10:17:05 -0400
Subject: Btrfs: process the delayed reference queue in clusters

The delayed reference queue maintains pending operations that need to
be done to the extent allocation tree.  These are processed by
finding records in the tree that are not currently being processed one at
a time.

This is slow because it uses lots of time searching through the rbtree
and because it creates lock contention on the extent allocation tree
when lots of different procs are running delayed refs at the same time.

This commit changes things to grab a cluster of refs for processing,
using a cursor into the rbtree as the starting point of the next search.
This way we walk smoothly through the rbtree.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/btrfs/ctree.h')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ced5fd85dc3..08d9f8d1553 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -720,6 +720,7 @@ struct btrfs_fs_info {
 	struct mutex drop_mutex;
 	struct mutex volume_mutex;
 	struct mutex tree_reloc_mutex;
+
 	struct list_head trans_list;
 	struct list_head hashers;
 	struct list_head dead_roots;
-- 
cgit v1.2.3


From b9473439d3e84d9fc1a0a83faca69cc1b7566341 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 13 Mar 2009 11:00:37 -0400
Subject: Btrfs: leave btree locks spinning more often

btrfs_mark_buffer dirty would set dirty bits in the extent_io tree
for the buffers it was dirtying.  This may require a kmalloc and it
was not atomic.  So, anyone who called btrfs_mark_buffer_dirty had to
set any btree locks they were holding to blocking first.

This commit changes dirty tracking for extent buffers to just use a flag
in the extent buffer.  Now that we have one and only one extent buffer
per page, this can be safely done without losing dirty bits along the way.

This also introduces a path->leave_spinning flag that callers of
btrfs_search_slot can use to indicate they will properly deal with a
path returned where all the locks are spinning instead of blocking.

Many of the btree search callers now expect spinning paths,
resulting in better btree concurrency overall.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'fs/btrfs/ctree.h')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 08d9f8d1553..4ddce91cf3f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -401,15 +401,16 @@ struct btrfs_path {
 	int locks[BTRFS_MAX_LEVEL];
 	int reada;
 	/* keep some upper locks as we walk down */
-	int keep_locks;
-	int skip_locking;
 	int lowest_level;
 
 	/*
 	 * set by btrfs_split_item, tells search_slot to keep all locks
 	 * and to force calls to keep space in the nodes
 	 */
-	int search_for_split;
+	unsigned int search_for_split:1;
+	unsigned int keep_locks:1;
+	unsigned int skip_locking:1;
+	unsigned int leave_spinning:1;
 };
 
 /*
@@ -779,6 +780,11 @@ struct btrfs_fs_info {
 	atomic_t throttle_gen;
 
 	u64 total_pinned;
+
+	/* protected by the delalloc lock, used to keep from writing
+	 * metadata until there is a nice batch
+	 */
+	u64 dirty_metadata_bytes;
 	struct list_head dirty_cowonly_roots;
 
 	struct btrfs_fs_devices *fs_devices;
-- 
cgit v1.2.3


From 12fcfd22fe5bf4fe74710232098bc101af497995 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 24 Mar 2009 10:24:20 -0400
Subject: Btrfs: tree logging unlink/rename fixes

The tree logging code allows individual files or directories to be logged
without including operations on other files and directories in the FS.
It tries to commit the minimal set of changes to disk in order to
fsync the single file or directory that was sent to fsync or O_SYNC.

The tree logging code was allowing files and directories to be unlinked
if they were part of a rename operation where only one directory
in the rename was in the fsync log.  This patch adds a few new rules
to the tree logging.

1) on rename or unlink, if the inode being unlinked isn't in the fsync
log, we must force a full commit before doing an fsync of the directory
where the unlink was done.  The commit isn't done during the unlink,
but it is forced the next time we try to log the parent directory.

Solution: record transid of last unlink/rename per directory when the
directory wasn't already logged.  For renames this is only done when
renaming to a different directory.

mkdir foo/some_dir
normal commit
rename foo/some_dir foo2/some_dir
mkdir foo/some_dir
fsync foo/some_dir/some_file

The fsync above will unlink the original some_dir without recording
it in its new location (foo2).  After a crash, some_dir will be gone
unless the fsync of some_file forces a full commit

2) we must log any new names for any file or dir that is in the fsync
log.  This way we make sure not to lose files that are unlinked during
the same transaction.

2a) we must log any new names for any file or dir during rename
when the directory they are being removed from was logged.

2a is actually the more important variant.  Without the extra logging
a crash might unlink the old name without recreating the new one

3) after a crash, we must go through any directories with a link count
of zero and redo the rm -rf

mkdir f1/foo
normal commit
rm -rf f1/foo
fsync(f1)

The directory f1 was fully removed from the FS, but fsync was never
called on f1, only its parent dir.  After a crash the rm -rf must
be replayed.  This must be able to recurse down the entire
directory tree.  The inode link count fixup code takes care of the
ugly details.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/ctree.h')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4ddce91cf3f..2737facbd34 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -695,7 +695,12 @@ struct btrfs_fs_info {
 
 	u64 generation;
 	u64 last_trans_committed;
-	u64 last_trans_new_blockgroup;
+
+	/*
+	 * this is updated to the current trans every time a full commit
+	 * is required instead of the faster short fsync log commits
+	 */
+	u64 last_trans_log_full_commit;
 	u64 open_ioctl_trans;
 	unsigned long mount_opt;
 	u64 max_extent;
-- 
cgit v1.2.3


From 5a3f23d515a2ebf0c750db80579ca57b28cbce6d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 31 Mar 2009 13:27:11 -0400
Subject: Btrfs: add extra flushing for renames and truncates

Renames and truncates are both common ways to replace old data with new
data.  The filesystem can make an effort to make sure the new data is
on disk before actually replacing the old data.

This is especially important for rename, which many application use as
though it were atomic for both the data and the metadata involved.  The
current btrfs code will happily replace a file that is fully on disk
with one that was just created and still has pending IO.

If we crash after transaction commit but before the IO is done, we'll end
up replacing a good file with a zero length file.  The solution used
here is to create a list of inodes that need special ordering and force
them to disk before the commit is done.  This is similar to the
ext3 style data=ordering, except it is only done on selected files.

Btrfs is able to get away with this because it does not wait on commits
very often, even for fsync (which use a sub-commit).

For renames, we order the file when it wasn't already
on disk and when it is replacing an existing file.  Larger files
are sent to filemap_flush right away (before the transaction handle is
opened).

For truncates, we order if the file goes from non-zero size down to
zero size.  This is a little different, because at the time of the
truncate the file has no dirty bytes to order.  But, we flag the inode
so that it is added to the ordered list on close (via release method).  We
also immediately add it to the ordered list of the current transaction
so that we can try to flush down any writes the application sneaks in
before commit.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

(limited to 'fs/btrfs/ctree.h')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2737facbd34..f48905ee524 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -45,6 +45,13 @@ struct btrfs_ordered_sum;
 
 #define BTRFS_MAX_LEVEL 8
 
+/*
+ * files bigger than this get some pre-flushing when they are added
+ * to the ordered operations list.  That way we limit the total
+ * work done by the commit
+ */
+#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024)
+
 /* holds pointers to all of the tree roots */
 #define BTRFS_ROOT_TREE_OBJECTID 1ULL
 
@@ -727,6 +734,15 @@ struct btrfs_fs_info {
 	struct mutex volume_mutex;
 	struct mutex tree_reloc_mutex;
 
+	/*
+	 * this protects the ordered operations list only while we are
+	 * processing all of the entries on it.  This way we make
+	 * sure the commit code doesn't find the list temporarily empty
+	 * because another function happens to be doing non-waiting preflush
+	 * before jumping into the main commit.
+	 */
+	struct mutex ordered_operations_mutex;
+
 	struct list_head trans_list;
 	struct list_head hashers;
 	struct list_head dead_roots;
@@ -741,9 +757,28 @@ struct btrfs_fs_info {
 	 * ordered extents
 	 */
 	spinlock_t ordered_extent_lock;
+
+	/*
+	 * all of the data=ordered extents pending writeback
+	 * these can span multiple transactions and basically include
+	 * every dirty data page that isn't from nodatacow
+	 */
 	struct list_head ordered_extents;
+
+	/*
+	 * all of the inodes that have delalloc bytes.  It is possible for
+	 * this list to be empty even when there is still dirty data=ordered
+	 * extents waiting to finish IO.
+	 */
 	struct list_head delalloc_inodes;
 
+	/*
+	 * special rename and truncate targets that must be on disk before
+	 * we're allowed to commit.  This is basically the ext3 style
+	 * data=ordered list.
+	 */
+	struct list_head ordered_operations;
+
 	/*
 	 * there is a pool of worker threads for checksumming during writes
 	 * and a pool for checksumming after reads.  This is because readers
-- 
cgit v1.2.3