From b0c013e2928d3696ceb6401311dbc1d7fcccd6dd Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 11 Aug 2019 16:30:41 -0400 Subject: ext4: add a new ioctl EXT4_IOC_CLEAR_ES_CACHE The new ioctl EXT4_IOC_CLEAR_ES_CACHE will force an inode's extent status cache to be cleared out. This is intended for use for debugging. Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index bf660aa7a9e0..b22f24f1d365 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -649,6 +649,8 @@ enum { #define EXT4_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY #define EXT4_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT #define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY +/* ioctl codes 19--39 are reserved for fscrypt */ +#define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40) #define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR #define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR -- cgit v1.2.3 From 1ad3ea6e0a694b0486eb2cbe60378ad0fbf23642 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 11 Aug 2019 16:31:41 -0400 Subject: ext4: add a new ioctl EXT4_IOC_GETSTATE The new ioctl EXT4_IOC_GETSTATE returns some of the dynamic state of an ext4 inode for debugging purposes. Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b22f24f1d365..ee296797bcd2 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -651,6 +651,7 @@ enum { #define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY /* ioctl codes 19--39 are reserved for fscrypt */ #define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40) +#define EXT4_IOC_GETSTATE _IOW('f', 41, __u32) #define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR #define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR @@ -664,6 +665,16 @@ enum { #define EXT4_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ #define EXT4_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ +/* + * Flags returned by EXT4_IOC_GETSTATE + * + * We only expose to userspace a subset of the state flags in + * i_state_flags + */ +#define EXT4_STATE_FLAG_EXT_PRECACHED 0x00000001 +#define EXT4_STATE_FLAG_NEW 0x00000002 +#define EXT4_STATE_FLAG_NEWENTRY 0x00000004 +#define EXT4_STATE_FLAG_DA_ALLOC_CLOSE 0x00000008 #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* -- cgit v1.2.3 From bb5835edcdf8bf78bbe51cff13e332c439bc0567 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 11 Aug 2019 16:32:41 -0400 Subject: ext4: add new ioctl EXT4_IOC_GET_ES_CACHE For debugging reasons, it's useful to know the contents of the extent cache. Since the extent cache contains much of what is in the fiemap ioctl, use an fiemap-style interface to return this information. Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ee296797bcd2..e2d8ad27f4d1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -652,6 +652,7 @@ enum { /* ioctl codes 19--39 are reserved for fscrypt */ #define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40) #define EXT4_IOC_GETSTATE _IOW('f', 41, __u32) +#define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap) #define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR #define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR @@ -692,6 +693,12 @@ enum { #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION #endif +/* + * Returned by EXT4_IOC_GET_ES_CACHE as an additional possible flag. + * It indicates that the entry in extent status cache is for a hole. + */ +#define EXT4_FIEMAP_EXTENT_HOLE 0x08000000 + /* Max physical block we can address w/o extents */ #define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF @@ -3258,6 +3265,9 @@ extern int ext4_ext_check_inode(struct inode *inode); extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); +extern int ext4_get_es_cache(struct inode *inode, + struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); extern int ext4_ext_precache(struct inode *inode); extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); extern int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len); -- cgit v1.2.3 From cd2d99229dc96219547e6349841e1aad851c6acc Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 12 Aug 2019 13:44:49 -0400 Subject: ext4: drop legacy pre-1970 encoding workaround Originally, support for expanded timestamps had a bug in that pre-1970 times were erroneously encoded as being in the the 24th century. This was fixed in commit a4dad1ae24f8 ("ext4: Fix handling of extended tv_sec") which landed in 4.4. Starting with 4.4, pre-1970 timestamps were correctly encoded, but for backwards compatibility those incorrectly encoded timestamps were mapped back to the pre-1970 dates. Given that backwards compatibility workaround has been around for 4 years, and given that running e2fsck from e2fsprogs 1.43.2 and later will offer to fix these timestamps (which has been released for 3 years), it's past time to drop the legacy workaround from the kernel. Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e2d8ad27f4d1..17cc2dc13174 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -828,21 +828,8 @@ static inline __le32 ext4_encode_extra_time(struct timespec64 *time) static inline void ext4_decode_extra_time(struct timespec64 *time, __le32 extra) { - if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) { - -#if 1 - /* Handle legacy encoding of pre-1970 dates with epoch - * bits 1,1. (This backwards compatibility may be removed - * at the discretion of the ext4 developers.) - */ - u64 extra_bits = le32_to_cpu(extra) & EXT4_EPOCH_MASK; - if (extra_bits == 3 && ((time->tv_sec) & 0x80000000) != 0) - extra_bits = 0; - time->tv_sec += extra_bits << 32; -#else + if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) time->tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; -#endif - } time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; } -- cgit v1.2.3 From 7963e5ac901251c7a3b36fe7c987623a3f309393 Mon Sep 17 00:00:00 2001 From: ZhangXiaoxu Date: Thu, 22 Aug 2019 23:00:32 -0400 Subject: ext4: treat buffers with write errors as containing valid data I got some errors when I repair an ext4 volume which stacked by an iscsi target: Entry 'test60' in / (2) has deleted/unused inode 73750. Clear? It can be reproduced when the network not good enough. When I debug this I found ext4 will read entry buffer from disk and the buffer is marked with write_io_error. If the buffer is marked with write_io_error, it means it already wroten to journal, and not checked out to disk. IOW, the journal is newer than the data in disk. If this journal record 'delete test60', it means the 'test60' still on the disk metadata. In this case, if we read the buffer from disk successfully and create file continue, the new journal record will overwrite the journal which record 'delete test60', then the entry corruptioned. So, use the buffer rather than read from disk if the buffer is marked with write_io_error. Signed-off-by: Zhang Xiaoxu Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 17cc2dc13174..2348be3d66b7 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3344,6 +3344,19 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) extern const struct iomap_ops ext4_iomap_ops; +static inline int ext4_buffer_uptodate(struct buffer_head *bh) +{ + /* + * If the buffer has the write error flag, we have failed + * to write out data in the block. In this case, we don't + * have to read the block because we may read the old data + * successfully. + */ + if (!buffer_uptodate(bh) && buffer_write_io_error(bh)) + set_buffer_uptodate(bh); + return buffer_uptodate(bh); +} + #endif /* __KERNEL__ */ #define EFSBADCRC EBADMSG /* Bad CRC detected */ -- cgit v1.2.3 From 8fcc3a580651cceb94a9f48e1914491400d5146b Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Thu, 22 Aug 2019 23:22:14 -0400 Subject: ext4: rework reserved cluster accounting when invalidating pages The goal of this patch is to remove two references to the buffer delay bit in ext4_da_page_release_reservation() as part of a larger effort to remove all such references from ext4. These two references are principally used to reduce the reserved block/cluster count when pages are invalidated as a result of truncating, punching holes, or collapsing a block range in a file. The entire function is removed and replaced with code in ext4_es_remove_extent() that reduces the reserved count as a side effect of removing a block range from delayed and not unwritten extents in the extent status tree as is done when truncating, punching holes, or collapsing ranges. The code is written to minimize the number of searches descending from rb tree roots for scalability. Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2348be3d66b7..0664c43cc9dc 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -284,6 +284,9 @@ struct ext4_io_submit { ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) #define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \ ~((ext4_lblk_t) (s)->s_cluster_ratio - 1)) +/* Fill in the low bits to get the last block of the cluster */ +#define EXT4_LBLK_CFILL(sbi, lblk) ((lblk) | \ + ((ext4_lblk_t) (sbi)->s_cluster_ratio - 1)) /* Get the cluster offset */ #define EXT4_PBLK_COFF(s, pblk) ((pblk) & \ ((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) -- cgit v1.2.3 From 7727ae52975d4f4ef7ff69ed8e6e25f6a4168158 Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Wed, 28 Aug 2019 11:13:24 -0400 Subject: ext4: fix potential use after free after remounting with noblock_validity Remount process will release system zone which was allocated before if "noblock_validity" is specified. If we mount an ext4 file system to two mountpoints with default mount options, and then remount one of them with "noblock_validity", it may trigger a use after free problem when someone accessing the other one. # mount /dev/sda foo # mount /dev/sda bar User access mountpoint "foo" | Remount mountpoint "bar" | ext4_map_blocks() | ext4_remount() check_block_validity() | ext4_setup_system_zone() ext4_data_block_valid() | ext4_release_system_zone() | free system_blks rb nodes access system_blks rb nodes | trigger use after free | This problem can also be reproduced by one mountpint, At the same time, add_system_zone() can get called during remount as well so there can be racing ext4_data_block_valid() reading the rbtree at the same time. This patch add RCU to protect system zone from releasing or building when doing a remount which inverse current "noblock_validity" mount option. It assign the rbtree after the whole tree was complete and do actual freeing after rcu grace period, avoid any intermediate state. Reported-by: syzbot+1e470567330b7ad711d5@syzkaller.appspotmail.com Signed-off-by: zhangyi (F) Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/ext4.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0664c43cc9dc..c35bb8d734df 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -184,6 +184,14 @@ struct ext4_map_blocks { unsigned int m_flags; }; +/* + * Block validity checking, system zone rbtree. + */ +struct ext4_system_blocks { + struct rb_root root; + struct rcu_head rcu; +}; + /* * Flags for ext4_io_end->flags */ @@ -1431,7 +1439,7 @@ struct ext4_sb_info { int s_jquota_fmt; /* Format of quota to use */ #endif unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ - struct rb_root system_blks; + struct ext4_system_blocks __rcu *system_blks; #ifdef EXTENTS_STATS /* ext4 extents stats */ -- cgit v1.2.3