From 3e93cd671813e204c258f1e6c797959920cf7772 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Mar 2009 19:00:13 -0400 Subject: Take fs_struct handling to new file (fs/fs_struct.c) Pure code move; two new helper functions for nfsd and daemonize (unshare_fs_struct() and daemonize_fs_struct() resp.; for now - the same code as used to be in callers). unshare_fs_struct() exported (for nfsd, as copy_fs_struct()/exit_fs() used to be), copy_fs_struct() and exit_fs() don't need exports anymore. Signed-off-by: Al Viro --- include/linux/fs_struct.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index 18b467dbe27..298cef1c079 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -20,5 +20,7 @@ extern void set_fs_root(struct fs_struct *, struct path *); extern void set_fs_pwd(struct fs_struct *, struct path *); extern struct fs_struct *copy_fs_struct(struct fs_struct *); extern void put_fs_struct(struct fs_struct *); +extern void daemonize_fs_struct(void); +extern int unshare_fs_struct(void); #endif /* _LINUX_FS_STRUCT_H */ -- cgit v1.2.3 From 498052bba55ecaff58db6a1436b0e25bfd75a7ff Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 30 Mar 2009 07:20:30 -0400 Subject: New locking/refcounting for fs_struct * all changes of current->fs are done under task_lock and write_lock of old fs->lock * refcount is not atomic anymore (same protection) * its decrements are done when removing reference from current; at the same time we decide whether to free it. * put_fs_struct() is gone * new field - ->in_exec. Set by check_unsafe_exec() if we are trying to do execve() and only subthreads share fs_struct. Cleared when finishing exec (success and failure alike). Makes CLONE_FS fail with -EAGAIN if set. * check_unsafe_exec() may fail with -EAGAIN if another execve() from subthread is in progress. Signed-off-by: Al Viro --- include/linux/fs_struct.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index 298cef1c079..78a05bfcd8e 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -4,12 +4,10 @@ #include struct fs_struct { - atomic_t count; /* This usage count is used by check_unsafe_exec() for - * security checking purposes - therefore it may not be - * incremented, except by clone(CLONE_FS). - */ + int users; rwlock_t lock; int umask; + int in_exec; struct path root, pwd; }; @@ -19,7 +17,7 @@ extern void exit_fs(struct task_struct *); extern void set_fs_root(struct fs_struct *, struct path *); extern void set_fs_pwd(struct fs_struct *, struct path *); extern struct fs_struct *copy_fs_struct(struct fs_struct *); -extern void put_fs_struct(struct fs_struct *); +extern void free_fs_struct(struct fs_struct *); extern void daemonize_fs_struct(void); extern int unshare_fs_struct(void); -- cgit v1.2.3 From ce3b0f8d5c2203301fc87f3aaaed73e5819e2a48 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Mar 2009 19:08:22 -0400 Subject: New helper - current_umask() current->fs->umask is what most of fs_struct users are doing. Put that into a helper function. Signed-off-by: Al Viro --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 87e7bfc5ebd..3d7bd5447ca 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1741,6 +1741,8 @@ extern void drop_collected_mounts(struct vfsmount *); extern int vfs_statfs(struct dentry *, struct kstatfs *); +extern int current_umask(void); + /* /sys/fs */ extern struct kobject *fs_kobj; -- cgit v1.2.3 From 5ad4e53bd5406ee214ddc5a41f03f779b8b2d526 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Mar 2009 19:50:06 -0400 Subject: Get rid of indirect include of fs_struct.h Don't pull it in sched.h; very few files actually need it and those can include directly. sched.h itself only needs forward declaration of struct fs_struct; Signed-off-by: Al Viro --- include/linux/mnt_namespace.h | 2 ++ include/linux/nsproxy.h | 1 + include/linux/sched.h | 3 ++- 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index 830bbcd449d..3a059298cc1 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -22,6 +22,8 @@ struct proc_mounts { int event; }; +struct fs_struct; + extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, struct fs_struct *); extern void __put_mnt_ns(struct mnt_namespace *ns); diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index afad7dec1b3..7b370c7cfef 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -8,6 +8,7 @@ struct mnt_namespace; struct uts_namespace; struct ipc_namespace; struct pid_namespace; +struct fs_struct; /* * A structure to contain pointers to all per-process diff --git a/include/linux/sched.h b/include/linux/sched.h index 29df6374d2d..b4e065ea0de 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -68,7 +68,7 @@ struct sched_param { #include #include #include -#include +#include #include #include #include @@ -97,6 +97,7 @@ struct futex_pi_state; struct robust_list_head; struct bio; struct bts_tracer; +struct fs_struct; /* * List of flags we want to share for kernel threads, -- cgit v1.2.3 From 47e4491b40df73c3b117e3d80b31b5b512a4b19f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 1 Apr 2009 07:07:16 -0400 Subject: Cleanup after commit 585d3bc06f4ca57f975a5a1f698f65a45ea66225 fsync_bdev() export and a bunch of stubs for !CONFIG_BLOCK case had been left behind Signed-off-by: Al Viro --- include/linux/buffer_head.h | 12 ------------ include/linux/fs.h | 12 ++++++++++++ 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index f19fd9045ea..fc91665d39d 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -332,22 +332,10 @@ extern int __set_page_dirty_buffers(struct page *page); static inline void buffer_init(void) {} static inline int try_to_free_buffers(struct page *page) { return 1; } -static inline int sync_blockdev(struct block_device *bdev) { return 0; } static inline int inode_has_buffers(struct inode *inode) { return 0; } static inline void invalidate_inode_buffers(struct inode *inode) {} static inline int remove_inode_buffers(struct inode *inode) { return 1; } static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } -static inline void invalidate_bdev(struct block_device *bdev) {} - -static inline struct super_block *freeze_bdev(struct block_device *sb) -{ - return NULL; -} - -static inline int thaw_bdev(struct block_device *bdev, struct super_block *sb) -{ - return 0; -} #endif /* CONFIG_BLOCK */ #endif /* _LINUX_BUFFER_HEAD_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 3d7bd5447ca..67413472559 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1886,6 +1886,18 @@ extern int fsync_super(struct super_block *); extern int fsync_no_super(struct block_device *); #else static inline void bd_forget(struct inode *inode) {} +static inline int sync_blockdev(struct block_device *bdev) { return 0; } +static inline void invalidate_bdev(struct block_device *bdev) {} + +static inline struct super_block *freeze_bdev(struct block_device *sb) +{ + return NULL; +} + +static inline int thaw_bdev(struct block_device *bdev, struct super_block *sb) +{ + return 0; +} #endif extern const struct file_operations def_blk_fops; extern const struct file_operations def_chr_fops; -- cgit v1.2.3 From ced117c73edc917e96dea7cca98c91383f0792f7 Mon Sep 17 00:00:00 2001 From: Dmitri Vorobiev Date: Tue, 31 Mar 2009 00:41:20 +0300 Subject: Remove two unneeded exports and make two symbols static in fs/mpage.c Commit 29a814d2ee0e43c2980f33f91c1311ec06c0aa35 (vfs: add hooks for ext4's delayed allocation support) exported the following functions mpage_bio_submit() __mpage_writepage() for the benefit of ext4's delayed allocation support. Since commit a1d6cc563bfdf1bf2829d3e6ce4d8b774251796b (ext4: Rework the ext4_da_writepages() function), these functions are not used by the ext4 driver anymore. However, the now unnecessary exports still remain, and this patch removes those. Moreover, these two functions can become static again. The issue was spotted by namespacecheck. Signed-off-by: Dmitri Vorobiev Reviewed-by: Aneesh Kumar K.V Signed-off-by: Al Viro --- include/linux/mpage.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include') diff --git a/include/linux/mpage.h b/include/linux/mpage.h index 5c42821da2d..068a0c9946a 100644 --- a/include/linux/mpage.h +++ b/include/linux/mpage.h @@ -11,21 +11,11 @@ */ #ifdef CONFIG_BLOCK -struct mpage_data { - struct bio *bio; - sector_t last_block_in_bio; - get_block_t *get_block; - unsigned use_writepage; -}; - struct writeback_control; -struct bio *mpage_bio_submit(int rw, struct bio *bio); int mpage_readpages(struct address_space *mapping, struct list_head *pages, unsigned nr_pages, get_block_t get_block); int mpage_readpage(struct page *page, get_block_t get_block); -int __mpage_writepage(struct page *page, struct writeback_control *wbc, - void *data); int mpage_writepages(struct address_space *mapping, struct writeback_control *wbc, get_block_t get_block); int mpage_writepage(struct page *page, get_block_t *get_block, -- cgit v1.2.3 From 522b5cc7cec124e06629c0702ffab1307416aec7 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Tue, 31 Mar 2009 15:14:39 +1100 Subject: drm: fix missing inline function on 32-bit powerpc. The readq/writeq really need to be static inline on the arches which don't provide them. Reported-by: Benjamin Herrenschmidt Signed-off-by: Dave Airlie --- include/drm/drm_os_linux.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/drm/drm_os_linux.h b/include/drm/drm_os_linux.h index 013551d03c0..26641e95e0a 100644 --- a/include/drm/drm_os_linux.h +++ b/include/drm/drm_os_linux.h @@ -7,12 +7,12 @@ #include #ifndef readq -static u64 readq(void __iomem *reg) +static inline u64 readq(void __iomem *reg) { return ((u64) readl(reg)) | (((u64) readl(reg + 4UL)) << 32); } -static void writeq(u64 val, void __iomem *reg) +static inline void writeq(u64 val, void __iomem *reg) { writel(val & 0xffffffff, reg); writel(val >> 32, reg + 0x4UL); -- cgit v1.2.3 From 3c6fc3521acbee33637e7db803ac3cf3b0e2ff04 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Thu, 2 Apr 2009 11:52:24 +0200 Subject: DRM: drm_crtc_helper.h doesn't actually need i2c.h Remove an include that isn't actually needed to prevent needless rebuilds. Signed-off-by: Jean Delvare Signed-off-by: Dave Airlie --- include/drm/drm_crtc_helper.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/drm/drm_crtc_helper.h b/include/drm/drm_crtc_helper.h index c7d4b2e606a..95f1a849e68 100644 --- a/include/drm/drm_crtc_helper.h +++ b/include/drm/drm_crtc_helper.h @@ -33,7 +33,6 @@ #ifndef __DRM_CRTC_HELPER_H__ #define __DRM_CRTC_HELPER_H__ -#include #include #include #include -- cgit v1.2.3 From 797108d134a91afca9fa59c572336b279bc66afb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Wed, 1 Apr 2009 23:15:17 +0000 Subject: tcp: add helper for counter tweaking due mid-wq change MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We need full-scale adjustment to fix a TCP miscount in the next patch, so just move it into a helper and call for that from the other places. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- include/net/tcp.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index e54c76d7549..1b94b9bfe2d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -616,21 +616,6 @@ static inline int tcp_skb_mss(const struct sk_buff *skb) return skb_shinfo(skb)->gso_size; } -static inline void tcp_dec_pcount_approx_int(__u32 *count, const int decr) -{ - if (*count) { - *count -= decr; - if ((int)*count < 0) - *count = 0; - } -} - -static inline void tcp_dec_pcount_approx(__u32 *count, - const struct sk_buff *skb) -{ - tcp_dec_pcount_approx_int(count, tcp_skb_pcount(skb)); -} - /* Events passed to congestion control interface */ enum tcp_ca_event { CA_EVENT_TX_START, /* first transmit when no packets in flight */ -- cgit v1.2.3 From 7a1fb5d06d3936c0982e2cf8b53b046244a9aad6 Mon Sep 17 00:00:00 2001 From: Jesse Barnes Date: Fri, 27 Mar 2009 13:05:19 -0700 Subject: drm: remove unused "can_grow" parameter from drm_crtc_helper_initial_config Cleanup some leftovers from the X port. Signed-off-by: Jesse Barnes Signed-off-by: Dave Airlie --- include/drm/drm_crtc_helper.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/drm/drm_crtc_helper.h b/include/drm/drm_crtc_helper.h index 95f1a849e68..ec073d8288d 100644 --- a/include/drm/drm_crtc_helper.h +++ b/include/drm/drm_crtc_helper.h @@ -91,7 +91,7 @@ struct drm_connector_helper_funcs { extern int drm_helper_probe_single_connector_modes(struct drm_connector *connector, uint32_t maxX, uint32_t maxY); extern void drm_helper_disable_unused_functions(struct drm_device *dev); extern int drm_helper_hotplug_stage_two(struct drm_device *dev); -extern bool drm_helper_initial_config(struct drm_device *dev, bool can_grow); +extern bool drm_helper_initial_config(struct drm_device *dev); extern int drm_crtc_helper_set_config(struct drm_mode_set *set); extern bool drm_crtc_helper_set_mode(struct drm_crtc *crtc, struct drm_display_mode *mode, -- cgit v1.2.3 From ee3b4290aec03022cfb67c9adba9f1b3215245f0 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Thu, 2 Apr 2009 16:56:30 -0700 Subject: generic debug pagealloc: build fix This fixes a build failure with generic debug pagealloc: mm/debug-pagealloc.c: In function 'set_page_poison': mm/debug-pagealloc.c:8: error: 'struct page' has no member named 'debug_flags' mm/debug-pagealloc.c: In function 'clear_page_poison': mm/debug-pagealloc.c:13: error: 'struct page' has no member named 'debug_flags' mm/debug-pagealloc.c: In function 'page_poison': mm/debug-pagealloc.c:18: error: 'struct page' has no member named 'debug_flags' mm/debug-pagealloc.c: At top level: mm/debug-pagealloc.c:120: error: redefinition of 'kernel_map_pages' include/linux/mm.h:1278: error: previous definition of 'kernel_map_pages' was here mm/debug-pagealloc.c: In function 'kernel_map_pages': mm/debug-pagealloc.c:122: error: 'debug_pagealloc_enabled' undeclared (first use in this function) by fixing - debug_flags should be in struct page - define DEBUG_PAGEALLOC config option for all architectures Signed-off-by: Akinobu Mita Reported-by: Alexander Beregalov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ddadb4defe0..0e80e26ecf2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -95,6 +95,9 @@ struct page { void *virtual; /* Kernel virtual address (NULL if not kmapped, ie. highmem) */ #endif /* WANT_PAGE_VIRTUAL */ +#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS + unsigned long debug_flags; /* Use atomic bitops on this */ +#endif }; /* @@ -175,9 +178,6 @@ struct vm_area_struct { #ifdef CONFIG_NUMA struct mempolicy *vm_policy; /* NUMA policy for the VMA */ #endif -#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS - unsigned long debug_flags; /* Use atomic bitops on this */ -#endif }; struct core_thread { -- cgit v1.2.3 From 33e5d76979cf01e3834814fe0aea569d1d602c1a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 2 Apr 2009 16:56:32 -0700 Subject: nommu: fix a number of issues with the per-MM VMA patch Fix a number of issues with the per-MM VMA patch: (1) Make mmap_pages_allocated an atomic_long_t, just in case this is used on a NOMMU system with more than 2G pages. Makes no difference on a 32-bit system. (2) Report vma->vm_pgoff * PAGE_SIZE as a 64-bit value, not a 32-bit value, lest it overflow. (3) Move the allocation of the vm_area_struct slab back for fork.c. (4) Use KMEM_CACHE() for both vm_area_struct and vm_region slabs. (5) Use BUG_ON() rather than if () BUG(). (6) Make the default validate_nommu_regions() a static inline rather than a #define. (7) Make free_page_series()'s objection to pages with a refcount != 1 more informative. (8) Adjust the __put_nommu_region() banner comment to indicate that the semaphore must be held for writing. (9) Limit the number of warnings about munmaps of non-mmapped regions. Reported-by: Andrew Morton Signed-off-by: David Howells Cc: Greg Ungerer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index aeabe953ba4..bff1f0d475c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1079,7 +1079,7 @@ static inline void setup_per_cpu_pageset(void) {} #endif /* nommu.c */ -extern atomic_t mmap_pages_allocated; +extern atomic_long_t mmap_pages_allocated; /* prio_tree.c */ void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old); -- cgit v1.2.3 From 8e2c3795c78d5c4e2e1f14ce751e9d08decbe9d3 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 2 Apr 2009 16:56:44 -0700 Subject: add fiemap.h to header-y Include fiemap.h in header-y; it defines the interface for the FS_IOC_FIEMAP file mapping ioctl. Signed-off-by: Eric Sandeen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/Kbuild | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/Kbuild b/include/linux/Kbuild index a67b6227d27..ca9b9b9bd33 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -67,6 +67,7 @@ header-y += falloc.h header-y += fd.h header-y += fdreg.h header-y += fib_rules.h +header-y += fiemap.h header-y += firewire-cdev.h header-y += firewire-constants.h header-y += fuse.h -- cgit v1.2.3 From 9a896c9a48ac6704c0ce8ee081b836644d0afe40 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Thu, 2 Apr 2009 16:56:45 -0700 Subject: mm: define a UNIQUE value for AS_UNEVICTABLE flag A new "address_space flag"--AS_MM_ALL_LOCKS--was defined to use the next available AS flag while the Unevictable LRU was under development. The Unevictable LRU was using the same flag and "no one" noticed. Current mainline, since 2.6.28, has same value for two symbolic flag names. So, define a unique flag value for AS_UNEVICTABLE--up close to the other flags, [at the cost of an additional #ifdef] so we'll notice next time. Note that #ifdef is not actually required, if we don't mind having the unused flag value defined. Replace #defines with an enum. Signed-off-by: Lee Schermerhorn Cc: [2.6.28.x, 2.6.29.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 01ca0856caf..076a7dc67c2 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -18,9 +18,14 @@ * Bits in mapping->flags. The lower __GFP_BITS_SHIFT bits are the page * allocation mode flags. */ -#define AS_EIO (__GFP_BITS_SHIFT + 0) /* IO error on async write */ -#define AS_ENOSPC (__GFP_BITS_SHIFT + 1) /* ENOSPC on async write */ -#define AS_MM_ALL_LOCKS (__GFP_BITS_SHIFT + 2) /* under mm_take_all_locks() */ +enum mapping_flags { + AS_EIO = __GFP_BITS_SHIFT + 0, /* IO error on async write */ + AS_ENOSPC = __GFP_BITS_SHIFT + 1, /* ENOSPC on async write */ + AS_MM_ALL_LOCKS = __GFP_BITS_SHIFT + 2, /* under mm_take_all_locks() */ +#ifdef CONFIG_UNEVICTABLE_LRU + AS_UNEVICTABLE = __GFP_BITS_SHIFT + 3, /* e.g., ramdisk, SHM_LOCK */ +#endif +}; static inline void mapping_set_error(struct address_space *mapping, int error) { @@ -33,7 +38,6 @@ static inline void mapping_set_error(struct address_space *mapping, int error) } #ifdef CONFIG_UNEVICTABLE_LRU -#define AS_UNEVICTABLE (__GFP_BITS_SHIFT + 2) /* e.g., ramdisk, SHM_LOCK */ static inline void mapping_set_unevictable(struct address_space *mapping) { -- cgit v1.2.3 From bf6aede712334d7338d5c47a5ee5ba3883c82a61 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Thu, 2 Apr 2009 16:56:54 -0700 Subject: workqueue: add to_delayed_work() helper function It is a fairly common operation to have a pointer to a work and to need a pointer to the delayed work it is contained in. In particular, all delayed works which want to rearm themselves will have to do that. So it would seem fair to offer a helper function for this operation. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Jean Delvare Acked-by: Ingo Molnar Cc: "David S. Miller" Cc: Herbert Xu Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: Greg KH Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/workqueue.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 3cd51e579ab..13e1adf55c4 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -41,6 +41,11 @@ struct delayed_work { struct timer_list timer; }; +static inline struct delayed_work *to_delayed_work(struct work_struct *work) +{ + return container_of(work, struct delayed_work, work); +} + struct execute_work { struct work_struct work; }; -- cgit v1.2.3 From 06c421ee0d5af95c8c6749ca0ba620cd5010707f Mon Sep 17 00:00:00 2001 From: Kevin Hilman Date: Thu, 2 Apr 2009 16:56:56 -0700 Subject: memory_accessor: new interface for reading/writing persistent memory Add an interface by which other kernel code can read/write persistent memory such as I2C or SPI EEPROMs, or devices which provide NVRAM. Use cases include storage of board-specific configuration data like Ethernet addresses and sensor calibrations. Original idea, review and improvement suggestions by David Brownell. Acked-by: David Brownell Signed-off-by: Kevin Hilman Cc: Jean Delvare Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/memory.h b/include/linux/memory.h index 3fdc10806d3..42767d1a62e 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -99,4 +99,15 @@ enum mem_add_context { BOOT, HOTPLUG }; #define hotplug_memory_notifier(fn, pri) do { } while (0) #endif +/* + * 'struct memory_accessor' is a generic interface to provide + * in-kernel access to persistent memory such as i2c or SPI EEPROMs + */ +struct memory_accessor { + ssize_t (*read)(struct memory_accessor *, char *buf, off_t offset, + size_t count); + ssize_t (*write)(struct memory_accessor *, const char *buf, + off_t offset, size_t count); +}; + #endif /* _LINUX_MEMORY_H_ */ -- cgit v1.2.3 From 7274ec8bd71e99018642f474528ea7de4bb3ae25 Mon Sep 17 00:00:00 2001 From: Kevin Hilman Date: Thu, 2 Apr 2009 16:56:57 -0700 Subject: memory_accessor: implement the new memory_accessor interface for I2C EEPROM In the case of at24, the platform code registers a 'setup' callback with the at24_platform_data. When the at24 driver detects an EEPROM, it fills out the read and write functions of the memory_accessor and calls the setup callback passing the memory_accessor struct. The platform code can then use the read/write functions in the memory_accessor struct for reading and writing the EEPROM. Signed-off-by: Kevin Hilman Cc: David Brownell Cc: Jean Delvare Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/i2c/at24.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/i2c/at24.h b/include/linux/i2c/at24.h index f6edd522a92..8ace93024d6 100644 --- a/include/linux/i2c/at24.h +++ b/include/linux/i2c/at24.h @@ -2,6 +2,7 @@ #define _LINUX_AT24_H #include +#include /* * As seen through Linux I2C, differences between the most common types of I2C @@ -23,6 +24,9 @@ struct at24_platform_data { #define AT24_FLAG_READONLY 0x40 /* sysfs-entry will be read-only */ #define AT24_FLAG_IRUGO 0x20 /* sysfs-entry will be world-readable */ #define AT24_FLAG_TAKE8ADDR 0x10 /* take always 8 addresses (24c00) */ + + void (*setup)(struct memory_accessor *, void *context); + void *context; }; #endif /* _LINUX_AT24_H */ -- cgit v1.2.3 From 14dd1ff0f9e75dd4ae2f1ff8e48becb76d14f4ab Mon Sep 17 00:00:00 2001 From: David Brownell Date: Thu, 2 Apr 2009 16:56:58 -0700 Subject: memory_accessor: implement the new memory_accessor interfaces for SPI EEPROMs - Define new setup() hook to export the accessor - Implement accessor methods Moves some error checking out of the sysfs interface code into the layer below it, which is now shared by both sysfs and memory access code. Signed-off-by: David Brownell Signed-off-by: Kevin Hilman Cc: Jean Delvare Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/spi/eeprom.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/spi/eeprom.h b/include/linux/spi/eeprom.h index 1085212c446..306e7b1c69e 100644 --- a/include/linux/spi/eeprom.h +++ b/include/linux/spi/eeprom.h @@ -1,6 +1,8 @@ #ifndef __LINUX_SPI_EEPROM_H #define __LINUX_SPI_EEPROM_H +#include + /* * Put one of these structures in platform_data for SPI EEPROMS handled * by the "at25" driver. On SPI, most EEPROMS understand the same core @@ -17,6 +19,10 @@ struct spi_eeprom { #define EE_ADDR2 0x0002 /* 16 bit addrs */ #define EE_ADDR3 0x0004 /* 24 bit addrs */ #define EE_READONLY 0x0008 /* disallow writes */ + + /* for exporting this chip's data to other kernel code */ + void (*setup)(struct memory_accessor *mem, void *context); + void *context; }; #endif /* __LINUX_SPI_EEPROM_H */ -- cgit v1.2.3 From 6f2c55b843836d26528c56a0968689accaedbc67 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 2 Apr 2009 16:56:59 -0700 Subject: Simplify copy_thread() First argument unused since 2.3.11. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Alexey Dobriyan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 481fad3a9b4..9186f8c5d5f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1975,7 +1975,8 @@ extern void mm_release(struct task_struct *, struct mm_struct *); /* Allocate a new mm structure and copy contents from tsk->mm */ extern struct mm_struct *dup_mm(struct task_struct *tsk); -extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); +extern int copy_thread(unsigned long, unsigned long, unsigned long, + struct task_struct *, struct pt_regs *); extern void flush_thread(void); extern void exit_thread(void); -- cgit v1.2.3 From 96615841e170f0108832e64a90d51b469573a472 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 2 Apr 2009 16:57:01 -0700 Subject: rtc-v3020: add ability to access v3020 chip with GPIOs The v3020 RTC can be connected to GPIOs as well as to memory-like interface. Add ability to use GPIO bit-bang for v3020 read-write access. [akpm@linux-foundation.org: fix off-by-one in error path] Signed-off-by: Mike Rapoport Acked-by: Alessandro Zummo Cc: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rtc-v3020.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/rtc-v3020.h b/include/linux/rtc-v3020.h index bf74e63c98f..8ba646e610d 100644 --- a/include/linux/rtc-v3020.h +++ b/include/linux/rtc-v3020.h @@ -14,6 +14,12 @@ * is used depends on the board. */ struct v3020_platform_data { int leftshift; /* (1<<(leftshift)) & readl() */ + + int use_gpio:1; + unsigned int gpio_cs; + unsigned int gpio_wr; + unsigned int gpio_rd; + unsigned int gpio_io; }; #define V3020_STATUS_0 0x00 -- cgit v1.2.3 From 926b663ce8215ba448960e1ff6e58b67a2c3b99b Mon Sep 17 00:00:00 2001 From: Daniel Silverstone Date: Thu, 2 Apr 2009 16:57:05 -0700 Subject: gpiolib: allow GPIOs to be named Allow GPIOs in GPIOLIB chips to be named. This name is then used when the GPIO is exported to sysfs, although it could be used elsewhere if deemed useful. Signed-off-by: Daniel Silverstone Cc: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/gpio.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/asm-generic/gpio.h b/include/asm-generic/gpio.h index 81797ec9ab2..d6c379dc64f 100644 --- a/include/asm-generic/gpio.h +++ b/include/asm-generic/gpio.h @@ -55,6 +55,10 @@ struct module; * handled is (base + ngpio - 1). * @can_sleep: flag must be set iff get()/set() methods sleep, as they * must while accessing GPIO expander chips over I2C or SPI + * @names: if set, must be an array of strings to use as alternative + * names for the GPIOs in this chip. Any entry in the array + * may be NULL if there is no alias for the GPIO, however the + * array must be @ngpio entries long. * * A gpio_chip can help platforms abstract various sources of GPIOs so * they can all be accessed through a common programing interface. @@ -92,6 +96,7 @@ struct gpio_chip { struct gpio_chip *chip); int base; u16 ngpio; + char **names; unsigned can_sleep:1; unsigned exported:1; }; -- cgit v1.2.3 From bfb9bcdbda9a61bca469bf899a589918c60c4c18 Mon Sep 17 00:00:00 2001 From: Michael Buesch Date: Thu, 2 Apr 2009 16:57:07 -0700 Subject: spi-gpio: allow operation without CS signal Change spi-gpio so that it is possible to drive SPI communications over GPIO without the need for a chipselect signal. This is useful in very small setups where there's only one slave device on the bus. This patch does not affect existing setups. I use this for a tiny communication channel between an embedded device and a microcontroller. There are not enough GPIOs available for chipselect and it's not needed anyway in this case. Signed-off-by: Michael Buesch Cc: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/spi/spi_gpio.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/spi/spi_gpio.h b/include/linux/spi/spi_gpio.h index 0f01a0f1f40..ca6782ee4b9 100644 --- a/include/linux/spi/spi_gpio.h +++ b/include/linux/spi/spi_gpio.h @@ -25,10 +25,16 @@ * ... * }; * + * If chipselect is not used (there's only one device on the bus), assign + * SPI_GPIO_NO_CHIPSELECT to the controller_data: + * .controller_data = (void *) SPI_GPIO_NO_CHIPSELECT; + * * If the bitbanged bus is later switched to a "native" controller, * that platform_device and controller_data should be removed. */ +#define SPI_GPIO_NO_CHIPSELECT ((unsigned long)-1l) + /** * struct spi_gpio_platform_data - parameter for bitbanged SPI master * @sck: number of the GPIO used for clock output -- cgit v1.2.3 From 039fd8ce6258e01ec29f1637f9bf1868dd877c55 Mon Sep 17 00:00:00 2001 From: Cyrus Massoumi Date: Thu, 2 Apr 2009 16:57:12 -0700 Subject: ext3: remove the BKL in ext3/ioctl.c Reformat ext3/ioctl.c to make it look more like ext4/ioctl.c and remove the BKL around ext3_ioctl(). Signed-off-by: Cyrus Massoumi Cc: Acked-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ext3_fs.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index dd495b8c309..e263acaa405 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -893,9 +893,8 @@ extern int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); /* ioctl.c */ -extern int ext3_ioctl (struct inode *, struct file *, unsigned int, - unsigned long); -extern long ext3_compat_ioctl (struct file *, unsigned int, unsigned long); +extern long ext3_ioctl(struct file *, unsigned int, unsigned long); +extern long ext3_compat_ioctl(struct file *, unsigned int, unsigned long); /* namei.c */ extern int ext3_orphan_add(handle_t *, struct inode *); -- cgit v1.2.3 From d20a390a0ee2bf2f692c539c6ce1c829e1080bb5 Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Thu, 2 Apr 2009 16:57:22 -0700 Subject: cgroups: fix cgroup.h comments Fix the style of some multi-line comments in cgroup.h to match Documentation/CodingStyle Signed-off-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 71 +++++++++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 499900d0cee..bb8feb9fecc 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -47,14 +47,18 @@ enum cgroup_subsys_id { /* Per-subsystem/per-cgroup state maintained by the system. */ struct cgroup_subsys_state { - /* The cgroup that this subsystem is attached to. Useful + /* + * The cgroup that this subsystem is attached to. Useful * for subsystems that want to know about the cgroup - * hierarchy structure */ + * hierarchy structure + */ struct cgroup *cgroup; - /* State maintained by the cgroup system to allow subsystems + /* + * State maintained by the cgroup system to allow subsystems * to be "busy". Should be accessed via css_get(), - * css_tryget() and and css_put(). */ + * css_tryget() and and css_put(). + */ atomic_t refcnt; @@ -120,8 +124,10 @@ static inline void css_put(struct cgroup_subsys_state *css) enum { /* Control Group is dead */ CGRP_REMOVED, - /* Control Group has previously had a child cgroup or a task, - * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) */ + /* + * Control Group has previously had a child cgroup or a task, + * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) + */ CGRP_RELEASABLE, /* Control Group requires release notifications to userspace */ CGRP_NOTIFY_ON_RELEASE, @@ -130,9 +136,10 @@ enum { struct cgroup { unsigned long flags; /* "unsigned long" so bitops work */ - /* count users of this cgroup. >0 means busy, but doesn't - * necessarily indicate the number of tasks in the - * cgroup */ + /* + * count users of this cgroup. >0 means busy, but doesn't + * necessarily indicate the number of tasks in the cgroup + */ atomic_t count; /* @@ -142,7 +149,7 @@ struct cgroup { struct list_head sibling; /* my parent's children */ struct list_head children; /* my children */ - struct cgroup *parent; /* my parent */ + struct cgroup *parent; /* my parent */ struct dentry *dentry; /* cgroup fs entry, RCU protected */ /* Private pointers for each registered subsystem */ @@ -177,11 +184,12 @@ struct cgroup { struct rcu_head rcu_head; }; -/* A css_set is a structure holding pointers to a set of +/* + * A css_set is a structure holding pointers to a set of * cgroup_subsys_state objects. This saves space in the task struct * object and speeds up fork()/exit(), since a single inc/dec and a - * list_add()/del() can bump the reference count on the entire - * cgroup set for a task. + * list_add()/del() can bump the reference count on the entire cgroup + * set for a task. */ struct css_set { @@ -226,13 +234,8 @@ struct cgroup_map_cb { void *state; }; -/* struct cftype: - * - * The files in the cgroup filesystem mostly have a very simple read/write - * handling, some common function will take care of it. Nevertheless some cases - * (read tasks) are special and therefore I define this structure for every - * kind of file. - * +/* + * struct cftype: handler definitions for cgroup control files * * When reading/writing to a file: * - the cgroup to use is file->f_dentry->d_parent->d_fsdata @@ -241,8 +244,10 @@ struct cgroup_map_cb { #define MAX_CFTYPE_NAME 64 struct cftype { - /* By convention, the name should begin with the name of the - * subsystem, followed by a period */ + /* + * By convention, the name should begin with the name of the + * subsystem, followed by a period + */ char name[MAX_CFTYPE_NAME]; int private; @@ -321,13 +326,17 @@ struct cgroup_scanner { struct ptr_heap *heap; }; -/* Add a new file to the given cgroup directory. Should only be - * called by subsystems from within a populate() method */ +/* + * Add a new file to the given cgroup directory. Should only be + * called by subsystems from within a populate() method + */ int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, const struct cftype *cft); -/* Add a set of new files to the given cgroup directory. Should - * only be called by subsystems from within a populate() method */ +/* + * Add a set of new files to the given cgroup directory. Should + * only be called by subsystems from within a populate() method + */ int cgroup_add_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, const struct cftype cft[], @@ -419,7 +428,8 @@ struct cgroup_iter { struct list_head *task; }; -/* To iterate across the tasks in a cgroup: +/* + * To iterate across the tasks in a cgroup: * * 1) call cgroup_iter_start to intialize an iterator * @@ -428,9 +438,10 @@ struct cgroup_iter { * * 3) call cgroup_iter_end() to destroy the iterator. * - * Or, call cgroup_scan_tasks() to iterate through every task in a cpuset. - * - cgroup_scan_tasks() holds the css_set_lock when calling the test_task() - * callback, but not while calling the process_task() callback. + * Or, call cgroup_scan_tasks() to iterate through every task in a + * cgroup - cgroup_scan_tasks() holds the css_set_lock when calling + * the test_task() callback, but not while calling the process_task() + * callback. */ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it); struct task_struct *cgroup_iter_next(struct cgroup *cgrp, -- cgit v1.2.3 From 313e924c0852943e67335fad9d2608701f0dfe8e Mon Sep 17 00:00:00 2001 From: Grzegorz Nosek Date: Thu, 2 Apr 2009 16:57:23 -0700 Subject: cgroups: relax ns_can_attach checks to allow attaching to grandchild cgroups The ns_proxy cgroup allows moving processes to child cgroups only one level deep at a time. This commit relaxes this restriction and makes it possible to attach tasks directly to grandchild cgroups, e.g.: ($pid is in the root cgroup) echo $pid > /cgroup/CG1/CG2/tasks Previously this operation would fail with -EPERM and would have to be performed as two steps: echo $pid > /cgroup/CG1/tasks echo $pid > /cgroup/CG1/CG2/tasks Also, the target cgroup no longer needs to be empty to move a task there. Signed-off-by: Grzegorz Nosek Acked-by: Serge Hallyn Reviewed-by: Li Zefan Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index bb8feb9fecc..788c4964c14 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -348,8 +348,8 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen); int cgroup_task_count(const struct cgroup *cgrp); -/* Return true if the cgroup is a descendant of the current cgroup */ -int cgroup_is_descendant(const struct cgroup *cgrp); +/* Return true if cgrp is a descendant of the task's cgroup */ +int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); /* Control Group subsystem type. See Documentation/cgroups.txt for details */ -- cgit v1.2.3 From 38460b48d06440de46b34cb778bd6c4855030754 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 2 Apr 2009 16:57:25 -0700 Subject: cgroup: CSS ID support Patch for Per-CSS(Cgroup Subsys State) ID and private hierarchy code. This patch attaches unique ID to each css and provides following. - css_lookup(subsys, id) returns pointer to struct cgroup_subysys_state of id. - css_get_next(subsys, id, rootid, depth, foundid) returns the next css under "root" by scanning When cgroup_subsys->use_id is set, an id for css is maintained. The cgroup framework only parepares - css_id of root css for subsys - id is automatically attached at creation of css. - id is *not* freed automatically. Because the cgroup framework don't know lifetime of cgroup_subsys_state. free_css_id() function is provided. This must be called by subsys. There are several reasons to develop this. - Saving space .... For example, memcg's swap_cgroup is array of pointers to cgroup. But it is not necessary to be very fast. By replacing pointers(8bytes per ent) to ID (2byes per ent), we can reduce much amount of memory usage. - Scanning without lock. CSS_ID provides "scan id under this ROOT" function. By this, scanning css under root can be written without locks. ex) do { rcu_read_lock(); next = cgroup_get_next(subsys, id, root, &found); /* check sanity of next here */ css_tryget(); rcu_read_unlock(); id = found + 1 } while(...) Characteristics: - Each css has unique ID under subsys. - Lifetime of ID is controlled by subsys. - css ID contains "ID" and "Depth in hierarchy" and stack of hierarchy - Allowed ID is 1-65535, ID 0 is UNUSED ID. Design Choices: - scan-by-ID v.s. scan-by-tree-walk. As /proc's pid scan does, scan-by-ID is robust when scanning is done by following kind of routine. scan -> rest a while(release a lock) -> conitunue from interrupted memcg's hierarchical reclaim does this. - When subsys->use_id is set, # of css in the system is limited to 65535. [bharata@linux.vnet.ibm.com: remove rcu_read_lock() from css_get_next()] Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Paul Menage Cc: Li Zefan Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Bharata B Rao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/idr.h | 1 + 2 files changed, 51 insertions(+) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 788c4964c14..9a23bb09820 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -15,6 +15,7 @@ #include #include #include +#include #ifdef CONFIG_CGROUPS @@ -22,6 +23,7 @@ struct cgroupfs_root; struct cgroup_subsys; struct inode; struct cgroup; +struct css_id; extern int cgroup_init_early(void); extern int cgroup_init(void); @@ -63,6 +65,8 @@ struct cgroup_subsys_state { atomic_t refcnt; unsigned long flags; + /* ID for this css, if possible */ + struct css_id *id; }; /* bits in struct cgroup_subsys_state flags field */ @@ -373,6 +377,11 @@ struct cgroup_subsys { int active; int disabled; int early_init; + /* + * True if this subsys uses ID. ID is not available before cgroup_init() + * (not available in early_init time.) + */ + bool use_id; #define MAX_CGROUP_TYPE_NAMELEN 32 const char *name; @@ -395,6 +404,9 @@ struct cgroup_subsys { */ struct cgroupfs_root *root; struct list_head sibling; + /* used when use_id == true */ + struct idr idr; + spinlock_t id_lock; }; #define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys; @@ -450,6 +462,44 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it); int cgroup_scan_tasks(struct cgroup_scanner *scan); int cgroup_attach_task(struct cgroup *, struct task_struct *); +/* + * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works + * if cgroup_subsys.use_id == true. It can be used for looking up and scanning. + * CSS ID is assigned at cgroup allocation (create) automatically + * and removed when subsys calls free_css_id() function. This is because + * the lifetime of cgroup_subsys_state is subsys's matter. + * + * Looking up and scanning function should be called under rcu_read_lock(). + * Taking cgroup_mutex()/hierarchy_mutex() is not necessary for following calls. + * But the css returned by this routine can be "not populated yet" or "being + * destroyed". The caller should check css and cgroup's status. + */ + +/* + * Typically Called at ->destroy(), or somewhere the subsys frees + * cgroup_subsys_state. + */ +void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css); + +/* Find a cgroup_subsys_state which has given ID */ + +struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id); + +/* + * Get a cgroup whose id is greater than or equal to id under tree of root. + * Returning a cgroup_subsys_state or NULL. + */ +struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id, + struct cgroup_subsys_state *root, int *foundid); + +/* Returns true if root is ancestor of cg */ +bool css_is_ancestor(struct cgroup_subsys_state *cg, + struct cgroup_subsys_state *root); + +/* Get id and depth of css */ +unsigned short css_id(struct cgroup_subsys_state *css); +unsigned short css_depth(struct cgroup_subsys_state *css); + #else /* !CONFIG_CGROUPS */ static inline int cgroup_init_early(void) { return 0; } diff --git a/include/linux/idr.h b/include/linux/idr.h index dd846df8cd3..e968db71e33 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -106,6 +106,7 @@ int idr_get_new(struct idr *idp, void *ptr, int *id); int idr_get_new_above(struct idr *idp, void *ptr, int starting_id, int *id); int idr_for_each(struct idr *idp, int (*fn)(int id, void *p, void *data), void *data); +void *idr_get_next(struct idr *idp, int *nextid); void *idr_replace(struct idr *idp, void *ptr, int id); void idr_remove(struct idr *idp, int id); void idr_remove_all(struct idr *idp); -- cgit v1.2.3 From ec64f51545fffbc4cb968f0cea56341a4b07e85a Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 2 Apr 2009 16:57:26 -0700 Subject: cgroup: fix frequent -EBUSY at rmdir In following situation, with memory subsystem, /groupA use_hierarchy==1 /01 some tasks /02 some tasks /03 some tasks /04 empty When tasks under 01/02/03 hit limit on /groupA, hierarchical reclaim is triggered and the kernel walks tree under groupA. In this case, rmdir /groupA/04 fails with -EBUSY frequently because of temporal refcnt from the kernel. In general. cgroup can be rmdir'd if there are no children groups and no tasks. Frequent fails of rmdir() is not useful to users. (And the reason for -EBUSY is unknown to users.....in most cases) This patch tries to modify above behavior, by - retries if css_refcnt is got by someone. - add "return value" to pre_destroy() and allows subsystem to say "we're really busy!" Signed-off-by: KAMEZAWA Hiroyuki Cc: Paul Menage Cc: Li Zefan Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 9a23bb09820..7d824b80b3d 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -135,6 +135,10 @@ enum { CGRP_RELEASABLE, /* Control Group requires release notifications to userspace */ CGRP_NOTIFY_ON_RELEASE, + /* + * A thread in rmdir() is wating for this cgroup. + */ + CGRP_WAIT_ON_RMDIR, }; struct cgroup { @@ -360,7 +364,7 @@ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); struct cgroup_subsys { struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss, struct cgroup *cgrp); - void (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); + int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, struct task_struct *tsk); -- cgit v1.2.3 From 099fca3225b39f7a3ed853036038054172b55581 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 2 Apr 2009 16:57:29 -0700 Subject: cgroups: show correct file mode We have some read-only files and write-only files, but currently they are all set to 0644, which is counter-intuitive and cause trouble for some cgroup tools like libcgroup. This patch adds 'mode' to struct cftype to allow cgroup subsys to set it's own files' file mode, and for the most cases cft->mode can be default to 0 and cgroup will figure out proper mode. Acked-by: Paul Menage Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 7d824b80b3d..b2816fba530 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -258,6 +258,11 @@ struct cftype { */ char name[MAX_CFTYPE_NAME]; int private; + /* + * If not 0, file mode is set to this value, otherwise it will + * be figured out automatically + */ + mode_t mode; /* * If non-zero, defines the maximum length of string that can -- cgit v1.2.3 From 0b7f569e45bb6be142d87017030669a6a7d327a1 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 2 Apr 2009 16:57:38 -0700 Subject: memcg: fix OOM killer under memcg This patch tries to fix OOM Killer problems caused by hierarchy. Now, memcg itself has OOM KILL function (in oom_kill.c) and tries to kill a task in memcg. But, when hierarchy is used, it's broken and correct task cannot be killed. For example, in following cgroup /groupA/ hierarchy=1, limit=1G, 01 nolimit 02 nolimit All tasks' memory usage under /groupA, /groupA/01, groupA/02 is limited to groupA's 1Gbytes but OOM Killer just kills tasks in groupA. This patch provides makes the bad process be selected from all tasks under hierarchy. BTW, currently, oom_jiffies is updated against groupA in above case. oom_jiffies of tree should be updated. To see how oom_jiffies is used, please check mem_cgroup_oom_called() callers. [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: const fix] Signed-off-by: KAMEZAWA Hiroyuki Cc: Paul Menage Cc: Li Zefan Cc: Balbir Singh Cc: Daisuke Nishimura Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b2816fba530..43763bd772b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -503,7 +503,7 @@ struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id, /* Returns true if root is ancestor of cg */ bool css_is_ancestor(struct cgroup_subsys_state *cg, - struct cgroup_subsys_state *root); + const struct cgroup_subsys_state *root); /* Get id and depth of css */ unsigned short css_id(struct cgroup_subsys_state *css); -- cgit v1.2.3 From e222432bfa7dcf6ec008622a978c9f284ed5e3a9 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Thu, 2 Apr 2009 16:57:39 -0700 Subject: memcg: show memcg information during OOM Add RSS and swap to OOM output from memcg Display memcg values like failcnt, usage and limit when an OOM occurs due to memcg. Thanks to Johannes Weiner, Li Zefan, David Rientjes, Kamezawa Hiroyuki, Daisuke Nishimura and KOSAKI Motohiro for review. Sample output ------------- Task in /a/x killed as a result of limit of /a memory: usage 1048576kB, limit 1048576kB, failcnt 4183 memory+swap: usage 1400964kB, limit 9007199254740991kB, failcnt 0 [akpm@linux-foundation.org: compilation fix] [akpm@linux-foundation.org: fix kerneldoc and whitespace] [akpm@linux-foundation.org: add printk facility level] Signed-off-by: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Li Zefan Cc: Paul Menage Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 326f45c8653..7aba9f26462 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -104,6 +104,8 @@ struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, struct zone *zone); struct zone_reclaim_stat* mem_cgroup_get_reclaim_stat_from_page(struct page *page); +extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, + struct task_struct *p); #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP extern int do_swap_account; @@ -270,6 +272,11 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) return NULL; } +static inline void +mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) +{ +} + #endif /* CONFIG_CGROUP_MEM_CONT */ #endif /* _LINUX_MEMCONTROL_H */ -- cgit v1.2.3 From c137b5ece4b111e46981aae7da77315b9909809f Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 2 Apr 2009 16:57:40 -0700 Subject: memcg: remove mem_cgroup_calc_mapped_ratio() Currently, mem_cgroup_calc_mapped_ratio() is unused at all. it can be removed and KAMEZAWA-san suggested it. Signed-off-by: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7aba9f26462..4562d09ab96 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -88,7 +88,6 @@ extern void mem_cgroup_end_migration(struct mem_cgroup *mem, /* * For memory reclaim. */ -extern int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem); extern long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem); extern int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem); @@ -211,11 +210,6 @@ static inline void mem_cgroup_end_migration(struct mem_cgroup *mem, { } -static inline int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) -{ - return 0; -} - static inline int mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem) { return 0; -- cgit v1.2.3 From 3918b96e03b2b8dd05889320623f6870e81d35ec Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 2 Apr 2009 16:57:41 -0700 Subject: memcg: remove mem_cgroup_reclaim_imbalance() remnants commit 4f98a2fee8acdb4ac84545df98cccecfd130f8db (vmscan: split LRU lists into anon & file sets) removed mem_cgroup_reclaim_imbalance(), but there are some leftovers in memcontrol.h. Signed-off-by: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 4562d09ab96..18146c980b6 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -88,8 +88,6 @@ extern void mem_cgroup_end_migration(struct mem_cgroup *mem, /* * For memory reclaim. */ -extern long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem); - extern int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem); extern void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority); @@ -210,11 +208,6 @@ static inline void mem_cgroup_end_migration(struct mem_cgroup *mem, { } -static inline int mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem) -{ - return 0; -} - static inline int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) { return 0; -- cgit v1.2.3 From a3b2d692690aef228e493b1beaafe5364cab3237 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 2 Apr 2009 16:57:45 -0700 Subject: cgroups: use css id in swap cgroup for saving memory v5 Try to use CSS ID for records in swap_cgroup. By this, on 64bit machine, size of swap_cgroup goes down to 2 bytes from 8bytes. This means, when 2GB of swap is equipped, (assume the page size is 4096bytes) From size of swap_cgroup = 2G/4k * 8 = 4Mbytes. To size of swap_cgroup = 2G/4k * 2 = 1Mbytes. Reduction is large. Of course, there are trade-offs. This CSS ID will add overhead to swap-in/swap-out/swap-free. But in general, - swap is a resource which the user tend to avoid use. - If swap is never used, swap_cgroup area is not used. - Reading traditional manuals, size of swap should be proportional to size of memory. Memory size of machine is increasing now. I think reducing size of swap_cgroup makes sense. Note: - ID->CSS lookup routine has no locks, it's under RCU-Read-Side. - memcg can be obsolete at rmdir() but not freed while refcnt from swap_cgroup is available. Changelog v4->v5: - reworked on to memcg-charge-swapcache-to-proper-memcg.patch Changlog ->v4: - fixed not configured case. - deleted unnecessary comments. - fixed NULL pointer bug. - fixed message in dmesg. [nishimura@mxp.nes.nec.co.jp: css_tryget can be called twice in !PageCgroupUsed case] Signed-off-by: KAMEZAWA Hiroyuki Cc: Li Zefan Cc: Balbir Singh Cc: Paul Menage Cc: Hugh Dickins Signed-off-by: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index 602cc1fdee9..7339c7bf733 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -91,24 +91,23 @@ static inline void page_cgroup_init(void) #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP #include -extern struct mem_cgroup * -swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem); -extern struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent); +extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id); +extern unsigned short lookup_swap_cgroup(swp_entry_t ent); extern int swap_cgroup_swapon(int type, unsigned long max_pages); extern void swap_cgroup_swapoff(int type); #else #include static inline -struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem) +unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) { - return NULL; + return 0; } static inline -struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent) +unsigned short lookup_swap_cgroup(swp_entry_t ent) { - return NULL; + return 0; } static inline int -- cgit v1.2.3 From bd1a8ab73edd449fecda633449cc277b856ad4f5 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 2 Apr 2009 16:57:50 -0700 Subject: cgroups: add 'data' field to struct cgroup_scanner We need to pass some data to test_task() or process_task() in some cases. Will be used later. Signed-off-by: Li Zefan Cc: KAMEZAWA Hiroyuki Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 43763bd772b..4316a546beb 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -337,6 +337,7 @@ struct cgroup_scanner { void (*process_task)(struct task_struct *p, struct cgroup_scanner *scan); struct ptr_heap *heap; + void *data; }; /* -- cgit v1.2.3 From a1bc5a4eee990a1f290735c8694d0aebdad095fa Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 2 Apr 2009 16:57:54 -0700 Subject: cpusets: replace zone allowed functions with node allowed The cpuset_zone_allowed() variants are actually only a function of the zone's node. Cc: Paul Menage Acked-by: Christoph Lameter Cc: Randy Dunlap Signed-off-by: David Rientjes Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 2e0d79678de..05ea1dd7d68 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -12,6 +12,7 @@ #include #include #include +#include #ifdef CONFIG_CPUSETS @@ -29,19 +30,29 @@ void cpuset_init_current_mems_allowed(void); void cpuset_update_task_memory_state(void); int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask); -extern int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask); -extern int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask); +extern int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask); +extern int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask); -static int inline cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) +static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) { return number_of_cpusets <= 1 || - __cpuset_zone_allowed_softwall(z, gfp_mask); + __cpuset_node_allowed_softwall(node, gfp_mask); } -static int inline cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask) +static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) { return number_of_cpusets <= 1 || - __cpuset_zone_allowed_hardwall(z, gfp_mask); + __cpuset_node_allowed_hardwall(node, gfp_mask); +} + +static inline int cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) +{ + return cpuset_node_allowed_softwall(zone_to_nid(z), gfp_mask); +} + +static inline int cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask) +{ + return cpuset_node_allowed_hardwall(zone_to_nid(z), gfp_mask); } extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, @@ -112,6 +123,16 @@ static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) return 1; } +static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) +{ + return 1; +} + +static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) +{ + return 1; +} + static inline int cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) { return 1; -- cgit v1.2.3 From 43918f2bf4806675943416d539d9d5e4d585ebff Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:00 -0700 Subject: signals: remove 'handler' parameter to tracehook functions Container-init must behave like global-init to processes within the container and hence it must be immune to unhandled fatal signals from within the container (i.e SIG_DFL signals that terminate the process). But the same container-init must behave like a normal process to processes in ancestor namespaces and so if it receives the same fatal signal from a process in ancestor namespace, the signal must be processed. Implementing these semantics requires that send_signal() determine pid namespace of the sender but since signals can originate from workqueues/ interrupt-handlers, determining pid namespace of sender may not always be possible or safe. This patchset implements the design/simplified semantics suggested by Oleg Nesterov. The simplified semantics for container-init are: - container-init must never be terminated by a signal from a descendant process. - container-init must never be immune to SIGKILL from an ancestor namespace (so a process in parent namespace must always be able to terminate a descendant container). - container-init may be immune to unhandled fatal signals (like SIGUSR1) even if they are from ancestor namespace. SIGKILL/SIGSTOP are the only reliable signals to a container-init from ancestor namespace. This patch: Based on an earlier patch submitted by Oleg Nesterov and comments from Roland McGrath (http://lkml.org/lkml/2008/11/19/258). The handler parameter is currently unused in the tracehook functions. Besides, the tracehook functions are called with siglock held, so the functions can check the handler if they later need to. Removing the parameter simiplifies changes to sig_ignored() in a follow-on patch. Signed-off-by: Sukadev Bhattiprolu Acked-by: Roland McGrath Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Cc: Daniel Lezcano Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/tracehook.h | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 6186a789d6c..eb4c6545b38 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -388,17 +388,14 @@ static inline void tracehook_signal_handler(int sig, siginfo_t *info, * tracehook_consider_ignored_signal - suppress short-circuit of ignored signal * @task: task receiving the signal * @sig: signal number being sent - * @handler: %SIG_IGN or %SIG_DFL * * Return zero iff tracing doesn't care to examine this ignored signal, * so it can short-circuit normal delivery and never even get queued. - * Either @handler is %SIG_DFL and @sig's default is ignore, or it's %SIG_IGN. * * Called with @task->sighand->siglock held. */ static inline int tracehook_consider_ignored_signal(struct task_struct *task, - int sig, - void __user *handler) + int sig) { return (task_ptrace(task) & PT_PTRACED) != 0; } @@ -407,19 +404,17 @@ static inline int tracehook_consider_ignored_signal(struct task_struct *task, * tracehook_consider_fatal_signal - suppress special handling of fatal signal * @task: task receiving the signal * @sig: signal number being sent - * @handler: %SIG_DFL or %SIG_IGN * * Return nonzero to prevent special handling of this termination signal. - * Normally @handler is %SIG_DFL. It can be %SIG_IGN if @sig is ignored, - * in which case force_sig() is about to reset it to %SIG_DFL. + * Normally handler for signal is %SIG_DFL. It can be %SIG_IGN if @sig is + * ignored, in which case force_sig() is about to reset it to %SIG_DFL. * When this returns zero, this signal might cause a quick termination * that does not give the debugger a chance to intercept the signal. * * Called with or without @task->sighand->siglock held. */ static inline int tracehook_consider_fatal_signal(struct task_struct *task, - int sig, - void __user *handler) + int sig) { return (task_ptrace(task) & PT_PTRACED) != 0; } -- cgit v1.2.3 From 4576145c1ecdaaea9ef8976a48335206aa1ebf91 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:14 -0700 Subject: ptrace: fix possible zombie leak on PTRACE_DETACH When ptrace_detach() takes tasklist, the tracee can be SIGKILL'ed. If it has already passed exit_notify() we can leak a zombie, because a) ptracing disables the auto-reaping logic, and b) ->real_parent was not notified about the child's death. ptrace_detach() should follow the ptrace_exit's logic, change the code accordingly. Signed-off-by: Oleg Nesterov Cc: Jerome Marchand Cc: Roland McGrath Tested-by: Denys Vlasenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ptrace.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 98b93ca4db0..1a2b0cb5553 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -94,6 +94,7 @@ extern void ptrace_notify(int exit_code); extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent); extern void __ptrace_unlink(struct task_struct *child); +extern int __ptrace_detach(struct task_struct *tracer, struct task_struct *p); extern void ptrace_fork(struct task_struct *task, unsigned long clone_flags); #define PTRACE_MODE_READ 1 #define PTRACE_MODE_ATTACH 2 -- cgit v1.2.3 From 39c626ae47c469abdfd30c6e42eff884931380d6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:18 -0700 Subject: forget_original_parent: split out the un-ptrace part By discussion with Roland. - Rename ptrace_exit() to exit_ptrace(), and change it to do all the necessary work with ->ptraced list by its own. - Move this code from exit.c to ptrace.c - Update the comment in ptrace_detach() to explain the rechecking of the child->ptrace. Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Cc: "Metzger, Markus T" Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ptrace.h | 2 +- include/linux/sched.h | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 1a2b0cb5553..67c15653fc2 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -94,7 +94,7 @@ extern void ptrace_notify(int exit_code); extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent); extern void __ptrace_unlink(struct task_struct *child); -extern int __ptrace_detach(struct task_struct *tracer, struct task_struct *p); +extern void exit_ptrace(struct task_struct *tracer); extern void ptrace_fork(struct task_struct *task, unsigned long clone_flags); #define PTRACE_MODE_READ 1 #define PTRACE_MODE_ATTACH 2 diff --git a/include/linux/sched.h b/include/linux/sched.h index 9186f8c5d5f..b47c94e7560 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2061,6 +2061,11 @@ static inline int thread_group_empty(struct task_struct *p) #define delay_group_leader(p) \ (thread_group_leader(p) && !thread_group_empty(p)) +static inline int task_detached(struct task_struct *p) +{ + return p->exit_signal == -1; +} + /* * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring * subscriptions and synchronises with wait4(). Also used in procfs. Also -- cgit v1.2.3 From bb24c679a51b1a9b726b901330649e3861814ac0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:20 -0700 Subject: tracehook_notify_death: use task_detached() helper Now that task_detached() is exported, change tracehook_notify_death() to use this helper, nobody else checks ->exit_signal == -1 by hand. Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Cc: "Metzger, Markus T" Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/tracehook.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index eb4c6545b38..c7aa154f4bf 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -502,7 +502,7 @@ static inline int tracehook_notify_jctl(int notify, int why) static inline int tracehook_notify_death(struct task_struct *task, void **death_cookie, int group_dead) { - if (task->exit_signal == -1) + if (task_detached(task)) return task->ptrace ? SIGCHLD : DEATH_REAP; /* -- cgit v1.2.3 From 40e8a10de2c9f87e892dcd5a6f9d1b208329ffea Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 2 Apr 2009 16:58:25 -0700 Subject: cpu hotplug: remove unused cpuhotplug_mutex_lock() cpuhotplug_mutex_lock() is not used, remove it. Signed-off-by: Lai Jiangshan Cc: Ingo Molnar Cc: Rusty Russell Acked-by: Gautham R Shenoy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpu.h | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'include') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index c2747ac2ae4..2643d848df9 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -23,7 +23,6 @@ #include #include #include -#include struct cpu { int node_id; /* The node which contains the CPU */ @@ -103,16 +102,6 @@ extern struct sysdev_class cpu_sysdev_class; #ifdef CONFIG_HOTPLUG_CPU /* Stop CPUs going up and down. */ -static inline void cpuhotplug_mutex_lock(struct mutex *cpu_hp_mutex) -{ - mutex_lock(cpu_hp_mutex); -} - -static inline void cpuhotplug_mutex_unlock(struct mutex *cpu_hp_mutex) -{ - mutex_unlock(cpu_hp_mutex); -} - extern void get_online_cpus(void); extern void put_online_cpus(void); #define hotcpu_notifier(fn, pri) { \ @@ -126,11 +115,6 @@ int cpu_down(unsigned int cpu); #else /* CONFIG_HOTPLUG_CPU */ -static inline void cpuhotplug_mutex_lock(struct mutex *cpu_hp_mutex) -{ } -static inline void cpuhotplug_mutex_unlock(struct mutex *cpu_hp_mutex) -{ } - #define get_online_cpus() do { } while (0) #define put_online_cpus() do { } while (0) #define hotcpu_notifier(fn, pri) do { (void)(fn); } while (0) -- cgit v1.2.3 From a50b0aa4bd9a7d42112442a385f3dc0e775284dd Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 2 Apr 2009 16:58:29 -0700 Subject: struct linux_binprm: drop unused fields Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/binfmts.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 77b4a9e4600..6638b8148de 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -35,8 +35,7 @@ struct linux_binprm{ #endif struct mm_struct *mm; unsigned long p; /* current top of mem */ - unsigned int sh_bang:1, - misc_bang:1, + unsigned int cred_prepared:1,/* true if creds already prepared (multiple * preps happen for interpreters) */ cap_effective:1;/* true if has elevated effective capabilities, -- cgit v1.2.3 From 1f80769ffd36e74357fe896dc43dddf1af1510f3 Mon Sep 17 00:00:00 2001 From: Paul Fulghum Date: Thu, 2 Apr 2009 16:58:30 -0700 Subject: synclink_gt: add clock options Add support for x8 asynchronous sample rate and ability to specify base clock frequency. Signed-off-by: Paul Fulghum Acked-by: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/synclink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/synclink.h b/include/linux/synclink.h index 99b8bdb17b2..0ff2779c44d 100644 --- a/include/linux/synclink.h +++ b/include/linux/synclink.h @@ -125,6 +125,7 @@ #define MGSL_MODE_MONOSYNC 3 #define MGSL_MODE_BISYNC 4 #define MGSL_MODE_RAW 6 +#define MGSL_MODE_BASE_CLOCK 7 #define MGSL_BUS_TYPE_ISA 1 #define MGSL_BUS_TYPE_EISA 2 -- cgit v1.2.3 From 6dda81f4384b94930826eded254d8c16f89a9248 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:35 -0700 Subject: pids: document task_pgrp/task_session is not safe without tasklist/rcu Even if task == current, it is not safe to dereference the result of task_pgrp/task_session. We can race with another thread which changes the special pid via setpgid/setsid. Document this. The next 2 patches give an example of the unsafe usage, we have more bad users. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Oleg Nesterov Cc: Louis Rilling Cc: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Sukadev Bhattiprolu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index b47c94e7560..722dd313bf8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1489,6 +1489,11 @@ static inline struct pid *task_tgid(struct task_struct *task) return task->group_leader->pids[PIDTYPE_PID].pid; } +/* + * Without tasklist or rcu lock it is not safe to dereference + * the result of task_pgrp/task_session even if task == current, + * we can race with another thread doing sys_setsid/sys_setpgid. + */ static inline struct pid *task_pgrp(struct task_struct *task) { return task->group_leader->pids[PIDTYPE_PGID].pid; -- cgit v1.2.3 From 52ee2dfdd4f51cf422ea6a96a0846dc94244aa37 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:38 -0700 Subject: pids: refactor vnr/nr_ns helpers to make them safe Inho, the safety rules for vnr/nr_ns helpers are horrible and buggy. task_pid_nr_ns(task) needs rcu/tasklist depending on task == current. As for "special" pids, vnr/nr_ns helpers always need rcu. However, if task != current, they are unsafe even under rcu lock, we can't trust task->group_leader without the special checks. And almost every helper has a callsite which needs a fix. Also, it is a bit annoying that the implementations of, say, task_pgrp_vnr() and task_pgrp_nr_ns() are not "symmetrical". This patch introduces the new helper, __task_pid_nr_ns(), which is always safe to use, and turns all other helpers into the trivial wrappers. After this I'll send another patch which converts task_tgid_xxx() as well, they're are a bit special. Signed-off-by: Oleg Nesterov Cc: Louis Rilling Cc: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Sukadev Bhattiprolu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 722dd313bf8..49df878a0ca 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1519,17 +1519,23 @@ struct pid_namespace; * * see also pid_nr() etc in include/linux/pid.h */ +pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, + struct pid_namespace *ns); static inline pid_t task_pid_nr(struct task_struct *tsk) { return tsk->pid; } -pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); +static inline pid_t task_pid_nr_ns(struct task_struct *tsk, + struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns); +} static inline pid_t task_pid_vnr(struct task_struct *tsk) { - return pid_vnr(task_pid(tsk)); + return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL); } @@ -1551,11 +1557,15 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk) return tsk->signal->__pgrp; } -pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); +static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, + struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns); +} static inline pid_t task_pgrp_vnr(struct task_struct *tsk) { - return pid_vnr(task_pgrp(tsk)); + return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL); } @@ -1564,14 +1574,17 @@ static inline pid_t task_session_nr(struct task_struct *tsk) return tsk->signal->__session; } -pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); +static inline pid_t task_session_nr_ns(struct task_struct *tsk, + struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns); +} static inline pid_t task_session_vnr(struct task_struct *tsk) { - return pid_vnr(task_session(tsk)); + return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); } - /** * pid_alive - check that a task structure is not stale * @p: Task structure to be checked. -- cgit v1.2.3 From 1b0f7ffd0ea27cd3a0b9ca04e3df9522048c32a3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:39 -0700 Subject: pids: kill signal_struct-> __pgrp/__session and friends We are wasting 2 words in signal_struct without any reason to implement task_pgrp_nr() and task_session_nr(). task_session_nr() has no callers since 2e2ba22ea4fd4bb85f0fa37c521066db6775cbef, we can remove it. task_pgrp_nr() is still (I believe wrongly) used in fs/autofsX and fs/coda. This patch reimplements task_pgrp_nr() via task_pgrp_nr_ns(), and kills __pgrp/__session and the related helpers. The change in drivers/char/tty_io.c is cosmetic, but hopefully makes sense anyway. Signed-off-by: Oleg Nesterov Acked-by: Alan Cox [tty parts] Cc: Cedric Le Goater Cc: Dave Hansen Cc: Eric Biederman Cc: Pavel Emelyanov Cc: Serge Hallyn Cc: Sukadev Bhattiprolu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 43 ++++++------------------------------------- 1 file changed, 6 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 49df878a0ca..206ac003e8c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -547,25 +547,8 @@ struct signal_struct { struct list_head cpu_timers[3]; - /* job control IDs */ - - /* - * pgrp and session fields are deprecated. - * use the task_session_Xnr and task_pgrp_Xnr routines below - */ - - union { - pid_t pgrp __deprecated; - pid_t __pgrp; - }; - struct pid *tty_old_pgrp; - union { - pid_t session __deprecated; - pid_t __session; - }; - /* boolean value for session group leader */ int leader; @@ -1469,16 +1452,6 @@ static inline int rt_task(struct task_struct *p) return rt_prio(p->prio); } -static inline void set_task_session(struct task_struct *tsk, pid_t session) -{ - tsk->signal->__session = session; -} - -static inline void set_task_pgrp(struct task_struct *tsk, pid_t pgrp) -{ - tsk->signal->__pgrp = pgrp; -} - static inline struct pid *task_pid(struct task_struct *task) { return task->pids[PIDTYPE_PID].pid; @@ -1552,11 +1525,6 @@ static inline pid_t task_tgid_vnr(struct task_struct *tsk) } -static inline pid_t task_pgrp_nr(struct task_struct *tsk) -{ - return tsk->signal->__pgrp; -} - static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { @@ -1569,11 +1537,6 @@ static inline pid_t task_pgrp_vnr(struct task_struct *tsk) } -static inline pid_t task_session_nr(struct task_struct *tsk) -{ - return tsk->signal->__session; -} - static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { @@ -1585,6 +1548,12 @@ static inline pid_t task_session_vnr(struct task_struct *tsk) return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); } +/* obsolete, do not use */ +static inline pid_t task_pgrp_nr(struct task_struct *tsk) +{ + return task_pgrp_nr_ns(tsk, &init_pid_ns); +} + /** * pid_alive - check that a task structure is not stale * @p: Task structure to be checked. -- cgit v1.2.3 From 7c5ff4f92e2b47c56d777a5adbadd9a52841b635 Mon Sep 17 00:00:00 2001 From: Harry Ciao Date: Thu, 2 Apr 2009 16:58:48 -0700 Subject: pci: Add AMD8111 PCI Bridge PCI Device ID Add the PCI Device ID of the PCI Bridge Controller on AMD8111 chip. Signed-off-by: Harry Ciao Cc: Doug Thompson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pci_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index cb14fd26083..170f8b1f22d 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -526,6 +526,7 @@ #define PCI_DEVICE_ID_AMD_OPUS_7443 0x7443 #define PCI_DEVICE_ID_AMD_VIPER_7443 0x7443 #define PCI_DEVICE_ID_AMD_OPUS_7445 0x7445 +#define PCI_DEVICE_ID_AMD_8111_PCI 0x7460 #define PCI_DEVICE_ID_AMD_8111_LPC 0x7468 #define PCI_DEVICE_ID_AMD_8111_IDE 0x7469 #define PCI_DEVICE_ID_AMD_8111_SMBUS2 0x746a -- cgit v1.2.3 From 04d491ab2a53008a1aa98ac09561768c7f3adda3 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Thu, 2 Apr 2009 16:58:57 -0700 Subject: kexec: add dmesg log symbols to /proc/vmcoreinfo lists It would be nice to be able to extract the dmesg log from a vmcore file without needing to keep the debug symbols for the running kernel handy all the time. We have a facility to do this in /proc/vmcore. This patch adds the log_buf and log_end symbols to the vmcoreinfo area so that tools (like makedumpfile) can easily extract the dmesg logs from a vmcore image. [akpm@linux-foundation.org: several fixes and cleanups] [akpm@linux-foundation.org: fix unused log_buf_kexec_setup()] [akpm@linux-foundation.org: build fix] Signed-off-by: Neil Horman Cc: Simon Horman Acked-by: Vivek Goyal Cc: Neil Horman Cc: Simon Horman Cc: Vivek Goyal Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index e720b0da775..556d781e69f 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -242,6 +242,7 @@ extern struct ratelimit_state printk_ratelimit_state; extern int printk_ratelimit(void); extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msec); +void log_buf_kexec_setup(void); #else static inline int vprintk(const char *s, va_list args) __attribute__ ((format (printf, 1, 0))); @@ -253,6 +254,9 @@ static inline int printk_ratelimit(void) { return 0; } static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, \ unsigned int interval_msec) \ { return false; } +static inline void log_buf_kexec_setup(void) +{ +} #endif extern int printk_needs_cpu(int cpu); -- cgit v1.2.3 From f3554f4bc69803ac2baaf7cf2aa4339e1f4b693e Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Thu, 2 Apr 2009 16:59:23 -0700 Subject: preadv/pwritev: Add preadv and pwritev system calls. This patch adds preadv and pwritev system calls. These syscalls are a pretty straightforward combination of pread and readv (same for write). They are quite useful for doing vectored I/O in threaded applications. Using lseek+readv instead opens race windows you'll have to plug with locking. Other systems have such system calls too, for example NetBSD, check here: http://www.daemon-systems.org/man/preadv.2.html The application-visible interface provided by glibc should look like this to be compatible to the existing implementations in the *BSD family: ssize_t preadv(int d, const struct iovec *iov, int iovcnt, off_t offset); ssize_t pwritev(int d, const struct iovec *iov, int iovcnt, off_t offset); This prototype has one problem though: On 32bit archs is the (64bit) offset argument unaligned, which the syscall ABI of several archs doesn't allow to do. At least s390 needs a wrapper in glibc to handle this. As we'll need a wrappers in glibc anyway I've decided to push problem to glibc entriely and use a syscall prototype which works without arch-specific wrappers inside the kernel: The offset argument is explicitly splitted into two 32bit values. The patch sports the actual system call implementation and the windup in the x86 system call tables. Other archs follow as separate patches. Signed-off-by: Gerd Hoffmann Cc: Arnd Bergmann Cc: Al Viro Cc: Cc: Cc: Ralf Baechle Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compat.h | 6 ++++++ include/linux/syscalls.h | 4 ++++ 2 files changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/compat.h b/include/linux/compat.h index b880864672d..9723edd6455 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -191,6 +191,12 @@ asmlinkage ssize_t compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen); asmlinkage ssize_t compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsigned long vlen); +asmlinkage ssize_t compat_sys_preadv(unsigned long fd, + const struct compat_iovec __user *vec, + unsigned long vlen, u32 pos_high, u32 pos_low); +asmlinkage ssize_t compat_sys_pwritev(unsigned long fd, + const struct compat_iovec __user *vec, + unsigned long vlen, u32 pos_high, u32 pos_low); int compat_do_execve(char * filename, compat_uptr_t __user *argv, compat_uptr_t __user *envp, struct pt_regs * regs); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index f9f900cfd06..b299a82a05e 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -461,6 +461,10 @@ asmlinkage long sys_pread64(unsigned int fd, char __user *buf, size_t count, loff_t pos); asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf, size_t count, loff_t pos); +asmlinkage long sys_preadv(unsigned long fd, const struct iovec __user *vec, + unsigned long vlen, u32 pos_high, u32 pos_low); +asmlinkage long sys_pwritev(unsigned long fd, const struct iovec __user *vec, + unsigned long vlen, u32 pos_high, u32 pos_low); asmlinkage long sys_getcwd(char __user *buf, unsigned long size); asmlinkage long sys_mkdir(const char __user *pathname, int mode); asmlinkage long sys_chdir(const char __user *filename); -- cgit v1.2.3 From e8c158bb313c1df421eab7dc4299cd39cbbf5895 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Thu, 2 Apr 2009 16:59:45 -0700 Subject: Factor out #ifdefs from kernel/spinlock.c to LOCK_CONTENDED_FLAGS SGI has observed that on large systems, interrupts are not serviced for a long period of time when waiting for a rwlock. The following patch series re-enables irqs while waiting for the lock, resembling the code which is already there for spinlocks. I only made the ia64 version, because the patch adds some overhead to the fast path. I assume there is currently no demand to have this for other architectures, because the systems are not so large. Of course, the possibility to implement raw_{read|write}_lock_flags for any architecture is still there. This patch: The new macro LOCK_CONTENDED_FLAGS expands to the correct implementation depending on the config options, so that IRQ's are re-enabled when possible, but they remain disabled if CONFIG_LOCKDEP is set. Signed-off-by: Petr Tesarik Signed-off-by: Robin Holt Cc: Cc: Ingo Molnar Cc: Peter Zijlstra Cc: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lockdep.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 5a58ea3e91e..da5a5a1f4cd 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -364,6 +364,23 @@ do { \ #endif /* CONFIG_LOCK_STAT */ +#ifdef CONFIG_LOCKDEP + +/* + * On lockdep we dont want the hand-coded irq-enable of + * _raw_*_lock_flags() code, because lockdep assumes + * that interrupts are not re-enabled during lock-acquire: + */ +#define LOCK_CONTENDED_FLAGS(_lock, try, lock, lockfl, flags) \ + LOCK_CONTENDED((_lock), (try), (lock)) + +#else /* CONFIG_LOCKDEP */ + +#define LOCK_CONTENDED_FLAGS(_lock, try, lock, lockfl, flags) \ + lockfl((_lock), (flags)) + +#endif /* CONFIG_LOCKDEP */ + #ifdef CONFIG_GENERIC_HARDIRQS extern void early_init_irq_lock_class(void); #else -- cgit v1.2.3 From f5f7eac41db827a47b2163330eecd7bb55ae9f12 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Thu, 2 Apr 2009 16:59:46 -0700 Subject: Allow rwlocks to re-enable interrupts Pass the original flags to rwlock arch-code, so that it can re-enable interrupts if implemented for that architecture. Initially, make __raw_read_lock_flags and __raw_write_lock_flags stubs which just do the same thing as non-flags variants. Signed-off-by: Petr Tesarik Signed-off-by: Robin Holt Acked-by: Peter Zijlstra Cc: Acked-by: Ingo Molnar Cc: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-m32r/spinlock.h | 3 +++ include/linux/spinlock.h | 6 ++++++ 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/asm-m32r/spinlock.h b/include/asm-m32r/spinlock.h index f5cfba81ee1..dded923883b 100644 --- a/include/asm-m32r/spinlock.h +++ b/include/asm-m32r/spinlock.h @@ -316,6 +316,9 @@ static inline int __raw_write_trylock(raw_rwlock_t *lock) return 0; } +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) + #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index a0c66a2e00a..252b245cfcf 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -153,9 +153,11 @@ do { \ extern int _raw_spin_trylock(spinlock_t *lock); extern void _raw_spin_unlock(spinlock_t *lock); extern void _raw_read_lock(rwlock_t *lock); +#define _raw_read_lock_flags(lock, flags) _raw_read_lock(lock) extern int _raw_read_trylock(rwlock_t *lock); extern void _raw_read_unlock(rwlock_t *lock); extern void _raw_write_lock(rwlock_t *lock); +#define _raw_write_lock_flags(lock, flags) _raw_write_lock(lock) extern int _raw_write_trylock(rwlock_t *lock); extern void _raw_write_unlock(rwlock_t *lock); #else @@ -165,9 +167,13 @@ do { \ # define _raw_spin_trylock(lock) __raw_spin_trylock(&(lock)->raw_lock) # define _raw_spin_unlock(lock) __raw_spin_unlock(&(lock)->raw_lock) # define _raw_read_lock(rwlock) __raw_read_lock(&(rwlock)->raw_lock) +# define _raw_read_lock_flags(lock, flags) \ + __raw_read_lock_flags(&(lock)->raw_lock, *(flags)) # define _raw_read_trylock(rwlock) __raw_read_trylock(&(rwlock)->raw_lock) # define _raw_read_unlock(rwlock) __raw_read_unlock(&(rwlock)->raw_lock) # define _raw_write_lock(rwlock) __raw_write_lock(&(rwlock)->raw_lock) +# define _raw_write_lock_flags(lock, flags) \ + __raw_write_lock_flags(&(lock)->raw_lock, *(flags)) # define _raw_write_trylock(rwlock) __raw_write_trylock(&(rwlock)->raw_lock) # define _raw_write_unlock(rwlock) __raw_write_unlock(&(rwlock)->raw_lock) #endif -- cgit v1.2.3