/*
* linux/kernel/fork.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*/
/*
* 'fork.c' contains the help-routines for the 'fork' system call
* (see also entry.S and others).
* Fork is rather simple, once you get the hang of it, but the memory
* management can be a bitch. See 'mm/memory.c': 'copy_page_tables()'
*/
#include <linux/config.h>
#include <linux/malloc.h>
#include <linux/init.h>
#include <linux/unistd.h>
#include <linux/smp_lock.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
/*
 * Global fork bookkeeping.
 *
 * nr_threads  - number of tasks; incremented in do_fork() while holding
 *               tasklist_lock. The idle threads do not count..
 * nr_running  - not touched in this file (presumably maintained by the
 *               scheduler - TODO confirm).
 * max_threads - upper bound on nr_threads, computed in fork_init() and
 *               checked in do_fork().
 */
int nr_threads;
int nr_running;
int max_threads;
/* Bumped once per successful do_fork(). */
unsigned long total_forks; /* Handle normal Linux uptimes. */
/* Most recently handed-out pid; advanced by get_pid() under lastpid_lock. */
int last_pid;
/* Pid hash table; do_fork() inserts the new task via hash_pid(). */
struct task_struct *pidhash[PIDHASH_SZ];
37 void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
{
unsigned long flags;
41 wq_write_lock_irqsave(&q->lock, flags);
wait->flags = 0;
__add_wait_queue(q, wait);
44 wq_write_unlock_irqrestore(&q->lock, flags);
}
47 void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)
{
unsigned long flags;
51 wq_write_lock_irqsave(&q->lock, flags);
wait->flags = WQ_FLAG_EXCLUSIVE;
__add_wait_queue_tail(q, wait);
54 wq_write_unlock_irqrestore(&q->lock, flags);
}
57 void remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
{
unsigned long flags;
61 wq_write_lock_irqsave(&q->lock, flags);
__remove_wait_queue(q, wait);
63 wq_write_unlock_irqrestore(&q->lock, flags);
}
66 void __init fork_init(unsigned long mempages)
{
/*
* The default maximum number of threads is set to a safe
* value: the thread structures can take up at most half
* of memory.
*/
max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 2;
init_task.rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
init_task.rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
}
/* Protects next_safe and last_pid. */
spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED;
82 static int get_pid(unsigned long flags)
{
static int next_safe = PID_MAX;
struct task_struct *p;
87 if (flags & CLONE_PID)
88 return current->pid;
spin_lock(&lastpid_lock);
91 if((++last_pid) & 0xffff8000) {
last_pid = 300; /* Skip daemons etc. */
93 goto inside;
}
95 if(last_pid >= next_safe) {
inside:
next_safe = PID_MAX;
read_lock(&tasklist_lock);
repeat:
100 for_each_task(p) {
if(p->pid == last_pid ||
p->pgrp == last_pid ||
103 p->session == last_pid) {
104 if(++last_pid >= next_safe) {
105 if(last_pid & 0xffff8000)
last_pid = 300;
next_safe = PID_MAX;
}
109 goto repeat;
}
111 if(p->pid > last_pid && next_safe > p->pid)
next_safe = p->pid;
113 if(p->pgrp > last_pid && next_safe > p->pgrp)
next_safe = p->pgrp;
115 if(p->session > last_pid && next_safe > p->session)
next_safe = p->session;
}
118 read_unlock(&tasklist_lock);
}
120 spin_unlock(&lastpid_lock);
122 return last_pid;
}
125 static inline int dup_mmap(struct mm_struct * mm)
{
struct vm_area_struct * mpnt, *tmp, **pprev;
int retval;
130 flush_cache_mm(current->mm);
mm->locked_vm = 0;
mm->mmap = NULL;
mm->mmap_avl = NULL;
mm->mmap_cache = NULL;
mm->map_count = 0;
mm->cpu_vm_mask = 0;
mm->swap_cnt = 0;
mm->swap_address = 0;
pprev = &mm->mmap;
140 for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
struct file *file;
retval = -ENOMEM;
144 if(mpnt->vm_flags & VM_DONTCOPY)
145 continue;
tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
147 if (!tmp)
148 goto fail_nomem;
*tmp = *mpnt;
tmp->vm_flags &= ~VM_LOCKED;
tmp->vm_mm = mm;
mm->map_count++;
tmp->vm_next = NULL;
file = tmp->vm_file;
155 if (file) {
struct inode *inode = file->f_dentry->d_inode;
get_file(file);
158 if (tmp->vm_flags & VM_DENYWRITE)
atomic_dec(&inode->i_writecount);
/* insert tmp into the share list, just after mpnt */
spin_lock(&inode->i_mapping->i_shared_lock);
163 if((tmp->vm_next_share = mpnt->vm_next_share) != NULL)
mpnt->vm_next_share->vm_pprev_share =
&tmp->vm_next_share;
mpnt->vm_next_share = tmp;
tmp->vm_pprev_share = &mpnt->vm_next_share;
168 spin_unlock(&inode->i_mapping->i_shared_lock);
}
/* Copy the pages, but defer checking for errors */
retval = copy_page_range(mm, current->mm, tmp);
173 if (!retval && tmp->vm_ops && tmp->vm_ops->open)
tmp->vm_ops->open(tmp);
/*
* Link in the new vma even if an error occurred,
* so that exit_mmap() can clean up the mess.
*/
*pprev = tmp;
pprev = &tmp->vm_next;
183 if (retval)
184 goto fail_nomem;
}
retval = 0;
187 if (mm->map_count >= AVL_MIN_MAP_COUNT)
build_mmap_avl(mm);
fail_nomem:
flush_tlb_mm(current->mm);
192 return retval;
}
/*
 * Guards the list threaded through mm_struct.mmlist: copy_mm() takes it
 * around list_add(), mmput() acquires it via atomic_dec_and_lock()
 * before list_del().
 */
spinlock_t mmlist_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED;
/* Get an mm_struct from the mm_cachep slab cache (contents not initialised). */
#define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL))
/* Give an mm_struct back to the mm_cachep slab cache. */
#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
200 static struct mm_struct * mm_init(struct mm_struct * mm)
{
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
init_MUTEX(&mm->mmap_sem);
mm->page_table_lock = SPIN_LOCK_UNLOCKED;
mm->pgd = pgd_alloc();
207 if (mm->pgd)
208 return mm;
free_mm(mm);
210 return NULL;
}
/*
* Allocate and initialize an mm_struct.
*/
217 struct mm_struct * mm_alloc(void)
{
struct mm_struct * mm;
mm = allocate_mm();
222 if (mm) {
memset(mm, 0, sizeof(*mm));
224 return mm_init(mm);
}
226 return NULL;
}
/*
* Called when the last reference to the mm
* is dropped: either by a lazy thread or by
* mmput. Free the page directory and the mm.
*/
234 inline void __mmdrop(struct mm_struct *mm)
{
236 if (mm == &init_mm) BUG();
pgd_free(mm->pgd);
238 destroy_context(mm);
free_mm(mm);
}
/*
* Decrement the use count and release all resources for an mm.
*/
245 void mmput(struct mm_struct *mm)
{
247 if (atomic_dec_and_lock(&mm->mm_users, &mmlist_lock)) {
list_del(&mm->mmlist);
249 spin_unlock(&mmlist_lock);
exit_mmap(mm);
mmdrop(mm);
}
}
/* Please note the differences between mmput and mm_release.
* mmput is called whenever we stop holding onto a mm_struct,
* error success whatever.
*
* mm_release is called after a mm_struct has been removed
* from the current process.
*
* This difference is important for error handling, when we
* only half set up a mm_struct for a new process and need to restore
* the old one. Because we mmput the new mm_struct before
* restoring the old one. . .
* Eric Biederman 10 January 1998
*/
268 void mm_release(void)
{
struct task_struct *tsk = current;
/* notify parent sleeping on vfork() */
273 if (tsk->flags & PF_VFORK) {
tsk->flags &= ~PF_VFORK;
up(tsk->p_opptr->vfork_sem);
}
}
279 static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
{
struct mm_struct * mm, *oldmm;
int retval;
tsk->min_flt = tsk->maj_flt = 0;
tsk->cmin_flt = tsk->cmaj_flt = 0;
tsk->nswap = tsk->cnswap = 0;
tsk->mm = NULL;
tsk->active_mm = NULL;
/*
* Are we cloning a kernel thread?
*
* We need to steal a active VM for that..
*/
oldmm = current->mm;
297 if (!oldmm)
298 return 0;
300 if (clone_flags & CLONE_VM) {
atomic_inc(&oldmm->mm_users);
mm = oldmm;
303 goto good_mm;
}
retval = -ENOMEM;
mm = allocate_mm();
308 if (!mm)
309 goto fail_nomem;
/* Copy the current MM stuff.. */
memcpy(mm, oldmm, sizeof(*mm));
313 if (!mm_init(mm))
314 goto fail_nomem;
down(&oldmm->mmap_sem);
retval = dup_mmap(mm);
up(&oldmm->mmap_sem);
/*
* Add it to the mmlist after the parent.
*
* Doing it this way means that we can order
* the list, and fork() won't mess up the
* ordering significantly.
*/
spin_lock(&mmlist_lock);
list_add(&mm->mmlist, &oldmm->mmlist);
329 spin_unlock(&mmlist_lock);
331 if (retval)
332 goto free_pt;
/*
* child gets a private LDT (if there was an LDT in the parent)
*/
copy_segments(tsk, mm);
339 if (init_new_context(tsk,mm))
340 goto free_pt;
good_mm:
tsk->mm = mm;
tsk->active_mm = mm;
345 return 0;
free_pt:
mmput(mm);
fail_nomem:
350 return retval;
}
353 static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old)
{
struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
/* We don't need to lock fs - think why ;-) */
357 if (fs) {
atomic_set(&fs->count, 1);
fs->lock = RW_LOCK_UNLOCKED;
fs->umask = old->umask;
read_lock(&old->lock);
fs->rootmnt = mntget(old->rootmnt);
fs->root = dget(old->root);
fs->pwdmnt = mntget(old->pwdmnt);
fs->pwd = dget(old->pwd);
366 if (old->altroot) {
fs->altrootmnt = mntget(old->altrootmnt);
fs->altroot = dget(old->altroot);
369 } else {
fs->altrootmnt = NULL;
fs->altroot = NULL;
}
373 read_unlock(&old->lock);
}
375 return fs;
}
/*
 * Externally visible wrapper around __copy_fs_struct(): returns a new
 * private copy of @old, or NULL if the allocation failed.
 */
struct fs_struct *copy_fs_struct(struct fs_struct *old)
{
	struct fs_struct *dup = __copy_fs_struct(old);

	return dup;
}
383 static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
{
385 if (clone_flags & CLONE_FS) {
atomic_inc(¤t->fs->count);
387 return 0;
}
tsk->fs = __copy_fs_struct(current->fs);
390 if (!tsk->fs)
391 return -1;
392 return 0;
}
395 static int count_open_files(struct files_struct *files, int size)
{
int i;
/* Find the last open fd */
400 for (i = size/(8*sizeof(long)); i > 0; ) {
401 if (files->open_fds->fds_bits[--i])
402 break;
}
i = (i+1) * 8 * sizeof(long);
405 return i;
}
408 static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
{
struct files_struct *oldf, *newf;
struct file **old_fds, **new_fds;
int open_files, nfds, size, i, error = 0;
/*
* A background process may not have any files ...
*/
oldf = current->files;
418 if (!oldf)
419 goto out;
421 if (clone_flags & CLONE_FILES) {
atomic_inc(&oldf->count);
423 goto out;
}
tsk->files = NULL;
error = -ENOMEM;
newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
429 if (!newf)
430 goto out;
atomic_set(&newf->count, 1);
newf->file_lock = RW_LOCK_UNLOCKED;
newf->next_fd = 0;
newf->max_fds = NR_OPEN_DEFAULT;
newf->max_fdset = __FD_SETSIZE;
newf->close_on_exec = &newf->close_on_exec_init;
newf->open_fds = &newf->open_fds_init;
newf->fd = &newf->fd_array[0];
/* We don't yet have the oldf readlock, but even if the old
fdset gets grown now, we'll only copy up to "size" fds */
size = oldf->max_fdset;
445 if (size > __FD_SETSIZE) {
newf->max_fdset = 0;
write_lock(&newf->file_lock);
error = expand_fdset(newf, size);
449 write_unlock(&newf->file_lock);
450 if (error)
451 goto out_release;
}
read_lock(&oldf->file_lock);
open_files = count_open_files(oldf, size);
/*
* Check whether we need to allocate a larger fd array.
* Note: we're not a clone task, so the open count won't
* change.
*/
nfds = NR_OPEN_DEFAULT;
463 if (open_files > nfds) {
464 read_unlock(&oldf->file_lock);
newf->max_fds = 0;
write_lock(&newf->file_lock);
error = expand_fd_array(newf, open_files);
468 write_unlock(&newf->file_lock);
469 if (error)
470 goto out_release;
nfds = newf->max_fds;
read_lock(&oldf->file_lock);
}
old_fds = oldf->fd;
new_fds = newf->fd;
memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);
memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);
481 for (i = open_files; i != 0; i--) {
struct file *f = *old_fds++;
483 if (f)
get_file(f);
*new_fds++ = f;
}
487 read_unlock(&oldf->file_lock);
/* compute the remainder to be cleared */
size = (newf->max_fds - open_files) * sizeof(struct file *);
/* This is long word aligned thus could use a optimized version */
memset(new_fds, 0, size);
495 if (newf->max_fdset > open_files) {
int left = (newf->max_fdset-open_files)/8;
int start = open_files / (8 * sizeof(unsigned long));
memset(&newf->open_fds->fds_bits[start], 0, left);
memset(&newf->close_on_exec->fds_bits[start], 0, left);
}
tsk->files = newf;
error = 0;
out:
506 return error;
out_release:
free_fdset (newf->close_on_exec, newf->max_fdset);
free_fdset (newf->open_fds, newf->max_fdset);
kmem_cache_free(files_cachep, newf);
512 goto out;
}
515 static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
{
struct signal_struct *sig;
519 if (clone_flags & CLONE_SIGHAND) {
atomic_inc(¤t->sig->count);
521 return 0;
}
sig = kmem_cache_alloc(sigact_cachep, GFP_KERNEL);
tsk->sig = sig;
525 if (!sig)
526 return -1;
527 spin_lock_init(&sig->siglock);
atomic_set(&sig->count, 1);
memcpy(tsk->sig->action, current->sig->action, sizeof(tsk->sig->action));
530 return 0;
}
533 static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
{
unsigned long new_flags = p->flags;
new_flags &= ~(PF_SUPERPRIV | PF_USEDFPU | PF_VFORK);
new_flags |= PF_FORKNOEXEC;
539 if (!(clone_flags & CLONE_PTRACE))
p->ptrace = 0;
541 if (clone_flags & CLONE_VFORK)
new_flags |= PF_VFORK;
p->flags = new_flags;
}
/*
* Ok, this is the main fork-routine. It copies the system process
* information (task[nr]) and sets up the necessary registers. It also
* copies the data segment in its entirety. The "stack_start" and
* "stack_top" arguments are simply passed along to the platform
* specific copy_thread() routine. Most platforms ignore stack_top.
* For an example that's using stack_top, see
* arch/ia64/kernel/process.c.
*/
555 int do_fork(unsigned long clone_flags, unsigned long stack_start,
struct pt_regs *regs, unsigned long stack_size)
{
int retval = -ENOMEM;
struct task_struct *p;
DECLARE_MUTEX_LOCKED(sem);
562 if (clone_flags & CLONE_PID) {
/* This is only allowed from the boot up thread */
564 if (current->pid)
565 return -EPERM;
}
current->vfork_sem = &sem;
p = alloc_task_struct();
571 if (!p)
572 goto fork_out;
*p = *current;
retval = -EAGAIN;
577 if (atomic_read(&p->user->processes) >= p->rlim[RLIMIT_NPROC].rlim_cur)
578 goto bad_fork_free;
atomic_inc(&p->user->__count);
atomic_inc(&p->user->processes);
/*
* Counter increases are protected by
* the kernel lock so nr_threads can't
* increase under us (but it may decrease).
*/
587 if (nr_threads >= max_threads)
588 goto bad_fork_cleanup_count;
590 get_exec_domain(p->exec_domain);
592 if (p->binfmt && p->binfmt->module)
__MOD_INC_USE_COUNT(p->binfmt->module);
p->did_exec = 0;
p->swappable = 0;
p->state = TASK_UNINTERRUPTIBLE;
copy_flags(clone_flags, p);
p->pid = get_pid(clone_flags);
p->run_list.next = NULL;
p->run_list.prev = NULL;
605 if ((clone_flags & CLONE_VFORK) || !(clone_flags & CLONE_PARENT)) {
p->p_opptr = current;
607 if (!(p->ptrace & PT_PTRACED))
p->p_pptr = current;
}
p->p_cptr = NULL;
init_waitqueue_head(&p->wait_chldexit);
p->vfork_sem = NULL;
613 spin_lock_init(&p->alloc_lock);
p->sigpending = 0;
init_sigpending(&p->pending);
p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
init_timer(&p->real_timer);
p->real_timer.data = (unsigned long) p;
p->leader = 0; /* session leadership doesn't inherit */
p->tty_old_pgrp = 0;
p->times.tms_utime = p->times.tms_stime = 0;
p->times.tms_cutime = p->times.tms_cstime = 0;
#ifdef CONFIG_SMP
{
int i;
p->has_cpu = 0;
p->processor = current->processor;
/* ?? should we just memset this ?? */
for(i = 0; i < smp_num_cpus; i++)
p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
spin_lock_init(&p->sigmask_lock);
}
#endif
p->lock_depth = -1; /* -1 = no lock */
p->start_time = jiffies;
retval = -ENOMEM;
/* copy all the process information */
643 if (copy_files(clone_flags, p))
644 goto bad_fork_cleanup;
645 if (copy_fs(clone_flags, p))
646 goto bad_fork_cleanup_files;
647 if (copy_sighand(clone_flags, p))
648 goto bad_fork_cleanup_fs;
649 if (copy_mm(clone_flags, p))
650 goto bad_fork_cleanup_sighand;
retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
652 if (retval)
653 goto bad_fork_cleanup_sighand;
p->semundo = NULL;
/* Our parent execution domain becomes current domain
These must match for thread signalling to apply */
p->parent_exec_id = p->self_exec_id;
/* ok, now we should be set up.. */
p->swappable = 1;
p->exit_signal = clone_flags & CSIGNAL;
p->pdeath_signal = 0;
/*
* "share" dynamic priority between parent and child, thus the
* total amount of dynamic priorities in the system doesnt change,
* more scheduling fairness. This is only important in the first
* timeslice, on the long run the scheduling behaviour is unchanged.
*/
p->counter = (current->counter + 1) >> 1;
current->counter >>= 1;
674 if (!current->counter)
current->need_resched = 1;
/*
* Ok, add it to the run-queues and make it
* visible to the rest of the system.
*
* Let it rip!
*/
retval = p->pid;
p->tgid = retval;
685 INIT_LIST_HEAD(&p->thread_group);
686 write_lock_irq(&tasklist_lock);
687 if (clone_flags & CLONE_THREAD) {
p->tgid = current->tgid;
list_add(&p->thread_group, ¤t->thread_group);
}
691 SET_LINKS(p);
hash_pid(p);
nr_threads++;
694 write_unlock_irq(&tasklist_lock);
696 if (p->ptrace & PT_PTRACED)
send_sig(SIGSTOP, p, 1);
wake_up_process(p); /* do this last */
++total_forks;
fork_out:
703 if ((clone_flags & CLONE_VFORK) && (retval > 0))
down(&sem);
705 return retval;
bad_fork_cleanup_sighand:
exit_sighand(p);
bad_fork_cleanup_fs:
exit_fs(p); /* blocking */
bad_fork_cleanup_files:
exit_files(p); /* blocking */
bad_fork_cleanup:
714 put_exec_domain(p->exec_domain);
715 if (p->binfmt && p->binfmt->module)
__MOD_DEC_USE_COUNT(p->binfmt->module);
bad_fork_cleanup_count:
atomic_dec(&p->user->processes);
free_uid(p->user);
bad_fork_free:
free_task_struct(p);
722 goto fork_out;
}
/*
 * Slab caches backing the per-process structures handled in this file.
 * All five are created at boot by proc_caches_init(), which panics if
 * any of them cannot be set up.
 */
/* SLAB cache for signal_struct structures (tsk->sig) */
kmem_cache_t *sigact_cachep;
/* SLAB cache for files_struct structures (tsk->files) */
kmem_cache_t *files_cachep;
/* SLAB cache for fs_struct structures (tsk->fs) */
kmem_cache_t *fs_cachep;
/* SLAB cache for vm_area_struct structures */
kmem_cache_t *vm_area_cachep;
/* SLAB cache for mm_struct structures (tsk->mm) */
kmem_cache_t *mm_cachep;
740 void __init proc_caches_init(void)
{
sigact_cachep = kmem_cache_create("signal_act",
sizeof(struct signal_struct), 0,
SLAB_HWCACHE_ALIGN, NULL, NULL);
745 if (!sigact_cachep)
panic("Cannot create signal action SLAB cache");
files_cachep = kmem_cache_create("files_cache",
sizeof(struct files_struct), 0,
SLAB_HWCACHE_ALIGN, NULL, NULL);
751 if (!files_cachep)
panic("Cannot create files SLAB cache");
fs_cachep = kmem_cache_create("fs_cache",
sizeof(struct fs_struct), 0,
SLAB_HWCACHE_ALIGN, NULL, NULL);
757 if (!fs_cachep)
panic("Cannot create fs_struct SLAB cache");
vm_area_cachep = kmem_cache_create("vm_area_struct",
sizeof(struct vm_area_struct), 0,
SLAB_HWCACHE_ALIGN, NULL, NULL);
763 if(!vm_area_cachep)
panic("vma_init: Cannot alloc vm_area_struct SLAB cache");
mm_cachep = kmem_cache_create("mm_struct",
sizeof(struct mm_struct), 0,
SLAB_HWCACHE_ALIGN, NULL, NULL);
769 if(!mm_cachep)
panic("vma_init: Cannot alloc mm_struct SLAB cache");
}