- 論壇徽章:
- 0
|
創建時間:2007-10-18
文章屬性:原創
文章提交:pr0cess (pr0cess_at_cnbct.org)
Linux 2.6內核進程創建過程分析
/*Kernel version: linux-2.6.22.9
*作者:旋木木
*日期:2007/10/17
*E-mail:xuanmumu@gmail.com
*/
fork的系統調用代碼在linux/arch/i386/kernel/process.c中:
/*
 * fork() system-call entry point (i386).
 *
 * Calls do_fork() with SIGCHLD as the only clone flag, the caller's saved
 * stack pointer (regs.esp) as the child's stack, a zero stack size and no
 * parent/child TID pointers. Returns whatever do_fork() returns.
 */
asmlinkage int sys_fork(struct pt_regs regs)
{
	/* regs arrives by value; do_fork() takes a pointer to it. */
	return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
}
sys_fork系統調用通過do_fork()函數實現,通過對do_fork()函數傳遞不同的clone_flags來實現fork、clone、vfork。
sys_clone和sys_vfork的系統調用代碼如下:
/*
 * clone() system-call entry point (i386).
 *
 * The arguments are taken from the saved registers: ebx holds the clone
 * flags, ecx the new stack pointer, edx the parent TID pointer and edi the
 * child TID pointer. A zero stack pointer means "use the caller's current
 * stack pointer" (regs.esp). Returns whatever do_fork() returns.
 */
asmlinkage int sys_clone(struct pt_regs regs)
{
	unsigned long clone_flags;
	unsigned long newsp;
	int __user *parent_tidptr, *child_tidptr;

	clone_flags = regs.ebx;
	newsp = regs.ecx;
	parent_tidptr = (int __user *)regs.edx;
	child_tidptr = (int __user *)regs.edi;
	/* No stack supplied by the caller: fall back to its own esp. */
	if (!newsp)
		newsp = regs.esp;
	/* regs arrives by value; do_fork() takes a pointer to it. */
	return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
}
/*
 * vfork() system-call entry point (i386).
 *
 * Same as sys_fork() except that CLONE_VFORK and CLONE_VM are added to
 * SIGCHLD in the flags passed to do_fork(). Returns whatever do_fork()
 * returns.
 */
asmlinkage int sys_vfork(struct pt_regs regs)
{
	/* regs arrives by value; do_fork() takes a pointer to it. */
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
}
其中clone_flags在include/linux/sched.h中定義:
/*
 * cloning flags:
 *
 * Bit flags passed to do_fork() as clone_flags. The low byte (CSIGNAL) is
 * a signal number rather than a flag; the remaining bits are independent
 * options that may be OR-ed together.
 */
#define CSIGNAL 0x000000ff /* signal to deliver when the process exits */
#define CLONE_VM 0x00000100 /* parent and child share the address space */
#define CLONE_FS 0x00000200 /* parent and child share filesystem information */
#define CLONE_FILES 0x00000400 /* parent and child share the open files */
#define CLONE_SIGHAND 0x00000800 /* parent and child share signal handling */
#define CLONE_PTRACE 0x00002000 /* keep tracing (debugging) the child */
#define CLONE_VFORK 0x00004000 /* set for vfork(); the parent sleeps */
#define CLONE_PARENT 0x00008000 /* child gets the same parent as the caller */
#define CLONE_THREAD 0x00010000 /* parent and child are in the same thread group */
#define CLONE_NEWNS 0x00020000 /* create a new namespace for the child */
#define CLONE_SYSVSEM 0x00040000 /* parent and child share System V SEM_UNDO */
#define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */
#define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */
#define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */
#define CLONE_DETACHED 0x00400000 /* Unused, ignored */
#define CLONE_UNTRACED 0x00800000 /* the child may not be traced (debugged) */
#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
#define CLONE_STOPPED 0x02000000 /* start the process in the stopped state */
#define CLONE_NEWUTS 0x04000000 /* create a new utsname group */
#define CLONE_NEWIPC 0x08000000 /* create a new IPC */
do_fork()在kernel/fork.c中定義,代碼如下:
/*
 * Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 *
 * Steps: allocate a struct pid, build the child with copy_process(), then
 * either wake the child or leave it in TASK_STOPPED. With CLONE_VFORK the
 * caller blocks on a completion before returning.
 *
 * Returns the new pid number (pid->nr) on success, -EAGAIN when
 * alloc_pid() fails, or the error carried in copy_process()'s ERR_PTR
 * result.
 */
long do_fork(unsigned long clone_flags,
unsigned long stack_start,
struct pt_regs *regs,
unsigned long stack_size,
int __user *parent_tidptr,
int __user *child_tidptr)
{
struct task_struct *p;
int trace = 0;
struct pid *pid = alloc_pid();
long nr;
if (!pid)
return -EAGAIN;
/* Remember the number now; it is the value returned on success. */
nr = pid->nr;
/* If the caller is being ptraced, fork_traceflag() may force CLONE_PTRACE. */
if (unlikely(current->ptrace)) {
trace = fork_traceflag (clone_flags);
if (trace)
clone_flags |= CLONE_PTRACE;
}
p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid);
/*
 * Do this prior waking up the new thread - the thread pointer
 * might get invalid after that point, if the thread exits quickly.
 */
if (!IS_ERR(p)) {
/* On-stack completion the caller waits on below when CLONE_VFORK is set. */
struct completion vfork;
if (clone_flags & CLONE_VFORK) {
p->vfork_done = &vfork;
init_completion(&vfork);
}
if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
/*
 * We'll start up with an immediate SIGSTOP.
 */
sigaddset(&p->pending.signal, SIGSTOP);
set_tsk_thread_flag(p, TIF_SIGPENDING);
}
/* Either hand the child to wake_up_new_task() or mark it stopped. */
if (!(clone_flags & CLONE_STOPPED))
wake_up_new_task(p, clone_flags);
else
p->state = TASK_STOPPED;
/* Report the fork event, with the child's pid number as the message. */
if (unlikely (trace)) {
current->ptrace_message = nr;
ptrace_notify ((trace << 8) | SIGTRAP);
}
if (clone_flags & CLONE_VFORK) {
/* Block until the vfork completion is signalled. */
freezer_do_not_count();
wait_for_completion(&vfork);
freezer_count();
if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) {
current->ptrace_message = nr;
ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
}
}
} else {
/* copy_process() failed: release the pid and return its error code. */
free_pid(pid);
nr = PTR_ERR(p);
}
return nr;
}
do_fork()函數的核心是copy_process()函數,該函數完成了進程創建的絕大部分工作,并且也在fork.c中定義。copy_process()函數較長,逐段往下看:
static struct task_struct *copy_process(unsigned long clone_flags,
unsigned long stack_start,
struct pt_regs *regs,
unsigned long stack_size,
int __user *parent_tidptr,
int __user *child_tidptr,
struct pid *pid)
{
int retval;
struct task_struct *p = NULL;
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
/*
* Thread groups must share signals as well, and detached threads
* can only be started up within the thread group.
*/
if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
return ERR_PTR(-EINVAL);
/*
* Shared signal handlers imply shared VM. By way of the above,
* thread groups also imply shared VM. Blocking this case allows
* for various simplifications in other code.
*/
if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
return ERR_PTR(-EINVAL);
retval = security_task_create(clone_flags);
if (retval)
goto fork_out;
retval = -ENOMEM;
p = dup_task_struct(current);
if (!p)
goto fork_out;
rt_mutex_init_task(p);
#ifdef CONFIG_TRACE_IRQFLAGS
DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
這段代碼首先對傳入的clone_flags進行檢查,接著調用了dup_task_struct()函數,該函數的主要作用是:為子進程創建一個新的內核棧,復制task_struct結構和thread_info結構。這里只是對結構做完整的復制,所以此時子進程的進程描述符跟父進程完全一樣。跟進dup_task_struct()函數看代碼:
/*
 * Allocate a new task_struct and thread_info and fill both with a copy of
 * orig's.
 *
 * Returns the new task_struct, or NULL if either allocation fails (the
 * task_struct is freed again when only the thread_info allocation fails).
 */
static struct task_struct *dup_task_struct(struct task_struct *orig)
{
struct task_struct *tsk;
struct thread_info *ti;
/* NOTE(review): prepare_to_copy() is not shown here; presumably it puts orig into a state that is safe to copy. */
prepare_to_copy(orig);
tsk = alloc_task_struct();
if (!tsk)
return NULL;
ti = alloc_thread_info(tsk);
if (!ti) {
free_task_struct(tsk);
return NULL;
}
/* Whole-structure copy: the child starts as an exact copy of orig. */
*tsk = *orig;
/* Point the copy at its own thread_info instead of orig's. */
tsk->stack = ti;
setup_thread_stack(tsk, orig);
#ifdef CONFIG_CC_STACKPROTECTOR
tsk->stack_canary = get_random_int();
#endif
/* One for us, one for whoever does the "release_task()" (usually parent) */
atomic_set(&tsk->usage,2);
/* Reset fields that must not carry over the values copied from orig. */
atomic_set(&tsk->fs_excl, 0);
#ifdef CONFIG_BLK_DEV_IO_TRACE
tsk->btrace_seq = 0;
#endif
tsk->splice_pipe = NULL;
return tsk;
}
通過alloc_task_struct()函數創建內核棧和task_struct結構空間,alloc_task_struct()函數定義為:
# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
接著分配thread_info結構空間:
ti = alloc_thread_info(tsk);
thread_info結構定義在asm/thread_info.h中:
/*
 * Per-task low-level state. dup_task_struct() allocates one of these for
 * every new task and setup_thread_stack() fills it with a copy of the
 * parent's.
 */
struct thread_info {
struct task_struct *task; /* owning task; set by setup_thread_stack() */
struct exec_domain *exec_domain; /* its ->module is pinned with try_module_get() in copy_process() */
unsigned long flags;
unsigned long status;
__u32 cpu;
__s32 preempt_count;
mm_segment_t addr_limit;
struct restart_block restart_block;
unsigned long previous_esp;
__u8 supervisor_stack[0]; /* zero-length trailing array; NOTE(review): presumably marks where the kernel stack area begins */
};
繼續:
*tsk = *orig;
復制整個task_struct結構,
再調用setup_thread_stack()函數復制thread_info結構:
/*
 * Copy org's thread_info into p's thread_info, then make the copy's task
 * pointer refer to p rather than to org.
 */
static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
{
/* Structure copy of the whole thread_info. */
*task_thread_info(p) = *task_thread_info(org);
/* The copied ->task still pointed at org; fix it up. */
task_thread_info(p)->task = p;
}
其中
task_thread_info(p)->task = p;
thread_info結構中的task成員中存放的是指向當前進程task_struct結構的指針。
回到copy_process()函數,繼續看:
if (atomic_read(&p->user->processes) >=
p->signal-> rlim[RLIMIT_NPROC].rlim_cur) {
if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
p->user != &root_user)
goto bad_fork_free;
}
atomic_inc(&p->user->__count);
atomic_inc(&p->user->processes);
get_group_info(p->group_info);
首先看前面的兩個if。第一個if里面的rlim數組通過task_struct結構的signal成員訪問(p->signal->rlim),它對進程占用的資源數做出限制,rlim[RLIMIT_NPROC]限制了該進程用戶可以擁有的總進程數量。如果當前用戶所擁有的進程數量達到或超過了規定的最大擁有進程數量,在2.4內核中就直接goto bad_fork_free了。第2個if使用了capable()函數來對權限做出檢查,檢查是否有權對指定的資源進行操作,該函數返回0則代表無權操作。該函數的定義在linux/capability.h中,其中包含了與之相對應的權限列表。
在task_struct結構中有一個指針user,該指針指向一個user_struct結構,一個用戶的多個進程可以通過user指針共享該用戶的資源信息,該結構定義在include/linux/sched.h中:
/*
 * Per-user accounting shared by all of one user's tasks through
 * task_struct->user. copy_process() increments __count and processes for
 * each new task.
 */
struct user_struct {
atomic_t __count; /* NOTE(review): the original note called this a counter of the user's processes; the kernel source labels it a reference count on this structure - confirm */
atomic_t processes; /* number of processes owned by the user */
atomic_t files; /* number of files opened by the user */
atomic_t sigpending; /* number of signals pending for the user */
#ifdef CONFIG_INOTIFY_USER
atomic_t inotify_watches; /* How many inotify watches does this user have? */
atomic_t inotify_devs; /* How many inotify devs does this user have opened? */
#endif
/* protected by mq_lock */
unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */
unsigned long locked_shm; /* How many pages of mlocked shm ? */
#ifdef CONFIG_KEYS
struct key *uid_keyring; /* UID specific keyring */
struct key *session_keyring; /* UID's default session keyring */
#endif
/* Hash table maintenance information */
struct list_head uidhash_list;
uid_t uid;
};
既然新創建了一個進程,自然要更新該用戶的user_struct結構,累加相應的計數器,這個工作就由atomic_inc()函數完成。下面引用的atomic_inc()函數定義在include/asm-blackfin/atomic.h中(注意這是blackfin體系結構的版本,其他體系結構有各自的實現):
/*
 * Increment v->counter by one. The increment is bracketed by
 * local_irq_save()/local_irq_restore(), presumably so the read-modify-write
 * cannot be interrupted on the local CPU.
 */
static __inline__ void atomic_inc(volatile atomic_t * v)
{
long flags; /* interrupt state saved by local_irq_save() */
local_irq_save(flags);
v->counter++;
local_irq_restore(flags);
}
函數先用local_irq_save()保存當前的中斷標志,然后對計數器進行累加,最后用local_irq_restore()恢復中斷標志,完成累加計數器的操作。
繼續看copy_process函數的代碼:
if (nr_threads >= max_threads)
goto bad_fork_cleanup_count;
if (!try_module_get(task_thread_info(p)->exec_domain->module))
goto bad_fork_cleanup_count;
if (p->binfmt && !try_module_get(p->binfmt->module))
goto bad_fork_cleanup_put_domain;
p->did_exec = 0;
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
copy_flags(clone_flags, p);
p->pid = pid_nr(pid);
retval = -EFAULT;
if (clone_flags & CLONE_PARENT_SETTID)
if (put_user(p->pid, parent_tidptr))
goto bad_fork_cleanup_delays_binfmt;
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
p->vfork_done = NULL;
spin_lock_init(&p->alloc_lock);
clear_tsk_thread_flag(p, TIF_SIGPENDING);
init_sigpending(&p->pending);
代碼段
if (nr_threads >= max_threads)
goto bad_fork_cleanup_count;
檢查創建的進程是否超過了系統進程總量
if (!try_module_get(task_thread_info(p)->exec_domain->module))
goto bad_fork_cleanup_count;
獲得進程執行域
if (p->binfmt && !try_module_get(p->binfmt->module))
goto bad_fork_cleanup_put_domain;
不同進程所執行的程序的格式也不一樣,系統對不同格式的支持通過動態安裝驅動模塊實現,task_struct結構中有一個指向linux_binfmt結構的指針,獲得進程執行程序映象。
copy_flags(clone_flags, p);
調用copy_flags函數更新task_struct結構中flags成員。表明進程是否擁有超級用戶權限的PF_SUPERPRIV標志被清除,表明進程還沒有exec()的PF_FORKNOEXEC被設置,相關實現代碼也在fork.c中:
/*
 * Compute the child's task flags from the value copied from the parent:
 * clear PF_SUPERPRIV and PF_NOFREEZE, set PF_FORKNOEXEC. Also clears
 * p->ptrace unless CLONE_PTRACE was requested.
 */
static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
{
unsigned long new_flags = p->flags;
new_flags &= ~(PF_SUPERPRIV | PF_NOFREEZE);
new_flags |= PF_FORKNOEXEC;
/* Without CLONE_PTRACE the child does not keep the copied ptrace state. */
if (!(clone_flags & CLONE_PTRACE))
p->ptrace = 0;
p->flags = new_flags;
}
接著p->pid = pid_nr(pid);獲取一個PID
p->vfork_done = NULL;
vfork()在調用copy_process()時,task_struct結構的vfork_done成員被設置為NULL,在回到do_fork()執行時vfork_done會指向一個特殊的地址,這在do_fork中可以清楚的看到。
繼續走下去:
p->utime = cputime_zero;
p->stime = cputime_zero;
p->sched_time = 0;
#ifdef CONFIG_TASK_XACCT
p->rchar = 0; /* I/O counter: bytes read */
p->wchar = 0; /* I/O counter: bytes written */
p->syscr = 0; /* I/O counter: read syscalls */
p->syscw = 0;
…………………..
開始漫長的對子進程task_struct結構的初始化
…………………..
…………………..
…………………..
…………………..
繼續:
p->tgid = p->pid;
if (clone_flags & CLONE_THREAD)
p->tgid = current->tgid;
如果設置了同在一個線程組則繼承TGID。對于普通進程來說TGID和PID相等,對于線程來說,同一線程組內的所有線程的TGID都相等,這使得這些多線程可以通過調用getpid()獲得相同的PID。
又該繼續了-_-….
if ((retval = security_task_alloc(p)))
goto bad_fork_cleanup_policy;
if ((retval = audit_alloc(p)))
goto bad_fork_cleanup_security;
/* copy all the process information */
if ((retval = copy_semundo(clone_flags, p)))
goto bad_fork_cleanup_audit;
if ((retval = copy_files(clone_flags, p)))
goto bad_fork_cleanup_semundo;
if ((retval = copy_fs(clone_flags, p)))
goto bad_fork_cleanup_files;
if ((retval = copy_sighand(clone_flags, p)))
goto bad_fork_cleanup_fs;
if ((retval = copy_signal(clone_flags, p)))
goto bad_fork_cleanup_sighand;
if ((retval = copy_mm(clone_flags, p)))
goto bad_fork_cleanup_signal;
if ((retval = copy_keys(clone_flags, p)))
goto bad_fork_cleanup_mm;
if ((retval = copy_namespaces(clone_flags, p)))
goto bad_fork_cleanup_keys;
retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
if (retval)
goto bad_fork_cleanup_namespaces;
對task_struct結構的初始化完了就該繼續copy其他的資源了,這部分調用的函數較多,基本都是在fork.c中定義的,比如copy_files():
/*
 * Give tsk its open-file table.
 *
 * If current has no files_struct, nothing is done. With CLONE_FILES the
 * existing table is shared by incrementing its reference count; otherwise
 * dup_fd() creates a new table and tsk->files is pointed at it.
 *
 * Returns 0 on success, or the error that dup_fd() stored in `error`
 * when it returns NULL.
 */
static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
{
struct files_struct *oldf, *newf;
int error = 0;
/*
 * A background process may not have any files ...
 */
oldf = current->files;
if (!oldf)
goto out;
/* Sharing requested: just take another reference on the existing table. */
if (clone_flags & CLONE_FILES) {
atomic_inc(&oldf->count);
goto out;
}
/*
 * Note: we may be using current for both targets (See exec.c)
 * This works because we cache current->files (old) as oldf. Don't
 * break this.
 */
tsk->files = NULL;
newf = dup_fd(oldf, &error);
if (!newf)
goto out;
tsk->files = newf;
error = 0;
out:
return error;
}
task_struct結構中有一個指針files指向一個files_struct結構,因為是從當前進程復制到子進程,所以oldf = current->files;
然后
if (clone_flags & CLONE_FILES) {
atomic_inc(&oldf->count);
goto out;
}
如果設置了CLONE_FILES,也就是CLONE_FILES=1,就只是共享,通過調用atomic_inc(這個函數之前說過了)增加共享計數。之前復制整個task_struct結構時,把files指針也復制給子進程了,所以子進程可以通過指針共享files_struct結構。不要忘記fork()函數調用傳遞的clone_flags中這些CLONE_*標志都為0,即不是簡單共享而是全部復制。
接著調用dup_fd函數來進行復制。慶幸的是該函數定義也在fork.c中,不幸的是該函數又是瘋狂調用其他函數…
由于代碼長不全部列舉了,進入dup_fd函數中去:
newf = alloc_files();
調用了alloc_files(),跟進alloc_files()函數:
/*
 * Allocate a files_struct from files_cachep and initialise it: reference
 * count 1, next_fd 0, and an embedded fdtable whose bitmaps and fd array
 * point at the storage embedded in the files_struct itself.
 *
 * Returns the new files_struct, or NULL if the allocation fails.
 */
static struct files_struct *alloc_files(void)
{
struct files_struct *newf;
struct fdtable *fdt;
newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
if (!newf)
goto out;
atomic_set(&newf->count, 1);
spin_lock_init(&newf->file_lock);
newf->next_fd = 0;
/* Use the fdtable embedded in the files_struct as the initial table. */
fdt = &newf->fdtab;
fdt->max_fds = NR_OPEN_DEFAULT;
/* Bitmaps and fd array refer to the members embedded in newf. */
fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
fdt->open_fds = (fd_set *)&newf->open_fds_init;
fdt->fd = &newf->fd_array[0];
INIT_RCU_HEAD(&fdt->rcu);
fdt->next = NULL;
rcu_assign_pointer(newf->fdt, fdt);
out:
return newf;
}
調用kmem_cache_alloc函數來為子進程分配一個files_struct結構,接著設置這個新的files_struct結構的count成員
fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
fdt->open_fds = (fd_set *)&newf->open_fds_init;
fdt->fd = &newf->fd_array[0];
這3個指針分別指向:位圖close_on_exec_init,位圖open_fds_init,數組fd_array[],這3個成員大小都是固定的。
出來后就開始進行copy,把oldf的內容copy到新創建的newf中。
中間繼續復制其他資源,只有當clone_flags為0時才是真正的復制
do_fork()之前調用了dup_task_struct函數分配了2個連續頁面,低端存放task_struct結構,高端作為系統空間堆棧,由copy_thread來完成。該函數復制父進程的系統空間堆棧,堆棧中有完整路線指明父進程通過系統調用進入內核空間的過程,子進程退出時需要按照完整路線返回。
struct pt_regs *regs結構存放著進入內核空間前各寄存器的內容。如果完全復制父進程的系統空間堆棧則無法區分子進程和父進程,所以要對子進程的相關內容進行調整。
struct pt_regs * childregs;
struct task_struct *tsk;
int err;
childregs = task_pt_regs(p);
*childregs = *regs;
childregs->eax = 0;
childregs->esp = esp;
首先將eax設0,作為系統調用結束時的返回值
childregs->esp = esp;
指出進程在用戶態的堆棧地址,該值在fork()中為傳遞進去的regs.esp
p->thread.esp = (unsigned long) childregs;
p->thread.esp0 = (unsigned long) (childregs+1);
p->thread.eip = (unsigned long) ret_from_fork;
savesegment(gs,p->thread.gs);
p指向的task_struct結構中有一個thread成員,它是一個thread_struct結構,里面記錄著進程切換時的堆棧指針,在子進程中也需要進行調整
p->thread.esp = (unsigned long) childregs;
指向子進程的pt_regs結構起始地址
p->thread.esp0 = (unsigned long) (childregs+1);
指向子進程的系統空間棧頂,當進程被調度運行時,內核會將這個值寫入esp0字段,標志該進程在ring0運行時的堆棧地址。
p->thread.eip = (unsigned long) ret_from_fork;
指向當進程下一次被切換運行時的入口處
savesegment(gs,p->thread.gs);
把當前段寄存器gs的值保存在thread.gs中
p->parent_exec_id = p->self_exec_id;
/* ok, now we should be set up.. */
p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
p->pdeath_signal = 0;
p->exit_state = 0;
p->parent_exec_id = p->self_exec_id;
設置子進程的執行域
p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
設置子進程退出時要向父進程發送的信號
最后將子進程連入進程隊列等待被喚醒,再處理其他的一些收尾工作然后返回一個指向子進程的指針。
/*
*暈死了…
*/
回到do_fork()函數中:
if (clone_flags & CLONE_VFORK) {
p->vfork_done = &vfork;
init_completion(&vfork);
}
如果是vfork()調用(設置了CLONE_VFORK),則讓子進程的vfork_done指向完成量vfork并將其初始化
if (!(clone_flags & CLONE_STOPPED))
wake_up_new_task(p, clone_flags);
else
p->state = TASK_STOPPED;
喚醒子進程并開始運行。
至此,一個進程創建就完成了。 |
|