赞
踩
内核资料直通车:最新Linux内核源码资料文档+视频资料
学习直通车:Linux内核源码/内存调优/文件系统/进程管理/设备驱动/网络协议栈
进程描述task_struct数据结构
- struct task_struct {
- // 进程状态
- volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
- void *stack; // 指向内核栈指针
- atomic_t usage; // 有几个进程使用此结构
- unsigned int flags; /* per process flags, defined below */
- unsigned int ptrace; // ptrace系统调用 实现断点调试 跟踪进程运行
-
- // 条件编译多处理器用到
- #ifdef CONFIG_SMP
- struct llist_node wake_entry;
- int on_cpu;
- unsigned int wakee_flips;
- unsigned long wakee_flip_decay_ts;
- struct task_struct *last_wakee;
-
- int wake_cpu;
- #endif
-
- // 运行队列和进程调度相关字段
- int on_rq;
-
- int prio, static_prio, normal_prio; //关于进程调试
- unsigned int rt_priority; //优先级
-
- // 关于进程
- const struct sched_class *sched_class;
- struct sched_entity se;
- struct sched_rt_entity rt;
- // 组调度(cgroup)时该进程所属的任务组
- #ifdef CONFIG_CGROUP_SCHED
- struct task_group *sched_task_group;
- #endif
- struct sched_dl_entity dl;
-
- #ifdef CONFIG_PREEMPT_NOTIFIERS
- /* list of struct preempt_notifier: */
- struct hlist_head preempt_notifiers;
- #endif
-
- // 块设备I/O的跟踪工具
- #ifdef CONFIG_BLK_DEV_IO_TRACE
- unsigned int btrace_seq;
- #endif
- // 进程调度策略相关的字段
- unsigned int policy;
- int nr_cpus_allowed;
- cpumask_t cpus_allowed;
-
- // RCU同步原语
- #ifdef CONFIG_PREEMPT_RCU
- int rcu_read_lock_nesting;
- union rcu_special rcu_read_unlock_special;
- struct list_head rcu_node_entry;
- struct rcu_node *rcu_blocked_node;
- #endif /* #ifdef CONFIG_PREEMPT_RCU */
- #ifdef CONFIG_TASKS_RCU
- unsigned long rcu_tasks_nvcsw;
- bool rcu_tasks_holdout;
- struct list_head rcu_tasks_holdout_list;
- int rcu_tasks_idle_cpu;
- #endif /* #ifdef CONFIG_TASKS_RCU */
-
- #ifdef CONFIG_SCHED_INFO
- struct sched_info sched_info;
- #endif
- //进程链表架构
- struct list_head tasks;
- #ifdef CONFIG_SMP
- struct plist_node pushable_tasks;
- struct rb_node pushable_dl_tasks;
- #endif
- // 进程管理地址空间,每个进程有独立的地址空间4G
- struct mm_struct *mm, *active_mm;
- /* per-thread vma caching */
- u32 vmacache_seqnum;
- struct vm_area_struct *vmacache[VMACACHE_SIZE];
- #if defined(SPLIT_RSS_COUNTING)
- struct task_rss_stat rss_stat;
- #endif
-
- // 进程状态参数
- /* task state */
- int exit_state;
- int exit_code, exit_signal;
- // 父进程终止时发送给本进程的信号
- int pdeath_signal; /* The signal sent when the parent dies */
- unsigned long jobctl; /* JOBCTL_*, siglock protected */
-
- /* Used for emulating ABI behavior of previous Linux versions */
- unsigned int personality;
-
- /* scheduler bits, serialized by scheduler locks */
- unsigned sched_reset_on_fork:1;
- unsigned sched_contributes_to_load:1;
- unsigned sched_migrated:1;
- unsigned :0; /* force alignment to the next boundary */
-
- /* unserialized, strictly 'current' */
- unsigned in_execve:1; /* bit to tell LSMs we're in execve */
- unsigned in_iowait:1;
- #ifdef CONFIG_MEMCG
- unsigned memcg_may_oom:1;
- #endif
- #ifdef CONFIG_MEMCG_KMEM
- unsigned memcg_kmem_skip_account:1;
- #endif
- #ifdef CONFIG_COMPAT_BRK
- unsigned brk_randomized:1;
- #endif
-
- unsigned long atomic_flags; /* Flags needing atomic access. */
-
- struct restart_block restart_block;
-
- // 进程ID pid 和线程组ID tgid
- pid_t pid;
- pid_t tgid;
-
- // 防止内核堆栈溢出
- #ifdef CONFIG_CC_STACKPROTECTOR
- /* Canary value for the -fstack-protector gcc feature */
- unsigned long stack_canary;
- #endif
- /*
- * pointers to (original) parent process, youngest child, younger sibling,
- * older sibling, respectively. (p->father can be replaced with
- * p->real_parent->pid)
- */
- // 真正的(原始)父进程
- struct task_struct __rcu *real_parent; /* real parent process */
- // 接收SIGCHLD信号的父进程, wait4()报告的也是它
- struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */
- /*
- * children/sibling forms the list of my natural children
- */
- // 维护子进程链表
- struct list_head children; /* list of my children */
- // 兄弟进程链表
- struct list_head sibling; /* linkage in my parent's children list */
- struct task_struct *group_leader; /* threadgroup leader */
-
- /*
- * ptraced is the list of tasks this task is using ptrace on.
- * This includes both natural children and PTRACE_ATTACH targets.
- * p->ptrace_entry is p's link on the p->parent->ptraced list.
- */
- // ptrace系统调用相关: 本进程正在用ptrace跟踪的进程链表
- struct list_head ptraced;
- struct list_head ptrace_entry;
-
- /* PID/PID hash table linkage. */
- // 散列表的关系
- struct pid_link pids[PIDTYPE_MAX];
- struct list_head thread_group;
- struct list_head thread_node;
- // do_fork()函数
- struct completion *vfork_done; /* for vfork() */
- int __user *set_child_tid; /* CLONE_CHILD_SETTID */
- int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */
- // 描述CPU时间的内容
- // utime 用户态下的执行时间
- // stime 内核态的执行时间
- cputime_t utime, stime, utimescaled, stimescaled;
- cputime_t gtime;
- struct prev_cputime prev_cputime;
- #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
- seqlock_t vtime_seqlock;
- unsigned long long vtime_snap;
- enum {
- VTIME_SLEEPING = 0,
- VTIME_USER,
- VTIME_SYS,
- } vtime_snap_whence;
- #endif
- unsigned long nvcsw, nivcsw; /* context switch counts */
- u64 start_time; /* monotonic time in nsec */
- u64 real_start_time; /* boot based time in nsec */
- /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
- unsigned long min_flt, maj_flt;
-
- struct task_cputime cputime_expires;
- struct list_head cpu_timers[3];
-
- /* process credentials */
- const struct cred __rcu *real_cred; /* objective and real subjective task
- * credentials (COW) */
- const struct cred __rcu *cred; /* effective (overridable) subjective task
- * credentials (COW) */
- char comm[TASK_COMM_LEN]; /* executable name excluding path
- - access with [gs]et_task_comm (which lock
- it with task_lock())
- - initialized normally by setup_new_exec */
- /* file system info */
- struct nameidata *nameidata;
- #ifdef CONFIG_SYSVIPC
- /* ipc stuff */
- struct sysv_sem sysvsem;
- struct sysv_shm sysvshm;
- #endif
- #ifdef CONFIG_DETECT_HUNG_TASK
- /* hung task detection */
- unsigned long last_switch_count;
- #endif
- /* filesystem information */
- struct fs_struct *fs;
- /* open file information */
- struct files_struct *files;
- /* namespaces */
- struct nsproxy *nsproxy;
- /* signal handlers */
- struct signal_struct *signal;
- struct sighand_struct *sighand;
-
- sigset_t blocked, real_blocked;
- sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
- struct sigpending pending;
-
- unsigned long sas_ss_sp;
- size_t sas_ss_size;
-
- struct callback_head *task_works;
-
- struct audit_context *audit_context;
- #ifdef CONFIG_AUDITSYSCALL
- kuid_t loginuid;
- unsigned int sessionid;
- #endif
- struct seccomp seccomp;
-
- /* Thread group tracking */
- u32 parent_exec_id;
- u32 self_exec_id;
- /* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
- * mempolicy */
- spinlock_t alloc_lock;
-
- /* Protection of the PI data structures: */
- raw_spinlock_t pi_lock;
-
- struct wake_q_node wake_q;
-
- #ifdef CONFIG_RT_MUTEXES
- /* PI waiters blocked on a rt_mutex held by this task */
- struct rb_root pi_waiters;
- struct rb_node *pi_waiters_leftmost;
- /* Deadlock detection and priority inheritance handling */
- struct rt_mutex_waiter *pi_blocked_on;
- #endif
-
- #ifdef CONFIG_DEBUG_MUTEXES
- /* mutex deadlock detection */
- struct mutex_waiter *blocked_on;
- #endif
- #ifdef CONFIG_TRACE_IRQFLAGS
- unsigned int irq_events;
- unsigned long hardirq_enable_ip;
- unsigned long hardirq_disable_ip;
- unsigned int hardirq_enable_event;
- unsigned int hardirq_disable_event;
- int hardirqs_enabled;
- int hardirq_context;
- unsigned long softirq_disable_ip;
- unsigned long softirq_enable_ip;
- unsigned int softirq_disable_event;
- unsigned int softirq_enable_event;
- int softirqs_enabled;
- int softirq_context;
- #endif
- #ifdef CONFIG_LOCKDEP
- # define MAX_LOCK_DEPTH 48UL
- u64 curr_chain_key;
- int lockdep_depth;
- unsigned int lockdep_recursion;
- struct held_lock held_locks[MAX_LOCK_DEPTH];
- gfp_t lockdep_reclaim_gfp;
- #endif
-
- /* journalling filesystem info */
- void *journal_info;
-
- /* stacked block device info */
- struct bio_list *bio_list;
-
- #ifdef CONFIG_BLOCK
- /* stack plugging */
- struct blk_plug *plug;
- #endif
-
- /* VM state */
- // 虚拟内存状态参数 内存回收
- struct reclaim_state *reclaim_state;
- // 存放块设备I/O流量信息
- struct backing_dev_info *backing_dev_info;
- // I/O调度器所用的信息
- struct io_context *io_context;
-
- unsigned long ptrace_message;
- siginfo_t *last_siginfo; /* For ptrace use. */
- struct task_io_accounting ioac;
- #if defined(CONFIG_TASK_XACCT)
- u64 acct_rss_mem1; /* accumulated rss usage */
- u64 acct_vm_mem1; /* accumulated virtual memory usage */
- cputime_t acct_timexpd; /* stime + utime since last update */
- #endif
- #ifdef CONFIG_CPUSETS
- nodemask_t mems_allowed; /* Protected by alloc_lock */
- seqcount_t mems_allowed_seq; /* Seqence no to catch updates */
- int cpuset_mem_spread_rotor;
- int cpuset_slab_spread_rotor;
- #endif
- #ifdef CONFIG_CGROUPS
- /* Control Group info protected by css_set_lock */
- struct css_set __rcu *cgroups;
- /* cg_list protected by css_set_lock and tsk->alloc_lock */
- struct list_head cg_list;
- #endif
- #ifdef CONFIG_FUTEX
- struct robust_list_head __user *robust_list;
- #ifdef CONFIG_COMPAT
- struct compat_robust_list_head __user *compat_robust_list;
- #endif
- struct list_head pi_state_list;
- struct futex_pi_state *pi_state_cache;
- #endif
- // 性能事件(perf events)检测工具
- #ifdef CONFIG_PERF_EVENTS
- struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
- struct mutex perf_event_mutex;
- struct list_head perf_event_list;
- #endif
- #ifdef CONFIG_DEBUG_PREEMPT
- unsigned long preempt_disable_ip;
- #endif
- #ifdef CONFIG_NUMA
- struct mempolicy *mempolicy; /* Protected by alloc_lock */
- short il_next;
- short pref_node_fork;
- #endif
- #ifdef CONFIG_NUMA_BALANCING
- int numa_scan_seq;
- unsigned int numa_scan_period;
- unsigned int numa_scan_period_max;
- int numa_preferred_nid;
- unsigned long numa_migrate_retry;
- u64 node_stamp; /* migration stamp */
- u64 last_task_numa_placement;
- u64 last_sum_exec_runtime;
- struct callback_head numa_work;
-
- struct list_head numa_entry;
- struct numa_group *numa_group;
-
- /*
- * numa_faults is an array split into four regions:
- * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer
- * in this precise order.
- *
- * faults_memory: Exponential decaying average of faults on a per-node
- * basis. Scheduling placement decisions are made based on these
- * counts. The values remain static for the duration of a PTE scan.
- * faults_cpu: Track the nodes the process was running on when a NUMA
- * hinting fault was incurred.
- * faults_memory_buffer and faults_cpu_buffer: Record faults per node
- * during the current scan window. When the scan completes, the counts
- * in faults_memory and faults_cpu decay and these values are copied.
- */
- unsigned long *numa_faults;
- unsigned long total_numa_faults;
-
- /*
- * numa_faults_locality tracks if faults recorded during the last
- * scan window were remote/local or failed to migrate. The task scan
- * period is adapted based on the locality of the faults with different
- * weights depending on whether they were shared or private faults
- */
- unsigned long numa_faults_locality[3];
-
- unsigned long numa_pages_migrated;
- #endif /* CONFIG_NUMA_BALANCING */
-
- #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
- struct tlbflush_unmap_batch tlb_ubc;
- #endif
-
- struct rcu_head rcu;
-
- /*
- * cache last used pipe for splice
- */
- struct pipe_inode_info *splice_pipe; //管道
-
- struct page_frag task_frag;
- // 延迟计数
- #ifdef CONFIG_TASK_DELAY_ACCT
- struct task_delay_info *delays;
- #endif
- #ifdef CONFIG_FAULT_INJECTION
- int make_it_fail;
- #endif

进程复制
内核使用了写时复制(Copy-On-Write,COW)技术,以防止在fork执行时将父进程的所有数据复制到子进程。如果不使用该技术,在调用fork时,内核就需要对父进程的每个内存页都为子进程创建一个相同的副本。
问题:父进程修改页z的数据时,父子进程对该页的内存就会发生分离。
只有在不得不复制数据内容时才去复制数据内容,这就是写时复制的核心思想。可以看到,因为修改页z,导致子进程不得不去复制原来的页z,来保证父子进程互不干扰。
内核只为新生成的子进程创建虚拟空间结构,这些结构复制自父进程的虚拟空间结构,但是不为它们分配物理内存,而是与父进程共享物理空间;当父子进程中有更改相应段的行为发生时,再为子进程相应的段分配物理空间。
内核线程是直接由内核本身启动的进程。内核线程实际上是将内核函数委托给独立的进程,与系统中其他进程“并行”执行(实际上,也并行于内核自身的执行)。内核线程经常被称为(内核)守护进程。它们用于执行下列任务。
进程必须用exit系统调用终止。这使得内核有机会将该进程使用的资源释放回系统。见kernel/exit.c------>do_exit。简而言之, 该函数的实现就是将各个引用计数器减1,如果引用计数器归0而没有进程再使用对应的结构,那么将相应的内存区域返还给内存管理模块
- /*
- * do_exit - terminate the current task; this function is not expected
- * to return.
- * @code: exit code; reported via ptrace_event(), stored in
- *        tsk->exit_code and handed to acct_collect().
- *
- * Panics when called from interrupt context or for the task with pid 0.
- * Otherwise it marks the task as exiting, gathers accounting data, calls
- * the exit_*() teardown helpers in a fixed order, notifies interested
- * parties through exit_notify(), and finally sets the state to TASK_DEAD
- * and calls schedule(). The BUG() and the endless loop after schedule()
- * only run if that call unexpectedly comes back.
- */
- void do_exit(long code)
- {
- struct task_struct *tsk = current;
- int group_dead;
- TASKS_RCU(int tasks_rcu_i);
-
- profile_task_exit(tsk);
-
- WARN_ON(blk_needs_flush_plug(tsk));
-
- /* Exiting from interrupt context or as the pid-0 (idle) task is fatal. */
- if (unlikely(in_interrupt()))
- panic("Aiee, killing interrupt handler!");
- if (unlikely(!tsk->pid))
- panic("Attempted to kill the idle task!");
-
- /*
- * If do_exit is called because this process oopsed, it's possible
- * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
- * continuing. Amongst other possible reasons, this is to prevent
- * mm_release()->clear_child_tid() from writing to a user-controlled
- * kernel address.
- */
- set_fs(USER_DS);
-
- ptrace_event(PTRACE_EVENT_EXIT, code);
-
- validate_creds_for_do_exit(tsk);
-
- /*
- * We're taking recursive faults here in do_exit. Safest is to just
- * leave this task alone and wait for reboot.
- */
- if (unlikely(tsk->flags & PF_EXITING)) {
- pr_alert("Fixing recursive fault but reboot is needed!\n");
- /*
- * We can do this unlocked here. The futex code uses
- * this flag just to verify whether the pi state
- * cleanup has been done or not. In the worst case it
- * loops once more. We pretend that the cleanup was
- * done as there is no way to return. Either the
- * OWNER_DIED bit is set by now or we push the blocked
- * task into the wait for ever nirwana as well.
- */
- tsk->flags |= PF_EXITPIDONE;
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule();
- }
-
- exit_signals(tsk); /* sets PF_EXITING */
- /*
- * tsk->flags are checked in the futex code to protect against
- * an exiting task cleaning up the robust pi futexes.
- */
- smp_mb();
- raw_spin_unlock_wait(&tsk->pi_lock);
-
- /* Exiting in atomic context: log it and reset the preempt count. */
- if (unlikely(in_atomic())) {
- pr_info("note: %s[%d] exited with preempt_count %d\n",
- current->comm, task_pid_nr(current),
- preempt_count());
- preempt_count_set(PREEMPT_ENABLED);
- }
-
- /* sync mm's RSS info before statistics gathering */
- if (tsk->mm)
- sync_mm_rss(tsk->mm);
- acct_update_integrals(tsk);
- /* group_dead is set when this decrement takes signal->live to zero. */
- group_dead = atomic_dec_and_test(&tsk->signal->live);
- if (group_dead) {
- hrtimer_cancel(&tsk->signal->real_timer);
- exit_itimers(tsk->signal);
- if (tsk->mm)
- setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
- }
- acct_collect(code, group_dead);
- if (group_dead)
- tty_audit_exit();
- audit_free(tsk);
-
- tsk->exit_code = code;
- taskstats_exit(tsk, group_dead);
-
- exit_mm(tsk);
-
- if (group_dead)
- acct_process();
- trace_sched_process_exit(tsk);
-
- /*
- * Per-subsystem teardown. Going by the helper names these cover SysV
- * semaphores and shared memory, open files, fs info, the controlling
- * tty (only when group_dead), namespaces, pending task work and
- * arch thread state -- NOTE(review): confirm against each helper.
- */
- exit_sem(tsk);
- exit_shm(tsk);
- exit_files(tsk);
- exit_fs(tsk);
- if (group_dead)
- disassociate_ctty(1);
- exit_task_namespaces(tsk);
- exit_task_work(tsk);
- exit_thread();
-
- /*
- * Flush inherited counters to the parent - before the parent
- * gets woken up by child-exit notifications.
- *
- * because of cgroup mode, must be called before cgroup_exit()
- */
- perf_event_exit_task(tsk);
-
- cgroup_exit(tsk);
-
- /*
- * FIXME: do that only when needed, using sched_exit tracepoint
- */
- flush_ptrace_hw_breakpoint(tsk);
-
- /*
- * Enter the tasks_rcu_exit_srcu read-side section; it is left again
- * by the __srcu_read_unlock() after exit_rcu() further down.
- */
- TASKS_RCU(preempt_disable());
- TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu));
- TASKS_RCU(preempt_enable());
- exit_notify(tsk, group_dead);
- proc_exit_connector(tsk);
- #ifdef CONFIG_NUMA
- /* Drop the mempolicy reference and clear the pointer under task_lock(). */
- task_lock(tsk);
- mpol_put(tsk->mempolicy);
- tsk->mempolicy = NULL;
- task_unlock(tsk);
- #endif
- #ifdef CONFIG_FUTEX
- if (unlikely(current->pi_state_cache))
- kfree(current->pi_state_cache);
- #endif
- /*
- * Make sure we are holding no locks:
- */
- debug_check_no_locks_held();
- /*
- * We can do this unlocked here. The futex code uses this flag
- * just to verify whether the pi state cleanup has been done
- * or not. In the worst case it loops once more.
- */
- tsk->flags |= PF_EXITPIDONE;
-
- if (tsk->io_context)
- exit_io_context(tsk);
-
- if (tsk->splice_pipe)
- free_pipe_info(tsk->splice_pipe);
-
- if (tsk->task_frag.page)
- put_page(tsk->task_frag.page);
-
- validate_creds_for_do_exit(tsk);
-
- check_stack_usage();
- /* Preemption is disabled here and not re-enabled before schedule(). */
- preempt_disable();
- if (tsk->nr_dirtied)
- __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
- exit_rcu();
- TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));
-
- /*
- * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
- * when the following two conditions become true.
- * - There is race condition of mmap_sem (It is acquired by
- * exit_mm()), and
- * - SMI occurs before setting TASK_RUNNING.
- * (or hypervisor of virtual machine switches to other guest)
- * As a result, we may become TASK_RUNNING after becoming TASK_DEAD
- *
- * To avoid it, we have to wait for releasing tsk->pi_lock which
- * is held by try_to_wake_up()
- */
- smp_mb();
- raw_spin_unlock_wait(&tsk->pi_lock);
-
- /* causes final put_task_struct in finish_task_switch(). */
- tsk->state = TASK_DEAD;
- tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
- schedule();
- BUG();
- /* Avoid "noreturn function does return". */
- for (;;)
- cpu_relax(); /* For when BUG is null */
- }

本文介绍了进程四要素、进程生命周期,列举了task_struct数据结构的主要内容,系统调用写时复制思想,守护进程,进程退出原理等内容。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。