• 进程:execve加载流程


    接上一篇对 ELF 解析器、解析器填充等内容的分析,本章分析 ELF 可执行程序的加载过程。

    目录


    1. 源码流程

    1.1 execve

    2. 源码结构

    3. 部分结构定义

    4. 扩展函数


    内容


    1. 源码流程

    1.1 execve

      execve 函数用于把一个可执行文件装载到内存中,并以进程为单位运行。在内核空间中,execve 的调用路径是 run_init_process -> kernel_execve(即 kernel_execve 由 run_init_process 调用);而用户空间则通过 SYSCALL_DEFINE3(execve, …) -> do_execve 进入下面的函数:

    /*
     * do_execve - entry helper shown here for the execve path.
     *
     * Wraps the raw user-space argv/envp pointers in struct user_arg_ptr
     * (native pointer variant) and forwards to do_execveat_common() with
     * AT_FDCWD as the directory fd and flags set to 0. The return value of
     * do_execveat_common() is passed straight back to the caller.
     */
    static int do_execve(struct filename *filename,
            const char __user *const __user *__argv,
            const char __user *const __user *__envp)
    {
            struct user_arg_ptr argv = { .ptr.native = __argv };
            struct user_arg_ptr envp = { .ptr.native = __envp };
            return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
            // #define AT_FDCWD   -100   tells openat-style lookups to use the current working directory (./)
    }
    ||
    \/
    /*
     * do_execveat_common - abbreviated excerpt of the common exec path.
     *
     * As shown here it: rejects the exec when the process-count limit is
     * exceeded, allocates a linux_binprm, counts argv/envp, copies the file
     * name and the env/arg strings via copy_string_kernel()/copy_strings(),
     * and finally calls bprm_execve(). On exit it frees the bprm and drops
     * the filename reference with putname().
     *
     * NOTE(review): this is an article excerpt ("..." marks elided code);
     * most "if (retval < 0) goto out_free;" style checks are presumably
     * omitted - confirm against the upstream fs/exec.c.
     */
    static int do_execveat_common(int fd, struct filename *filename,
                                  struct user_arg_ptr argv,
                                  struct user_arg_ptr envp,
                                  int flags)
    {
            struct linux_binprm *bprm;
            int retval;
    		...
    		if ((current->flags & PF_NPROC_EXCEEDED) &&
                is_ucounts_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
            // If the task was already flagged as exceeding the per-user process limit AND
            // the RLIMIT_NPROC ucount is still over the limit, fail with -EAGAIN
            // #define PF_NPROC_EXCEEDED       0x00001000  maximum number of processes exceeded
                    retval = -EAGAIN;
                    goto out_ret;
            }
    
    		current->flags &= ~PF_NPROC_EXCEEDED; // clear the "process limit exceeded" flag
    
    		bprm = alloc_bprm(fd, filename);  // allocate the bprm
    		// bprm holds information about the executable:
    		// its path, the exec arguments, the environment variables, and so on
    
    		retval = count(argv, MAX_ARG_STRINGS);
    		// Counts the argument strings. Per the original note it also validates them,
    		// rejects more than the maximum, and reacts to pending fatal/KILL signals - TODO confirm in count()
    		// #define MAX_ARG_STRINGS 0x7FFFFFFF
            if (retval == 0)
                    pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n",
                                 current->comm, bprm->filename);
    
    		bprm->argc = retval;
    
            retval = count(envp, MAX_ARG_STRINGS); // same check for the environment strings
            // NOTE(review): bprm->envc is read by copy_strings() below but this excerpt never
            // shows it being assigned; presumably the elided code stores this count there - confirm.
    
    		retval = bprm_stack_limits(bprm); // check that the stack limits are not exceeded
    
    		/* Copy the file name (a kernel string) and then the environment/argument strings onto the new process stack */
    		retval = copy_string_kernel(bprm->filename, bprm);
    
    		bprm->exec = bprm->p;
    	
    		retval = copy_strings(bprm->envc, envp, bprm);
    
    		retval = copy_strings(bprm->argc, argv, bprm);
    
    		/* When argv is empty, add an empty string ("") as argv[0] so that
    		confused userspace programs that start processing from argv[1] do not end up walking envp */
    		if (bprm->argc == 0) {
                    retval = copy_string_kernel("", bprm);
                    if (retval < 0)
                            goto out_free;
                    bprm->argc = 1;
            }
    
    		retval = bprm_execve(bprm, fd, filename, flags); // resolve the binary format, then execute it
    out_free:
            free_bprm(bprm);
    
    out_ret:
            putname(filename);
            return retval;
    }
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49
    • 50
    • 51
    • 52
    • 53
    • 54
    • 55
    • 56
    • 57
    • 58
    • 59
    • 60
    • 61
    • 62
    • 63
    • 64
    • 65
    • 66
    • 67
    • 68
    • 69
    • 70
    • 71
    • 72

      bprm_execve分析


    bprm_execve

    /*
     * bprm_execve - first part of the excerpt (the article continues the
     * function body further below).
     *
     * As shown: prepares bprm->cred, marks the task as being in execve,
     * opens the executable, calls sched_exec(), records whether the path
     * will be inaccessible after exec, runs the LSM creds hook and then
     * hands off to exec_binprm().
     *
     * NOTE(review): error handling after each call is elided in this excerpt.
     */
    static int bprm_execve(struct linux_binprm *bprm,
                           int fd, struct filename *filename, int flags)
    {
            struct file *file;
            int retval;
    
            retval = prepare_bprm_creds(bprm); // sets up bprm->cred from the cred of the current task_struct (per the original note)
            // struct cred (credentials) describes a task's security context
    
    		/* Determine how safe it is to execute the proposed program; the caller must hold
    		 ->cred_guard_mutex to protect against PTRACE_ATTACH or seccomp thread-sync */
    		check_unsafe_exec(bprm);
            current->in_execve = 1;
    
            file = do_open_execat(fd, filename, flags); // open the executable file
    
    		sched_exec(); // per the original note: pick the least-loaded CPU to run the exec'ing task on
    
    		bprm->file = file;
    
    		/* Record that a name derived from an O_CLOEXEC fd will be inaccessible after exec.
    		This allows the exec code to choose to fail when the executable is not mapped into the interpreter and an open file descriptor is not passed to the interpreter.
    		That gives a better user experience than letting the interpreter start and then fail immediately when it finds the executable inaccessible */
    		if (bprm->fdpath && get_close_on_exec(fd))
                    bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; // the binary's path name will be inaccessible after exec
                    // #define BINPRM_FLAGS_PATH_INACCESSIBLE_BIT 2 
                    // #define BINPRM_FLAGS_PATH_INACCESSIBLE (1 << BINPRM_FLAGS_PATH_INACCESSIBLE_BIT)
    
    		/* Set the unchanging part of bprm->cred */
    		retval = security_bprm_creds_for_exec(bprm);
    	
    		retval = exec_binprm(bprm); // identify the binary image, fill in audit data, emit trace/ptrace events, etc.
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32

    exec_binprm 的分析见后文“4. 扩展函数”;下面继续看 bprm_execve 函数的剩余部分:

    	    /* Tail of bprm_execve(): execve succeeded, so clear the in-exec markers and finish up */
            current->fs->in_exec = 0;
            current->in_execve = 0;
            rseq_execve(current); // per the original note: resets (nulls) the task's rseq state
            acct_update_integrals(current);
            task_numa_free(current, false);
            return retval;
    ...
    }
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9

    2. 源码结构


    3. 部分结构定义

      cred 任务的安全上下文

    /*
     * struct cred - the security context (credentials) of a task:
     * user/group IDs, capability sets, optional keyrings and LSM data,
     * plus the user/namespace references listed below.
     */
    struct cred {
    	atomic_t	usage;
    #ifdef CONFIG_DEBUG_CREDENTIALS
    	atomic_t	subscribers;	/* number of processes subscribed */
    	void		*put_addr;
    	unsigned	magic;
    #define CRED_MAGIC	0x43736564
    #define CRED_MAGIC_DEAD	0x44656144
    #endif
    	kuid_t		uid;		/* real UID of the task */
    	kgid_t		gid;		/* real GID of the task */
    	kuid_t		suid;		/* saved UID of the task */
    	kgid_t		sgid;		/* saved GID of the task */
    	kuid_t		euid;		/* effective UID of the task */
    	kgid_t		egid;		/* effective GID of the task */
    	kuid_t		fsuid;		/* UID for VFS ops */
    	kgid_t		fsgid;		/* GID for VFS ops */
    	unsigned	securebits;	/* SUID-less security management */
    	kernel_cap_t	cap_inheritable; /* caps our children can inherit */
    	kernel_cap_t	cap_permitted;	/* caps we're permitted */
    	kernel_cap_t	cap_effective;	/* caps we can actually use */
    	kernel_cap_t	cap_bset;	/* capability bounding set */
    	kernel_cap_t	cap_ambient;	/* Ambient capability set */
    #ifdef CONFIG_KEYS
    	unsigned char	jit_keyring;	/* default keyring to attach requested
    					 * keys to */
    	struct key	*session_keyring; /* keyring inherited over fork */
    	struct key	*process_keyring; /* keyring private to this process */
    	struct key	*thread_keyring; /* keyring private to this thread */
    	struct key	*request_key_auth; /* assumed request_key authority */
    #endif
    #ifdef CONFIG_SECURITY
    	void		*security;	/* LSM security */
    #endif
    	struct user_struct *user;	/* real user ID subscription */
    	struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
    	struct ucounts *ucounts;
    	struct group_info *group_info;	/* supplementary groups for euid/fsgid */
    	/* RCU deletion */
    	union {
    		int non_rcu;			/* Can we skip RCU deletion? */
    		struct rcu_head	rcu;		/* RCU deletion hook */
    	};
    } __randomize_layout;
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44

      nameidata 保存文件相关的信息,这是一个临时结构,仅仅用在寻找目标节点(或创建临时文件)的过程

    /*
     * struct nameidata - holds file/path related state. Per the article it
     * is a temporary structure used only while looking up the target node
     * (or creating a temporary file).
     */
    struct nameidata {
    	struct path	path;
    	struct qstr	last;
    	struct path	root;
    	struct inode	*inode; /* path.dentry.d_inode */
    	unsigned int	flags, state;
    	unsigned	seq, m_seq, r_seq;
    	int		last_type;
    	unsigned	depth;
    	int		total_link_count;
    	/* per-link saved state; 'internal' provides EMBEDDED_LEVELS embedded entries */
    	struct saved {
    		struct path link;
    		struct delayed_call done;
    		const char *name;
    		unsigned seq;
    	} *stack, internal[EMBEDDED_LEVELS];
    	struct filename	*name;
    	struct nameidata *saved;
    	unsigned	root_seq;
    	int		dfd;
    	kuid_t		dir_uid;
    	umode_t		dir_mode;
    } __randomize_layout;
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23

      migration_arg 线程迁移到目标cpu

    /*
     * struct migration_arg - argument bundle for migrating a task to a
     * target CPU: the task, the destination CPU, and a pending
     * set-affinity request pointer.
     */
    struct migration_arg {
    	struct task_struct		*task;
    	int				dest_cpu;
    	struct set_affinity_pending	*pending;
    };
    
    • 1
    • 2
    • 3
    • 4
    • 5

      audit_context per-task的审计上下文

    /*
     * struct audit_context - the per-task audit context.
     *
     * Carries the state recorded for one audited operation: which kind of
     * context is in use (syscall / io_uring), the syscall number, arguments
     * and return code, the collected names list, saved task identity, and a
     * union of per-record-type payloads (the execve member stores argc).
     */
    struct audit_context {
    	int		    dummy;	/* must be the first element */
    	enum {
    		AUDIT_CTX_UNUSED,	/* audit_context is currently unused */
    		AUDIT_CTX_SYSCALL,	/* in use by syscall */
    		AUDIT_CTX_URING,	/* in use by io_uring */
    	} context;
    	enum audit_state    state, current_state;
    	unsigned int	    serial;     /* serial number for record */
    	int		    major;      /* syscall number */
    	int		    uring_op;   /* uring operation */
    	struct timespec64   ctime;      /* time of syscall entry */
    	unsigned long	    argv[4];    /* syscall arguments */
    	long		    return_code;/* syscall return code */
    	u64		    prio;
    	int		    return_valid; /* return code is valid */
    	/*
    	 * The names_list is the list of all audit_names collected during this
    	 * syscall.  The first AUDIT_NAMES entries in the names_list will
    	 * actually be from the preallocated_names array for performance
    	 * reasons.  Except during allocation they should never be referenced
    	 * through the preallocated_names array and should only be found/used
    	 * by running the names_list.
    	 */
    	struct audit_names  preallocated_names[AUDIT_NAMES];
    	int		    name_count; /* total records in names_list */
    	struct list_head    names_list;	/* struct audit_names->list anchor */
    	char		    *filterkey;	/* key for rule that triggered record */
    	struct path	    pwd;
    	struct audit_aux_data *aux;
    	struct audit_aux_data *aux_pids;
    	struct sockaddr_storage *sockaddr;
    	size_t sockaddr_len;
    				/* Save things to print about task_struct */
    	pid_t		    pid, ppid;
    	kuid_t		    uid, euid, suid, fsuid;
    	kgid_t		    gid, egid, sgid, fsgid;
    	unsigned long	    personality;
    	int		    arch;
    
    	pid_t		    target_pid;
    	kuid_t		    target_auid;
    	kuid_t		    target_uid;
    	unsigned int	    target_sessionid;
    	u32		    target_sid;
    	char		    target_comm[TASK_COMM_LEN];
    
    	struct audit_tree_refs *trees, *first_trees;
    	struct list_head killed_trees;
    	int tree_count;
    
    	/* record type; presumably selects which union member below is valid - confirm */
    	int type;
    	union {
    		struct {
    			int nargs;
    			long args[6];
    		} socketcall;
    		struct {
    			kuid_t			uid;
    			kgid_t			gid;
    			umode_t			mode;
    			u32			osid;
    			int			has_perm;
    			uid_t			perm_uid;
    			gid_t			perm_gid;
    			umode_t			perm_mode;
    			unsigned long		qbytes;
    		} ipc;
    		struct {
    			mqd_t			mqdes;
    			struct mq_attr		mqstat;
    		} mq_getsetattr;
    		struct {
    			mqd_t			mqdes;
    			int			sigev_signo;
    		} mq_notify;
    		struct {
    			mqd_t			mqdes;
    			size_t			msg_len;
    			unsigned int		msg_prio;
    			struct timespec64	abs_timeout;
    		} mq_sendrecv;
    		struct {
    			int			oflag;
    			umode_t			mode;
    			struct mq_attr		attr;
    		} mq_open;
    		struct {
    			pid_t			pid;
    			struct audit_cap_data	cap;
    		} capset;
    		struct {
    			int			fd;
    			int			flags;
    		} mmap;
    		struct open_how openat2;
    		struct {
    			int			argc;
    		} execve;
    		struct {
    			char			*name;
    		} module;
    		struct {
    			struct audit_ntp_data	ntp_data;
    			struct timespec64	tk_injoffset;
    		} time;
    	};
    	int fds[2];
    	struct audit_proctitle proctitle;
    };
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49
    • 50
    • 51
    • 52
    • 53
    • 54
    • 55
    • 56
    • 57
    • 58
    • 59
    • 60
    • 61
    • 62
    • 63
    • 64
    • 65
    • 66
    • 67
    • 68
    • 69
    • 70
    • 71
    • 72
    • 73
    • 74
    • 75
    • 76
    • 77
    • 78
    • 79
    • 80
    • 81
    • 82
    • 83
    • 84
    • 85
    • 86
    • 87
    • 88
    • 89
    • 90
    • 91
    • 92
    • 93
    • 94
    • 95
    • 96
    • 97
    • 98
    • 99
    • 100
    • 101
    • 102
    • 103
    • 104
    • 105
    • 106
    • 107
    • 108
    • 109
    • 110

      rseq 可重新启动的序列

    /* struct rseq (restartable sequences) is aligned on 4 * 8 bytes to ensure
    	it is always contained within a single cache line.
    	One struct rseq is allowed per thread.
    	One of the main use cases of rseq is obtaining the number of the CPU the
    	current thread is running on, which typically serves as an index into
    	per-CPU data structures */
    
    struct rseq {
    	__u32 cpu_id_start;
    	__u32 cpu_id;
    	__u64 rseq_cs;
    	__u32 flags;
    } __attribute__((aligned(4 * sizeof(__u64))));
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11

    4. 扩展函数

    exec_binprm

      二进制文件解析image,填充审计结构, trace跟踪打印,事件设置等等

    /*
     * exec_binprm - abbreviated excerpt.
     *
     * Repeatedly calls search_binary_handler() until no further interpreter
     * is requested (bprm->interpreter is NULL), swapping bprm->file for the
     * interpreter on each round. Returns -ELOOP when depth exceeds 5, a
     * negative handler error as-is, -ENOEXEC when an executable was already
     * recorded for the interpreter, and 0 after the audit/trace/ptrace/
     * connector notifications on success.
     *
     * NOTE(review): "..." elides the local declarations (depth, ret,
     * old_pid, old_vpid are used below but not declared in this excerpt).
     */
    static int exec_binprm(struct linux_binprm *bprm)
    {
    	...
    	/* This allows 4 levels of binfmt rewrites before failing hard.
    	   NOTE(review): the check below is depth > 5 - confirm how that maps to "4 levels" */
    	for (depth = 0;; depth++) {
                    struct file *exec;
                    if (depth > 5)
                            return -ELOOP;
    
    				/* Cycle through the list of binary format handlers until one recognizes the image */
                    ret = search_binary_handler(bprm);
                    if (ret < 0)
                            return ret;
                    if (!bprm->interpreter)
                            break;
    
                    exec = bprm->file;
                    bprm->file = bprm->interpreter;
                    bprm->interpreter = NULL;
    
                    allow_write_access(exec); // per the original note: increments i_writecount by 1
                    if (unlikely(bprm->have_execfd)) {
                            if (bprm->executable) { // an executable to pass to the interpreter is already recorded
                                    fput(exec);
                                    return -ENOEXEC;
                            }
                            bprm->executable = exec;
                    } else
    						fput(exec);
            }
    
            audit_bprm(bprm); // fill the audit context with the exec parameters; per the original note:
            // context->type = AUDIT_EXECVE; // exec arguments record
            // context->execve.argc = bprm->argc;
    
            trace_sched_process_exec(current, old_pid, bprm);
            // Tracepoint ("instrumentation"): emits trace information via declared types, variables and structures
            // TRACE_EVENT(sched_process_exec ...
            
            ptrace_event(PTRACE_EVENT_EXEC, old_vpid); // report the exec to a tracer (legacy SIGTRAP-style exec report)
            proc_exec_connector(current); // process-events connector notification
            return 0;
    }
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
  • 相关阅读:
    算法的时间复杂度和空间复杂度
    jeecgboot-vue3-AntDesign笔记(九)——treeSelect树形选择组件的使用(异步加载)
    Android开发之打包APK详解
    Oracle中数据迁移的工具
    MySQL(4)
    无线传感器网络(双语)复习
    生成用于目标检测任务的合成图像教程:使用Blender、Python和3D资产
    Unity VR开发教程 OpenXR+XR Interaction Toolkit 2.1.1 (六)手与物品交互(触摸、抓取)
    Java Spring Boot 写 API 接口
    基于Huffman码实现的编码译码系统
  • 原文地址:https://blog.csdn.net/a29562268/article/details/126166938