续上一篇中分析elf解析器、解析器填充等内容后,本章分析elf可执行程序加载过程。
execve函数用于将一个可执行文件装载到内存中,并以进程为单位运行。在内核空间中,execve的调用路径为run_init_process -> kernel_execve;而用户空间则通过SYSCALL_DEFINE3(execve, …) -> do_execve进入下面的函数:
/*
 * do_execve - wrap the native user-space argv/envp pointers in
 * struct user_arg_ptr and hand off to do_execveat_common().
 *
 * @filename: executable path, passed through unchanged
 * @__argv:   user-space argument vector
 * @__envp:   user-space environment vector
 *
 * Returns whatever do_execveat_common() returns.
 */
static int do_execve(struct filename *filename,
const char __user *const __user *__argv,
const char __user *const __user *__envp)
{
struct user_arg_ptr argv = { .ptr.native = __argv };
struct user_arg_ptr envp = { .ptr.native = __envp };
return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
// #define AT_FDCWD -100: tells openat() to use the current working directory (./)
}
||
\/
/*
 * do_execveat_common - (abridged excerpt) build a linux_binprm for the
 * requested program, copy the filename, environment and argument strings
 * into it, then run it through bprm_execve().
 *
 * Returns the result of bprm_execve(), or a negative value from an earlier
 * step (-EAGAIN when the process-count check fails). filename is released
 * with putname() on every path shown here.
 *
 * NOTE(review): "..." marks elided code. In this excerpt most retval
 * assignments are overwritten without being tested, and bprm->envc is read
 * but never assigned - presumably the elided upstream code checks each
 * result and stores the envp count; confirm against fs/exec.c.
 */
static int do_execveat_common(int fd, struct filename *filename,
struct user_arg_ptr argv,
struct user_arg_ptr envp,
int flags)
{
struct linux_binprm *bprm;
int retval;
...
if ((current->flags & PF_NPROC_EXCEEDED) &&
is_ucounts_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
// If the task was already flagged as exceeding the process limit AND its
// ucounts are still over RLIMIT_NPROC, fail with -EAGAIN.
// #define PF_NPROC_EXCEEDED 0x00001000: maximum number of processes exceeded
retval = -EAGAIN;
goto out_ret;
}
current->flags &= ~PF_NPROC_EXCEEDED; // clear the "process count exceeded" flag
bprm = alloc_bprm(fd, filename); // allocate the bprm
// It holds information about the executable:
// its path, arguments, environment variables and so on.
retval = count(argv, MAX_ARG_STRINGS);
// count() (not shown): validates the arguments, checks that their number does
// not exceed the maximum, and watches for pending fatal signals (e.g. KILL).
// #define MAX_ARG_STRINGS 0x7FFFFFFF
if (retval == 0)
pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n",
current->comm, bprm->filename);
bprm->argc = retval;
retval = count(envp, MAX_ARG_STRINGS); // same check for the environment variables
retval = bprm_stack_limits(bprm); // check whether the stack limit would be exceeded
/* Copy the filename and the argument/environment strings onto the new process stack. */
retval = copy_string_kernel(bprm->filename, bprm);
bprm->exec = bprm->p;
retval = copy_strings(bprm->envc, envp, bprm);
retval = copy_strings(bprm->argc, argv, bprm);
/* When argv is empty, add an empty string ("") as argv[0] so that confused
user-space programs that start processing from argv[1] do not end up
walking envp. */
if (bprm->argc == 0) {
retval = copy_string_kernel("", bprm);
if (retval < 0)
goto out_free;
bprm->argc = 1;
}
retval = bprm_execve(bprm, fd, filename, flags); // parse the executable, then execute it
out_free:
free_bprm(bprm);
out_ret:
putname(filename);
return retval;
}
/*
 * bprm_execve - (abridged excerpt) prepare credentials for the new program,
 * open the executable, and run it through exec_binprm(); afterwards clear the
 * in-exec state on current and return retval.
 *
 * NOTE(review): error handling is elided ("..."); in this excerpt the retval
 * of each step is overwritten without being checked.
 */
static int bprm_execve(struct linux_binprm *bprm,
int fd, struct filename *filename, int flags)
{
struct file *file;
int retval;
retval = prepare_bprm_creds(bprm); // set up bprm->cred from the cred of the current task_struct
// struct cred (credentials) describes a task's security context
/* Determine how safe it is to execute the proposed program; the caller must
hold ->cred_guard_mutex to protect against PTRACE_ATTACH or seccomp thread-sync. */
check_unsafe_exec(bprm);
current->in_execve = 1;
file = do_open_execat(fd, filename, flags); // open the executable file
sched_exec(); // pick the least-loaded CPU to run the executable (thread) on
bprm->file = file;
/* Record that a name derived from an O_CLOEXEC fd will be inaccessible after exec.
This lets the exec code choose to fail when the executable is not mapped into the interpreter and an open file descriptor is not passed to the interpreter.
That gives a better user experience than letting the interpreter start and then fail immediately when it finds the executable inaccessible. */
if (bprm->fdpath && get_close_on_exec(fd))
bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; // the binary's file name will be inaccessible after exec
// #define BINPRM_FLAGS_PATH_INACCESSIBLE_BIT 2
// #define BINPRM_FLAGS_PATH_INACCESSIBLE (1 << BINPRM_FLAGS_PATH_INACCESSIBLE_BIT)
/* Set the unchanging part of bprm->cred */
retval = security_bprm_creds_for_exec(bprm);
retval = exec_binprm(bprm); // parse the binary image, fill in the audit structure, emit trace output, set up events, etc.
exec_binprm分析,继续看bprm_execve函数:
/* execve succeeded */
current->fs->in_exec = 0;
current->in_execve = 0;
rseq_execve(current); // reset the task's rseq state
acct_update_integrals(current);
task_numa_free(current, false);
return retval;
...
}
cred 任务的安全上下文
/*
 * struct cred - the security context (credentials) of a task: user/group
 * identities, capability sets, optional keyrings and LSM data, plus the
 * user, user namespace and supplementary-group references.
 */
struct cred {
atomic_t usage;
#ifdef CONFIG_DEBUG_CREDENTIALS
atomic_t subscribers; /* number of processes subscribed */
void *put_addr;
unsigned magic;
#define CRED_MAGIC 0x43736564
#define CRED_MAGIC_DEAD 0x44656144
#endif
kuid_t uid; /* real UID of the task */
kgid_t gid; /* real GID of the task */
kuid_t suid; /* saved UID of the task */
kgid_t sgid; /* saved GID of the task */
kuid_t euid; /* effective UID of the task */
kgid_t egid; /* effective GID of the task */
kuid_t fsuid; /* UID for VFS ops */
kgid_t fsgid; /* GID for VFS ops */
unsigned securebits; /* SUID-less security management */
kernel_cap_t cap_inheritable; /* caps our children can inherit */
kernel_cap_t cap_permitted; /* caps we're permitted */
kernel_cap_t cap_effective; /* caps we can actually use */
kernel_cap_t cap_bset; /* capability bounding set */
kernel_cap_t cap_ambient; /* Ambient capability set */
#ifdef CONFIG_KEYS
unsigned char jit_keyring; /* default keyring to attach requested
* keys to */
struct key *session_keyring; /* keyring inherited over fork */
struct key *process_keyring; /* keyring private to this process */
struct key *thread_keyring; /* keyring private to this thread */
struct key *request_key_auth; /* assumed request_key authority */
#endif
#ifdef CONFIG_SECURITY
void *security; /* LSM security */
#endif
struct user_struct *user; /* real user ID subscription */
struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
struct ucounts *ucounts;
struct group_info *group_info; /* supplementary groups for euid/fsgid */
/* RCU deletion */
union {
int non_rcu; /* Can we skip RCU deletion? */
struct rcu_head rcu; /* RCU deletion hook */
};
} __randomize_layout;
nameidata 保存路径查找过程中与文件相关的信息,这是一个临时结构,仅仅用在寻找目标节点(或创建临时文件)的过程中
/*
 * struct nameidata - temporary state kept while looking up a target node
 * (or creating a temporary file): the current path, last component, root,
 * a stack of saved links, and the filename being resolved.
 */
struct nameidata {
struct path path;
struct qstr last;
struct path root;
struct inode *inode; /* path.dentry.d_inode */
unsigned int flags, state;
unsigned seq, m_seq, r_seq;
int last_type;
unsigned depth;
int total_link_count;
struct saved {
struct path link;
struct delayed_call done;
const char *name;
unsigned seq;
} *stack, internal[EMBEDDED_LEVELS];
struct filename *name;
struct nameidata *saved;
unsigned root_seq;
int dfd;
kuid_t dir_uid;
umode_t dir_mode;
} __randomize_layout;
migration_arg 线程迁移到目标cpu
/*
 * struct migration_arg - argument bundle for migrating a thread to a
 * target CPU.
 */
struct migration_arg {
struct task_struct *task; /* presumably the task being migrated - confirm against caller */
int dest_cpu; /* presumably the destination CPU number - confirm against caller */
struct set_affinity_pending *pending;
};
audit_context per-task的审计上下文
/*
 * struct audit_context - per-task audit context: records the syscall (or
 * io_uring operation) in progress, the names collected during it, identity
 * information to print about the task, and a type-specific payload in the
 * anonymous union selected by 'type'.
 */
struct audit_context {
int dummy; /* must be the first element */
enum {
AUDIT_CTX_UNUSED, /* audit_context is currently unused */
AUDIT_CTX_SYSCALL, /* in use by syscall */
AUDIT_CTX_URING, /* in use by io_uring */
} context;
enum audit_state state, current_state;
unsigned int serial; /* serial number for record */
int major; /* syscall number */
int uring_op; /* uring operation */
struct timespec64 ctime; /* time of syscall entry */
unsigned long argv[4]; /* syscall arguments */
long return_code;/* syscall return code */
u64 prio;
int return_valid; /* return code is valid */
/*
* The names_list is the list of all audit_names collected during this
* syscall. The first AUDIT_NAMES entries in the names_list will
* actually be from the preallocated_names array for performance
* reasons. Except during allocation they should never be referenced
* through the preallocated_names array and should only be found/used
* by running the names_list.
*/
struct audit_names preallocated_names[AUDIT_NAMES];
int name_count; /* total records in names_list */
struct list_head names_list; /* struct audit_names->list anchor */
char *filterkey; /* key for rule that triggered record */
struct path pwd;
struct audit_aux_data *aux;
struct audit_aux_data *aux_pids;
struct sockaddr_storage *sockaddr;
size_t sockaddr_len;
/* Save things to print about task_struct */
pid_t pid, ppid;
kuid_t uid, euid, suid, fsuid;
kgid_t gid, egid, sgid, fsgid;
unsigned long personality;
int arch;
pid_t target_pid;
kuid_t target_auid;
kuid_t target_uid;
unsigned int target_sessionid;
u32 target_sid;
char target_comm[TASK_COMM_LEN];
struct audit_tree_refs *trees, *first_trees;
struct list_head killed_trees;
int tree_count;
int type;
union {
struct {
int nargs;
long args[6];
} socketcall;
struct {
kuid_t uid;
kgid_t gid;
umode_t mode;
u32 osid;
int has_perm;
uid_t perm_uid;
gid_t perm_gid;
umode_t perm_mode;
unsigned long qbytes;
} ipc;
struct {
mqd_t mqdes;
struct mq_attr mqstat;
} mq_getsetattr;
struct {
mqd_t mqdes;
int sigev_signo;
} mq_notify;
struct {
mqd_t mqdes;
size_t msg_len;
unsigned int msg_prio;
struct timespec64 abs_timeout;
} mq_sendrecv;
struct {
int oflag;
umode_t mode;
struct mq_attr attr;
} mq_open;
struct {
pid_t pid;
struct audit_cap_data cap;
} capset;
struct {
int fd;
int flags;
} mmap;
struct open_how openat2;
struct {
int argc;
} execve;
struct {
char *name;
} module;
struct {
struct audit_ntp_data ntp_data;
struct timespec64 tk_injoffset;
} time;
};
int fds[2];
struct audit_proctitle proctitle;
};
rseq 可重新启动的序列
/* struct rseq (restartable sequences) is aligned on 4 * 8 bytes to ensure it
is always contained within a single cache line.
A single struct rseq per thread is allowed.
One of the main uses of rseq is obtaining the number of the CPU the current
thread is running on, which is typically also the index into per-CPU data structures. */
struct rseq {
__u32 cpu_id_start;
__u32 cpu_id;
__u64 rseq_cs;
__u32 flags;
} __attribute__((aligned(4 * sizeof(__u64))));
exec_binprm 解析二进制文件image,填充审计结构,trace跟踪打印,事件设置等等
/*
 * exec_binprm - (abridged excerpt) repeatedly run search_binary_handler()
 * until a handler accepts the image without requesting an interpreter, then
 * emit the audit, trace, ptrace and proc-connector notifications.
 *
 * Returns 0 on success, -ELOOP when depth exceeds 5, -ENOEXEC when an
 * executable to pass to the interpreter is already set, or the negative
 * value returned by search_binary_handler().
 *
 * NOTE(review): the declarations of depth, ret, old_pid and old_vpid are in
 * the elided "..." part.
 */
static int exec_binprm(struct linux_binprm *bprm)
{
...
/* This allows 4 levels of binfmt rewrites before failing hard. */
for (depth = 0;; depth++) {
struct file *exec;
if (depth > 5)
return -ELOOP;
/* Walk the list of binary-format handlers until one recognizes the image. */
ret = search_binary_handler(bprm);
if (ret < 0)
return ret;
if (!bprm->interpreter)
break;
exec = bprm->file;
bprm->file = bprm->interpreter;
bprm->interpreter = NULL;
allow_write_access(exec); // increments i_writecount by 1
if (unlikely(bprm->have_execfd)) {
if (bprm->executable) { // an executable to pass to the interpreter is already set
fput(exec);
return -ENOEXEC;
}
bprm->executable = exec;
} else
fput(exec);
}
audit_bprm(bprm); // fill in the audit structure:
// context->type = AUDIT_EXECVE; // execve arguments
// context->execve.argc = bprm->argc;
trace_sched_process_exec(current, old_pid, bprm);
// Tracepoints ("instrumentation") print trace information; they are built by
// defining types, variables and structures and emitting the relevant data.
// TRACE_EVENT(sched_process_exec ...
ptrace_event(PTRACE_EVENT_EXEC, old_vpid); // report the exec to a tracer (legacy SIGTRAP-based exec report)
proc_exec_connector(current); // process events connector
return 0;
}