Kernel version surveyed

linux-4.2(mainline)

Where do you read from?

The kernel source is really big. At first, I was at a loss as to where to start reading.

I thought about where to start, and I felt that the reading would branch out more naturally if I entered from fork or exec rather than from boot. In other words, that is roughly where I wanted to start reading. So I decided to begin by finding out what the system call entry looks like. Searching the net, I found the following site.

- [internal22-49-System Call Entry - Linux Kernel Documents Wiki - Linux Kernel Documents - SYSTEM](https://en.osdn.jp/projects/linux-kernel-docs/wiki/internal22-49-%E3%82%B7%E3%82%B9%E3%83%86%E3%83%A0%E3%82%B3%E3%83%BC%E3%83%AB%E3%82%A8%E3%83%B3%E3%83%88%E3%83%AA)

The system call entry table appears to be sys_call_table. Its definition is as follows.


#undef __SYSCALL
#define __SYSCALL(nr, call) [nr] = (call),
void *sys_call_table[NR_syscalls] = {
        [0 ... NR_syscalls-1] = sys_ni_syscall,
#include <asm/unistd.h>
};

The contents of the entry table come from #include <asm/unistd.h>, so let's take a look at that. Before doing so, note that the definition of sys_call_table contains [0 ... NR_syscalls-1] = sys_ni_syscall, and that __SYSCALL(nr, call) is defined as [nr] = (call),. These should be the code that initializes and sets the elements of the array. Let's experiment with a simple piece of sample code.

#include <stdio.h>

#define TBLSIZ  10
int     tbl[TBLSIZ] = {
	[0 ... TBLSIZ-1] = 123,
	[7] = 777,
};
 
int main()
{
	int     i, c = sizeof(tbl) / sizeof(tbl[0]);
	for (i = 0; i < c; i++) {
		printf("tbl[%d]=%d\n", i, tbl[i]);
	}
}

The execution result is as follows.


kou77@ubuntu:~/test$ gcc tes015.c
kou77@ubuntu:~/test$ ./a.out
tbl[0]=123
tbl[1]=123
tbl[2]=123
tbl[3]=123
tbl[4]=123
tbl[5]=123
tbl[6]=123
tbl[7]=777
tbl[8]=123
tbl[9]=123

sys_call_table is an array of void *, but I made the sample an array of int so that the assigned values are easy to see. The result is as I expected: index 7 contains 777, and every other element contains 123.

Going back to the survey, let's look at what #include <asm/unistd.h> pulls in. Each entry can be expected to use the __SYSCALL macro.

If you look for asm/unistd.h, you will find many unistd.h files, most of them under the arch directory.

./arch/unicore32/include/uapi/asm/unistd.h
./arch/powerpc/include/asm/unistd.h
./arch/powerpc/include/uapi/asm/unistd.h
./arch/tile/include/asm/unistd.h
./arch/tile/include/uapi/asm/unistd.h
./arch/nios2/include/uapi/asm/unistd.h
./arch/openrisc/include/uapi/asm/unistd.h
./arch/microblaze/include/asm/unistd.h
./arch/microblaze/include/uapi/asm/unistd.h
./arch/arm/include/asm/unistd.h
./arch/arm/include/uapi/asm/unistd.h
./arch/c6x/include/uapi/asm/unistd.h
./arch/xtensa/include/asm/unistd.h
./arch/xtensa/include/uapi/asm/unistd.h
./arch/parisc/include/asm/unistd.h
./arch/parisc/include/uapi/asm/unistd.h
./arch/mips/include/asm/unistd.h
./arch/mips/include/uapi/asm/unistd.h
./arch/x86/include/asm/unistd.h
./arch/x86/include/uapi/asm/unistd.h
./arch/m32r/include/asm/unistd.h
./arch/m32r/include/uapi/asm/unistd.h
./arch/h8300/include/uapi/asm/unistd.h
./arch/s390/include/asm/unistd.h
./arch/s390/include/uapi/asm/unistd.h
./arch/hexagon/include/uapi/asm/unistd.h
./arch/mn10300/include/asm/unistd.h
./arch/mn10300/include/uapi/asm/unistd.h
./arch/metag/include/asm/unistd.h
./arch/metag/include/uapi/asm/unistd.h
./arch/avr32/include/asm/unistd.h
./arch/avr32/include/uapi/asm/unistd.h
./arch/sparc/include/asm/unistd.h
./arch/sparc/include/uapi/asm/unistd.h
./arch/sh/include/asm/unistd.h
./arch/sh/include/uapi/asm/unistd.h
./arch/blackfin/include/asm/unistd.h
./arch/blackfin/include/uapi/asm/unistd.h
./arch/m68k/include/asm/unistd.h
./arch/m68k/include/uapi/asm/unistd.h
./arch/frv/include/asm/unistd.h
./arch/frv/include/uapi/asm/unistd.h
./arch/score/include/uapi/asm/unistd.h
./arch/arm64/include/asm/unistd.h
./arch/arm64/include/uapi/asm/unistd.h
./arch/cris/include/asm/unistd.h
./arch/cris/include/arch-v32/arch/unistd.h
./arch/cris/include/uapi/asm/unistd.h
./arch/cris/include/arch-v10/arch/unistd.h
./arch/ia64/include/asm/unistd.h
./arch/ia64/include/uapi/asm/unistd.h
./arch/alpha/include/asm/unistd.h
./arch/alpha/include/uapi/asm/unistd.h
./arch/arc/include/uapi/asm/unistd.h
./include/asm-generic/unistd.h
./include/uapi/asm-generic/unistd.h
./include/uapi/linux/unistd.h

Since the files under arch are probably architecture-dependent, I will start from the commonly referenced files. asm/unistd.h supplies the contents of the sys_call_table definition, so it should contain a list of entries written with the __SYSCALL macro. Looking at ./include/asm-generic/unistd.h, we find:

#include <uapi/asm-generic/unistd.h>
#include <linux/export.h>
 
/*
 * These are required system calls, we should
 * invert the logic eventually and let them
 * be selected by default.
 */
#if __BITS_PER_LONG == 32
#define __ARCH_WANT_STAT64
#define __ARCH_WANT_SYS_LLSEEK
#endif

Next come ./include/uapi/asm-generic/unistd.h and ./include/linux/export.h. ./include/uapi/asm-generic/unistd.h is fairly large, as you would expect of the file where the system call entries are defined. ./include/linux/export.h contains macro definitions and the like, with nothing that describes the contents of sys_call_table, so I will set it aside. Here is an excerpt of the code in ./include/uapi/asm-generic/unistd.h.

#include <asm/bitsperlong.h>
 
/*
 * This file contains the system call numbers, based on the
 * layout of the x86-64 architecture, which embeds the
 * pointer to the syscall in the table.
 *
 * As a basic principle, no duplication of functionality
 * should be added, e.g. we don't use lseek when llseek
 * is present. New architectures should use this file
 * and implement the less feature-full calls in user space.
 */
 
#ifndef __SYSCALL
#define __SYSCALL(x, y)
#endif
 
#if __BITS_PER_LONG == 32 || defined(__SYSCALL_COMPAT)
#define __SC_3264(_nr, _32, _64) __SYSCALL(_nr, _32)
#else
#define __SC_3264(_nr, _32, _64) __SYSCALL(_nr, _64)
#endif
 
#ifdef __SYSCALL_COMPAT
#define __SC_COMP(_nr, _sys, _comp) __SYSCALL(_nr, _comp)
#define __SC_COMP_3264(_nr, _32, _64, _comp) __SYSCALL(_nr, _comp)
#else
#define __SC_COMP(_nr, _sys, _comp) __SYSCALL(_nr, _sys)
#define __SC_COMP_3264(_nr, _32, _64, _comp) __SC_3264(_nr, _32, _64)
#endif
/*Omitted on the way ...*/
#define __NR_uselib 1077
__SYSCALL(__NR_uselib, sys_uselib)
#define __NR__sysctl 1078
__SYSCALL(__NR__sysctl, sys_sysctl)
 
#define __NR_fork 1079
#ifdef CONFIG_MMU
__SYSCALL(__NR_fork, sys_fork)
#else
__SYSCALL(__NR_fork, sys_ni_syscall)
#endif /* CONFIG_MMU */
/*The following is omitted ...*/

In the definition of the fork entry, there is an #ifdef on CONFIG_MMU. I don't know what the (make) config will set, but it is unlikely that fork is missing from the system call entries, so on a normally working Linux system CONFIG_MMU is presumably defined.

By the way, the definition of sys_ni_syscall is as follows. ./kernel/sys_ni.c:14: asmlinkage long sys_ni_syscall(void)

/*
 * Non-implemented system calls get redirected here.
 */
asmlinkage long sys_ni_syscall(void)
{
        return -ENOSYS;
}

The above is the function that is called when an entry is not implemented. Let's check a registered function, using the fork entry in the excerpt above as a sample.

Definition of sys_fork

I searched with sys_fork, but I couldn't find anything that seems to be a function definition.

kou77@ubuntu:~/linux-4.2$ find . \( -name \*.c -o -name \*.h \) -exec grep 'sys_fork' {} /dev/null \;
./arch/openrisc/include/asm/syscalls.h:asmlinkage long __sys_fork(void);
./arch/openrisc/include/asm/syscalls.h:#define sys_fork __sys_fork
./arch/mips/kernel/syscall.c:save_static_function(sys_fork);
./arch/x86/um/sys_call_table_64.c:#define stub_fork sys_fork
./arch/x86/include/generated/asm/syscalls_32.h:__SYSCALL_I386(2, sys_fork, stub32_fork)
./arch/sparc/kernel/process_32.c: *       sys_fork invocation and when we reach here
./arch/arm64/include/asm/unistd32.h:__SYSCALL(__NR_fork, sys_fork)
./include/linux/syscalls.h:asmlinkage long sys_fork(void);
./include/uapi/asm-generic/unistd.h:__SYSCALL(__NR_fork, sys_fork)

There are some declarations and references under arch, but no definition of the function body. (They presumably differ per architecture.) Suspecting that the definition was hidden behind a macro and therefore not directly visible to grep, I searched further and found the following. ./kernel/fork.c:SYSCALL_DEFINE0(fork)

The whole of this definition is

#ifdef __ARCH_WANT_SYS_FORK
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
        return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
#else
        /* can not support in nommu mode */
        return -EINVAL;
#endif
}
#endif

The definition of the SYSCALL_DEFINE0 macro is as follows. ./include/linux/syscalls.h:178:#define SYSCALL_DEFINE0(sname) \

#define SYSCALL_DEFINE0(sname)                                  \
        SYSCALL_METADATA(_##sname, 0);                          \
        asmlinkage long sys_##sname(void)

Setting SYSCALL_METADATA aside, the remaining part is asmlinkage long sys_##sname(void). Taking SYSCALL_DEFINE0(fork) as an example, it expands to asmlinkage long sys_fork(void). At last, I have found the definition of the sys_fork function. asmlinkage appears to be an annotation that, when the code is compiled as C++, makes the symbol be emitted so that it can be called and referenced from source compiled as C. The definitions of asmlinkage are as follows.

./tools/lib/lockdep/uinclude/linux/lockdep.h:13:#define asmlinkage
./arch/x86/include/asm/linkage.h:10:#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
./arch/mn10300/include/asm/linkage.h:15:#define asmlinkage
./arch/ia64/include/asm/linkage.h:6:#define asmlinkage CPP_ASMLINKAGE __attribute__((syscall_linkage))
./include/linux/linkage.h:21:#define asmlinkage CPP_ASMLINKAGE

I judged that the definition in ./include/linux/linkage.h is the one generally in effect. For x86, the definition #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) may be the one in effect; I am not sure. regparm appears to control the passing of arguments in registers. (I will check this separately.)

The definition of CPP_ASMLINKAGE is as follows.

#ifdef __cplusplus
#define CPP_ASMLINKAGE extern "C"
#else
#define CPP_ASMLINKAGE
#endif

fork (body) code

Take a quick look at the code on the main body of the fork. The definition of _do_fork is as follows. ./kernel/fork.c:1679:long _do_fork(unsigned long clone_flags,

/*
 *  Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 */
long _do_fork(unsigned long clone_flags,
              unsigned long stack_start,
              unsigned long stack_size,
              int __user *parent_tidptr,
              int __user *child_tidptr,
              unsigned long tls)
{
        struct task_struct *p;
        int trace = 0;
        long nr;
 
        /*
         * Determine whether and which event to report to ptracer.  When
         * called from kernel_thread or CLONE_UNTRACED is explicitly
         * requested, no event is reported; otherwise, report if the event
         * for the type of forking is enabled.
         */
        if (!(clone_flags & CLONE_UNTRACED)) {
                if (clone_flags & CLONE_VFORK)
                        trace = PTRACE_EVENT_VFORK;
                else if ((clone_flags & CSIGNAL) != SIGCHLD)
                        trace = PTRACE_EVENT_CLONE;
                else
                        trace = PTRACE_EVENT_FORK;
 
                if (likely(!ptrace_event_enabled(current, trace)))
                        trace = 0;
        }
 
        p = copy_process(clone_flags, stack_start, stack_size,
                         child_tidptr, NULL, trace, tls);
        /*
         * Do this prior waking up the new thread - the thread pointer
         * might get invalid after that point, if the thread exits quickly.
         */
        if (!IS_ERR(p)) {
                struct completion vfork;
                struct pid *pid;
 
                trace_sched_process_fork(current, p);
 
                pid = get_task_pid(p, PIDTYPE_PID);
                nr = pid_vnr(pid);
 
                if (clone_flags & CLONE_PARENT_SETTID)
                        put_user(nr, parent_tidptr);
 
                if (clone_flags & CLONE_VFORK) {
                        p->vfork_done = &vfork;
                        init_completion(&vfork);
                        get_task_struct(p);
                }
 
                wake_up_new_task(p);
 
                /* forking complete and child started to run, tell ptracer */
                if (unlikely(trace))
                        ptrace_event_pid(trace, pid);
 
                if (clone_flags & CLONE_VFORK) {
                        if (!wait_for_vfork_done(p, &vfork))
                                ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
                }
 
                put_pid(pid);
        } else {
                nr = PTR_ERR(p);
        }
        return nr;
}

This certainly seems to be the main body. Analyzing fork is not the goal of this article, so I won't go any further. fork takes no arguments, but what about functions registered in the system call table that do take arguments?

Let's take a sample of setgid, which seems to be a relatively simple implementation.

/*
 * setgid() is implemented like SysV w/ SAVED_IDS
 *
 * SMP: Same implicit races as above.
 */
SYSCALL_DEFINE1(setgid, gid_t, gid)
{  
/*The following is omitted ...*/

#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)

#define SYSCALL_DEFINEx(x, sname, ...)                          \
        SYSCALL_METADATA(sname, x, __VA_ARGS__)                 \
        __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)

#define __SYSCALL_DEFINEx(x, name, ...)                                 \
        asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))       \
                __attribute__((alias(__stringify(SyS##name))));         \
        static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__));  \
        asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));      \
        asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))       \
        {                                                               \
                long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));  \
                __MAP(x,__SC_TEST,__VA_ARGS__);                         \
                __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));       \
                return ret;                                             \
        }                                                               \
        static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))

This is hard to digest. Put simply, and only as far as I have checked, for setgid the macro expands roughly as follows.

- sys_setgid is defined as another name (an alias) for SyS_setgid. sys_setgid has no body of its own; instead, the body of SyS_setgid is defined.
- SyS_setgid is the target of the alias and does not seem to be referenced from other sources. (Is it weak by default?) The name sys_setgid is the one that can be referenced from other sources.
- SYSC_setgid is called from SyS_setgid, and the body of the SYSC_setgid function is the block that follows SYSCALL_DEFINE1(setgid, gid_t, gid), in the same way that the body of sys_fork follows SYSCALL_DEFINE0(fork) in ./kernel/fork.c. That is roughly how it works.

I haven't looked at it in detail yet, but the way to define __MAP is very interesting. The implementation here is interesting, so I may explain it in another article after understanding the contents a little more.

I looked at system call entries. (linux source analysis)

Kernel version surveyed

Where do you read from?

Definition of sys_fork

fork (body) code