FreeBSD 5内核源代码分析之系统调用过程_freebsd源码入口函数-程序员宅基地

转自：https://www.freebsdchina.org/forum/viewtopic.php?t=11068
by wheelz
--------------------------

注:由于code是BBCode的关键字,在某些地方将程序中的变量code改写为_code

系统调用开始于用户程序，接着到达libc进行参数的包装，然后调用内核提供的机制进入内核。

内核提供的系统调用进入内核的方式有几种，包括lcall $X, y方式和
int 0x80方式。其实现都在sys/i386/i386/exception.s中。

我们看最常见的int 0x80入口。

1，int 0x80中断向量的初始化。
------------------

在i386CPU的初始化过程中，会调用函数init386() /*XXX*/
其中有：

代码:

 
 (sys/i386/i386/machdep.c) 
 ----------------------------------- 
     setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL, 
        GSEL(GCODE_SEL, SEL_KPL)); 
 ----------------------------------- 

在这里设置好int 0x80在中断描述符表(IDT)中对应的表项。

代码:

 
 (sys/i386/include/segments.h) 
 --------------------------------- 
 #define   IDT_SYSCALL   0x80   /* System Call Interrupt Vector (the "int 0x80" entry) */ 
 
 #define   SDT_SYS386TGT   15   /* system 386 trap gate */ 
 
 #define   SEL_UPL   3      /* user priority level */ 
 
 /* 
  * Build a selector value: s shifted left by 3, OR'ed with r. 
  * Note that r is not parenthesized in the expansion, so callers 
  * should pass a simple expression for it. 
  */ 
 #define   GSEL(s,r)   (((s)<<3) | r)         /* a global selector */ 
 
 #define   GCODE_SEL   1   /* Kernel Code Descriptor */ 
 
 #define   SEL_KPL   0      /* kernel priority level */ 
 ---------------------------------- 

代码:

 
 (sys/i386/i386/machdep.c) 
 ----------------------------------- 
 /* 
  * Fill in entry idx of the interrupt descriptor table (idt) so that 
  * it refers to the handler func. 
  * 
  *   idx   - index of the IDT entry to write (ip = idt + idx) 
  *   func  - handler entry point; its address is stored split across 
  *           gd_looffset and gd_hioffset 
  *   typ   - gate type, stored in gd_type 
  *   dpl   - privilege level, stored in gd_dpl 
  *   selec - segment selector, stored in gd_selector 
  * 
  * No value is returned; the descriptor is written in place. 
  */ 
 void 
 setidt(idx, func, typ, dpl, selec) 
    int idx; 
    inthand_t *func; 
    int typ; 
    int dpl; 
    int selec; 
 { 
    struct gate_descriptor *ip; 
 
    ip = idt + idx; 
    /* 
     * The whole address is assigned here; presumably the field width 
     * keeps only the low half -- confirm against struct gate_descriptor. 
     */ 
    ip->gd_looffset = (int)func; 
    ip->gd_selector = selec; 
    ip->gd_stkcpy = 0; 
    ip->gd_xx = 0; 
    ip->gd_type = typ; 
    ip->gd_dpl = dpl; 
    ip->gd_p = 1;   /* presumably the "present" bit -- confirm */ 
    /* Bits 16 and up of the handler address. */ 
    ip->gd_hioffset = ((int)func)>>16 ; 
 } 
 ------------------------------------ 

2，int0x80_syscall
------------------

系统调用的入口是int0x80_syscall，在sys/i386/i386/exception.s中。
它其实是一个包装函数，用汇编写成，其目的是为调用C函数syscall()做准备。

代码:

 
 void 
 syscall(frame) 
    struct trapframe frame; 

由于系统调用最终是要调用syscall()这个函数，
因此需要为它准备一个调用栈，包括参数frame，其类型为struct trapframe

代码:

 
 /* 
  * Exception/Trap Stack Frame 
  * 
  * Layout of the register state saved on the kernel stack on entry. 
  * Fields are listed from the last value pushed to the first, so the 
  * int 0x80 entry stub builds the frame bottom-up: it pushes tf_err, 
  * skips tf_trapno, executes pushal, then pushes %ds, %es and %fs. 
  * The same frame is passed to syscall() as its argument. 
  */ 
 
 struct trapframe { 
    /* Segment registers pushed by the entry stub (%ds, %es, then %fs). */ 
    int   tf_fs; 
    int   tf_es; 
    int   tf_ds; 
    /* General registers stored by pushal. */ 
    int   tf_edi; 
    int   tf_esi; 
    int   tf_ebp; 
    int   tf_isp;   /* the %esp slot written by pushal */ 
    int   tf_ebx; 
    int   tf_edx; 
    int   tf_ecx; 
    int   tf_eax;   /* syscall() reads the system call number from here */ 
    int   tf_trapno;   /* left unset by the int 0x80 stub (subl $4,%esp) */ 
    /* below portion defined in 386 hardware */ 
    /* 
     * For int 0x80 the stub pushes the instruction length (2) into 
     * tf_err; syscall() subtracts it from tf_eip on ERESTART. 
     */ 
    int   tf_err; 
    int   tf_eip; 
    int   tf_cs; 
    int   tf_eflags; 
    /* below only when crossing rings (e.g. user to kernel) */ 
    int   tf_esp; 
    int   tf_ss; 
 }; 

这个trapframe实际上就是保存在核心栈上的用户态寄存器的状态，当从
系统调用返回时，需要从这里恢复寄存器等上下文内容。同时，它又是
函数syscall()的参数，这样在syscall()函数里面就可以方便地操纵返回后
的用户进程上下文状态。

我们来看具体的int0x80_syscall。

代码:

 
 /* 
  * Call gate entry for FreeBSD ELF and Linux/NetBSD syscall (int 0x80) 
  * 
  * Even though the name says 'int0x80', this is actually a TGT (trap gate) 
  * rather then an IGT (interrupt gate).  Thus interrupts are enabled on 
  * entry just as they are for a normal syscall. 
  */ 
    SUPERALIGN_TEXT 
 IDTVEC(int0x80_syscall) 
    pushl   $2         /* sizeof "int 0x80" */ 

对照struct trapframe可知，此句相当于赋值frame.tf_err=2，记录int 0x80指令的长度。
因为系统调用有可能需要重新执行(系统调用返回ERESTART的话内核会自动重新执行)，
此时需要将%eip的值减去int 0x80的指令长度。

代码:

 
    subl   $4,%esp         /* skip over tf_trapno */ 
    pushal 
    pushl   %ds 
    pushl   %es 
    pushl   %fs 

对照struct trapframe又可知，此时syscall(frame)的参数在堆栈上已经构造好。

代码:

 
    mov   $KDSEL,%ax      /* switch to kernel segments */ 
    mov   %ax,%ds 
    mov   %ax,%es 
    mov   $KPSEL,%ax 
    mov   %ax,%fs 

切换到内核数据段，并将%fs设置好，%fs指向一个per cpu的段，内含CPU相关的数据，
比如当前线程的pcb和struct thread指针。

代码:

 
    FAKE_MCOUNT(13*4(%esp)) 
    call   syscall 
    MEXITCOUNT 
    jmp   doreti 

调用syscall()函数。syscall()返回后，
将转到doreti(也在sys/i386/i386/exception.s中)，判断是否可以执行AST，
最后结束整个系统调用。

3，syscall()函数
---------------

我们接着看syscall()函数

代码:

 
 /* 
  *   syscall -   system call request C handler 
  * 
  *   A system call is essentially treated as a trap. 
  */ 
 void 
 syscall(frame) 
    struct trapframe frame; 
 { 
    caddr_t params; 
    struct sysent *callp; 
    struct thread *td = curthread; 
    struct proc *p = td->td_proc; 
    register_t orig_tf_eflags; 
    u_int sticks; 
    int error; 
    int narg; 
    int args[8]; 
    u_int code; 
 
    /* 
     * note: PCPU_LAZY_INC() can only be used if we can afford 
     * occassional inaccuracy in the count. 
     */ 
    PCPU_LAZY_INC(cnt.v_syscall); 
 
 #ifdef DIAGNOSTIC 
    if (ISPL(frame.tf_cs) != SEL_UPL) { 
       mtx_lock(&Giant);   /* try to stabilize the system XXX */ 
       panic("syscall"); 
       /* NOT REACHED */ 
       mtx_unlock(&Giant); 
    } 
 #endif 
 
    sticks = td->td_sticks; 
    td->td_frame = &frame; 
    if (td->td_ucred != p->p_ucred) 
       cred_update_thread(td); 

如果进程的user credential发生了改变，更新线程的相应指针。

代码:

 
    if (p->p_flag & P_SA) 
       thread_user_enter(p, td); 

如果进程的线程模型采用scheduler activation，则需要通知用户态的线程manager
(FIXME)

代码:

 
 (sys/sys/proc.h) 
 #define   P_SA      0x08000   /* Using scheduler activations. */ 

代码:

 
    params = (caddr_t)frame.tf_esp + sizeof(int); 
    code = frame.tf_eax; 

params指向用户传递的系统调用参数。code指示是何种系统调用，后面还有描述。

代码:

 
    orig_tf_eflags = frame.tf_eflags; 
 
    if (p->p_sysent->sv_prepsyscall) { 
       /* 
        * The prep code is MP aware. 
        */ 
       (*p->p_sysent->sv_prepsyscall)(&frame, args, &code, &params); 

如果该进程有自己的系统调用准备函数，则调用之。事实上，所谓的系统调用准备函数，
其作用应该就是对用户传进来的参数进行解释。如果没有准备函数，则内核做缺省处理，如下：

代码:

 
    } else { 
       /* 
        * Need to check if this is a 32 bit or 64 bit syscall. 
        * fuword is MP aware. 
        */ 
       if (code == SYS_syscall) { 
          /* 
           * Code is first argument, followed by actual args. 
           */ 
          code = fuword(params); 
          params += sizeof(int); 
       } else if (code == SYS___syscall) { 
          /* 
           * Like syscall, but code is a quad, so as to maintain 
           * quad alignment for the rest of the arguments. 
           */ 
          code = fuword(params); 
          params += sizeof(quad_t); 
       } 
    } 

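如果该进程没有自己的系统调用准备函数，即缺省情况，则检查%eax中的调用号是否为
SYS_syscall或SYS___syscall这两个间接系统调用之一。若是，则真正的系统调用号是
用户传入的第一个参数，内核用fuword()将其取出，并相应调整指向用户参数的指针。

SYS_syscall对应32位方式：调用号占一个int，params后移sizeof(int)；
SYS___syscall对应64位方式：调用号占一个quad，以保持其后参数的quad对齐，params后移sizeof(quad_t)。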

函数fuword()意为fetch user word，即从用户空间拷贝一个word到内核空间来。其定义在
sys/i386/i386/support.s中，其实现与copyin()类似，我们略过。

此时，具体的系统调用号已经在变量code中了。

代码:

 
     if (p->p_sysent->sv_mask) 
        code &= p->p_sysent->sv_mask; 

对系统调用号做一些调整和限制。

代码:

 
      if ( code >= p->p_sysent->sv_size) 
        callp = &p->p_sysent->sv_table[0]; 
      else 
        callp = &p->p_sysent->sv_table[_code]; 

得到系统调用的函数入口。

代码:

 
    narg = callp->sy_narg & SYF_ARGMASK; 

得到该系统调用的参数个数。

代码:

 
    /* 
     * copyin and the ktrsyscall()/ktrsysret() code is MP-aware 
     */ 
    if (params != NULL && narg != 0) 
       error = copyin(params, (caddr_t)args, 
           (u_int)(narg * sizeof(int))); 
    else 
       error = 0; 

将参数从用户态拷贝到内核态的args中。

代码:

        
 #ifdef KTRACE 
    if (KTRPOINT(td, KTR_SYSCALL)) 
       ktrsyscall(code, narg, args); 
 #endif 
 
    /* 
     * Try to run the syscall without Giant if the syscall 
     * is MP safe. 
     */ 
    if ((callp->sy_narg & SYF_MPSAFE) == 0) 
       mtx_lock(&Giant); 

如果该系统调用不是MP安全的，则获取全局锁。

代码:

 
    if (error == 0) { 
       td->td_retval[0] = 0; 
       td->td_retval[1] = frame.tf_edx; 
 
       STOPEVENT(p, S_SCE, narg); 
 
       PTRACESTOP_SC(p, td, S_PT_SCE); 
 
       error = (*callp->sy_call)(td, args); 
    } 

调用具体的系统调用。
这里，之所以要间接地使用一个系统调用函数表，是因为模拟其他操作系统的
需要。同一个系统调用在不同的操作系统里"系统调用号"是不同的，当运行其他
操作系统的应用程序时，因为其编译结果是用其他操作系统的"系统调用号"，
此时需要转换到相应的FreeBSD的"系统调用号"上来，使用系统调用函数表就可以
方便地做到这一点。

代码:

 
    switch (error) { 
    case 0: 
       frame.tf_eax = td->td_retval[0]; 
       frame.tf_edx = td->td_retval[1]; 
       frame.tf_eflags &= ~PSL_C; 
       break; 

Great，调用成功，设置返回值，并清除carry bit，用户态的libc要根据carry bit
判断系统调用是否成功。

代码:

 
    case ERESTART: 
       /* 
        * Reconstruct pc, assuming lcall $X,y is 7 bytes, 
        * int 0x80 is 2 bytes. We saved this in tf_err. 
        */ 
       frame.tf_eip -= frame.tf_err; 
       break; 

系统调用返回ERESTART，内核要尝试重新执行系统调用，因此需要将返回用户空间后的
%eip后退，具体后退几个字节，跟系统调用的进入方式有关，如果是通过int 0x80进入的，
由于int 0x80指令的长度为两个字节，因此回退2字节，如果是通过lcall $X,y方式进入
内核的，由于lcall $X,y指令的长度为7个字节，因此回退7字节。具体几个字节，在刚进入
时已经压到堆栈上了(前述pushl $2即是)。

代码:

 
    case EJUSTRETURN: 
       break; 
 
    default: 
        if (p->p_sysent->sv_errsize) { 
           if (error >= p->p_sysent->sv_errsize) 
               error = -1;   /* XXX */ 
             else 
               error = p->p_sysent->sv_errtbl[error]; 
       } 
       frame.tf_eax = error; 
       frame.tf_eflags |= PSL_C; 
       break; 
    } 

如果系统调用返回其他错误的话，则在进程的一个错误对应表中转换错误号。
并设置carry bit，以便libc知道。

代码:

 
    /* 
     * Release Giant if we previously set it. 
     */ 
    if ((callp->sy_narg & SYF_MPSAFE) == 0) 
       mtx_unlock(&Giant); 

释放全局锁。

代码:

 
    /* 
     * Traced syscall. 
     */ 
    if ((orig_tf_eflags & PSL_T) && !(orig_tf_eflags & PSL_VM)) { 
       frame.tf_eflags &= ~PSL_T; 
       trapsignal(td, SIGTRAP, 0); 
    } 

处理Traced系统调用。

代码:

 
    /* 
     * Handle reschedule and other end-of-syscall issues 
     */ 
    userret(td, &frame, sticks); 

做一些调度处理等，后面另分析。

代码:

 
 #ifdef KTRACE 
    if (KTRPOINT(td, KTR_SYSRET)) 
       ktrsysret(code, error, td->td_retval[0]); 
 #endif 
 
    /* 
     * This works because errno is findable through the 
     * register set.  If we ever support an emulation where this 
     * is not the case, this code will need to be revisited. 
     */ 
    STOPEVENT(p, S_SCX, code); 
 
    PTRACESTOP_SC(p, td, S_PT_SCX); 
 
 #ifdef DIAGNOSTIC 
    cred_free_thread(td); 
 #endif 
    WITNESS_WARN(WARN_PANIC, NULL, "System call %s returning", 
        (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[_code] : "???"); 
    mtx_assert(&sched_lock, MA_NOTOWNED); 
    mtx_assert(&Giant, MA_NOTOWNED); 
 } 

4, userret()函数
-----------------

简要地看一下userret()函数。

代码:

 
 /* 
  * Define the code needed before returning to user mode, for 
  * trap and syscall. 
  * 
  * MPSAFE 
  */ 
 void 
 userret(td, frame, oticks) 
    struct thread *td; 
    struct trapframe *frame; 
    u_int oticks; 
 { 
    struct proc *p = td->td_proc; 
 
    CTR3(KTR_SYSC, "userret: thread %p (pid %d, %s)", td, p->p_pid, 
             p->p_comm); 
 #ifdef INVARIANTS 
    /* Check that we called signotify() enough. */ 
    PROC_LOCK(p); 
    mtx_lock_spin(&sched_lock); 
    if (SIGPENDING(td) && ((td->td_flags & TDF_NEEDSIGCHK) == 0 || 
        (td->td_flags & TDF_ASTPENDING) == 0)) 
       printf("failed to set signal flags properly for ast()\n"); 
    mtx_unlock_spin(&sched_lock); 
    PROC_UNLOCK(p); 
 #endif 
 
    /* 
     * Let the scheduler adjust our priority etc. 
     */ 
    sched_userret(td); 

调度器处理。

代码:

 
    /* 
     * We need to check to see if we have to exit or wait due to a 
     * single threading requirement or some other STOP condition. 
     * Don't bother doing all the work if the stop bits are not set 
     * at this time.. If we miss it, we miss it.. no big deal. 
     */ 
    if (P_SHOULDSTOP(p)) { 
       PROC_LOCK(p); 
       thread_suspend_check(0);   /* Can suspend or kill */ 
       PROC_UNLOCK(p); 
    } 

是否需要停住？系统的某些时候只允许单个线程运行。

代码:

 
    /* 
     * Do special thread processing, e.g. upcall tweaking and such. 
     */ 
    if (p->p_flag & P_SA) { 
       thread_userret(td, frame); 
    } 

又是scheduler activation的东西，通知用户态的thread manager。
(FIXME)

代码:

 
    /* 
     * Charge system time if profiling. 
     */ 
    if (p->p_flag & P_PROFIL) { 
       quad_t ticks; 
 
       mtx_lock_spin(&sched_lock); 
       ticks = td->td_sticks - oticks; 
       mtx_unlock_spin(&sched_lock); 
       addupc_task(td, TRAPF_PC(frame), (u_int)ticks * psratio); 
    } 
 } 

最后是profiling的东西。

最后进行编辑的是 wheelz on Tue 2004-05-11 09:42:44, 总计第 1 次编辑

本文链接：https://blog.csdn.net/Firas/article/details/10177677

原作者删帖不实内容删帖广告或垃圾文章投诉

智能推荐

使用nginx解决浏览器跨域问题_nginx不停的xhr-程序员宅基地

文章浏览阅读1k次。通过使用ajax方法跨域请求是浏览器所不允许的，浏览器出于安全考虑是禁止的。警告信息如下：不过jQuery对跨域问题也有解决方案，使用jsonp的方式解决，方法如下：$.ajax({ async:false, url: 'http://www.mysite.com/demo.do', // 跨域URL ty..._nginx不停的xhr

在 Oracle 中配置 extproc 以访问 ST_Geometry-程序员宅基地

文章浏览阅读2k次。关于在 Oracle 中配置 extproc 以访问 ST_Geometry，也就是我们所说的使用空间SQL 的方法，官方文档链接如下。http://desktop.arcgis.com/zh-cn/arcmap/latest/manage-data/gdbs-in-oracle/configure-oracle-extproc.htm其实简单总结一下，主要就分为以下几个步骤。..._extproc

Linux C++ gbk转为utf-8_linux c++ gbk->utf8-程序员宅基地

文章浏览阅读1.5w次。linux下没有上面的两个函数，需要使用函数 mbstowcs和wcstombsmbstowcs将多字节编码转换为宽字节编码wcstombs将宽字节编码转换为多字节编码这两个函数，转换过程中受到系统编码类型的影响，需要通过设置来设定转换前和转换后的编码类型。通过函数setlocale进行系统编码的设置。linux下输入命名locale -a查看系统支持的编码_linux c++ gbk->utf8

IMP-00009: 导出文件异常结束-程序员宅基地

文章浏览阅读750次。今天准备从生产库向测试库进行数据导入，结果在imp导入的时候遇到“ IMP-00009:导出文件异常结束” 错误，google一下，发现可能有如下原因导致imp的数据太大，没有写buffer和commit两个数据库字符集不同从低版本exp的dmp文件，向高版本imp导出的dmp文件出错传输dmp文件时，文件损坏解决办法：imp时指定..._imp-00009导出文件异常结束

python程序员需要深入掌握的技能_Python用数据说明程序员需要掌握的技能-程序员宅基地

文章浏览阅读143次。当下是一个大数据的时代，各个行业都离不开数据的支持。因此，网络爬虫就应运而生。网络爬虫当下最为火热的是Python，Python开发爬虫相对简单，而且功能库相当完善，力压众多开发语言。本次教程我们爬取前程无忧的招聘信息来分析Python程序员需要掌握那些编程技术。首先在谷歌浏览器打开前程无忧的首页，按F12打开浏览器的开发者工具。浏览器开发者工具是用于捕捉网站的请求信息，通过分析请求信息可以了解请..._初级python程序员能力要求

Spring @Service生成bean名称的规则（当类的名字是以两个或以上的大写字母开头的话，bean的名字会与类名保持一致）_@service beanname-程序员宅基地

文章浏览阅读7.6k次，点赞2次，收藏6次。@Service标注的bean，类名：ABDemoService查看源码后发现，原来是经过一个特殊处理：当类的名字是以两个或以上的大写字母开头的话，bean的名字会与类名保持一致public class AnnotationBeanNameGenerator implements BeanNameGenerator { private static final String C..._@service beanname

随便推点

二叉树的各种创建方法_二叉树的建立-程序员宅基地

文章浏览阅读6.9w次，点赞73次，收藏463次。1.前序创建#include<stdio.h>#include<string.h>#include<stdlib.h>#include<malloc.h>#include<iostream>#include<stack>#include<queue>using namespace std;typed_二叉树的建立

解决asp.net导出excel时中文文件名乱码_asp.net utf8 导出中文字符乱码-程序员宅基地

文章浏览阅读7.1k次。在Asp.net上使用Excel导出功能，如果文件名出现中文，便会以乱码视之。解决方法： fileName = HttpUtility.UrlEncode(fileName, System.Text.Encoding.UTF8);_asp.net utf8 导出中文字符乱码

笔记-编译原理-实验一-词法分析器设计_对pl/0作以下修改扩充。增加单词-程序员宅基地

文章浏览阅读2.1k次，点赞4次，收藏23次。第一次实验词法分析实验报告设计思想词法分析的主要任务是根据文法的词汇表以及对应约定的编码进行一定的识别，找出文件中所有的合法的单词，并给出一定的信息作为最后的结果，用于后续语法分析程序的使用；本实验针对 PL/0 语言的文法、词汇表编写一个词法分析程序，对于每个单词根据词汇表输出： (单词种类, 单词的值) 二元对。词汇表：种别编码单词符号助记符0beginb..._对pl/0作以下修改扩充。增加单词

android adb shell 权限,android adb shell权限被拒绝-程序员宅基地

文章浏览阅读773次。我在使用adb.exe时遇到了麻烦.我想使用与bash相同的adb.exe shell提示符,所以我决定更改默认的bash二进制文件(当然二进制文件是交叉编译的,一切都很完美)更改bash二进制文件遵循以下顺序> adb remount> adb push bash / system / bin /> adb shell> cd / system / bin> chm..._adb shell mv 权限

投影仪-相机标定_相机-投影仪标定-程序员宅基地

文章浏览阅读6.8k次，点赞12次，收藏125次。1. 单目相机标定引言相机标定已经研究多年，标定的算法可以分为基于摄影测量的标定和自标定。其中，应用最为广泛的还是张正友标定法。这是一种简单灵活、高鲁棒性、低成本的相机标定算法。仅需要一台相机和一块平面标定板构建相机标定系统，在标定过程中，相机拍摄多个角度下（至少两个角度，推荐10~20个角度）的标定板图像（相机和标定板都可以移动），即可对相机的内外参数进行标定。下面介绍张氏标定法（以下也这么称呼）的原理。原理相机模型和单应矩阵相机标定，就是对相机的内外参数进行计算的过程，从而得到物体到图像的投影_相机-投影仪标定

Wayland架构、渲染、硬件支持-程序员宅基地

文章浏览阅读2.2k次。文章目录Wayland 架构Wayland 渲染Wayland的硬件支持简述：　翻译一篇关于和 wayland 有关的技术文章, 其英文标题为Wayland Architecture .Wayland 架构若是想要更好的理解 Wayland 架构及其与 X (X11 or X Window System) 结构；一种很好的方法是将事件从输入设备就开始跟踪, 查看期间所有的屏幕上出现的变化。这就是我们现在对 X 的理解。内核是从一个输入设备中获取一个事件，并通过 evdev 输入_wayland