framework watchdog

问题背景

搞稳定性以来，知道framework有个watchdog，监听主要进程是否卡死。
如果卡住60秒就会杀死所在进程也就是system_server，上层重启。
搜索关键log “WATCHDOG KILLING SYSTEM PROCESS”即可。

那么它到底是如何判定系统卡住的？这里简单看一下watchdog的代码来了解一下。

watchdog

在源码里添加一些log，方便分析：

    /**
     * Watchdog main loop (annotated excerpt with extra Slog tracing; parts are elided as "...").
     *
     * Each iteration schedules every handler checker, waits checkIntervalMillis of uptime,
     * then evaluates the combined checker state: COMPLETED and WAITING go to the next round,
     * the first WAITED_HALF only logs/dumps, and any other state falls through to the kill path.
     */
    private void run() {
        boolean waitedHalf = false;

        while (true) {
            List<HandlerChecker> blockedCheckers = Collections.emptyList();
            String subject = "";
            boolean allowRestart = true;
            int debuggerWasConnected = 0;
            boolean doWaitedHalfDump = false;
            // The value of mWatchdogTimeoutMillis might change while we are executing the loop.
            // We store the current value to use a consistent value for all handlers.
            final long watchdogTimeoutMillis = mWatchdogTimeoutMillis;
            final long checkIntervalMillis = watchdogTimeoutMillis / 2;
            // Observed in the captured log: watchdogTimeoutMillis = 60000
            // Observed in the captured log: checkIntervalMillis = 30000
            Slog.i(TAG, "watchdog run watchdogTimeoutMillis: " + watchdogTimeoutMillis+" checkIntervalMillis: "+checkIntervalMillis);
            final ArrayList<Integer> pids;
            ...
            synchronized (mLock) {
                long timeout = checkIntervalMillis;
                // Observed in the captured log: timeout = 30000
                Slog.d(TAG, "watchdog run timeout 111 : " + timeout);
                // Make sure we (re)spin the checkers that have become idle within
                // this wait-and-check interval
                for (int i=0; i<mHandlerCheckers.size(); i++) {
                    HandlerCheckerAndTimeout hc = mHandlerCheckers.get(i);
                    // We pick the watchdog to apply every time we reschedule the checkers. The
                    // default timeout might have changed since the last run.
                    // scheduleCheckLocked records the start time of the check.
                    hc.checker().scheduleCheckLocked(hc.customTimeoutMillis()
                            .orElse(watchdogTimeoutMillis * Build.HW_TIMEOUT_MULTIPLIER));
                }

                if (debuggerWasConnected > 0) {
                    debuggerWasConnected--;
                }

                // NOTE: We use uptimeMillis() here because we do not want to increment the time we
                // wait while asleep. If the device is asleep then the thing that we are waiting
                // to timeout on is asleep as well and won't have a chance to run, causing a false
                // positive on when to kill things.
                long start = SystemClock.uptimeMillis();
                // start is the uptimeMillis() timestamp taken right before waiting.
                Slog.i(TAG, "watchdog run start: " + start);
                while (timeout > 0) {
                    if (Debug.isDebuggerConnected()) {
                        debuggerWasConnected = 2;
                    }
                    try {
                        // Wait on mLock for up to `timeout` ms (30000 on the first pass).
                        mLock.wait(timeout);
                        // Note: mHandlerCheckers and mMonitorChecker may have changed after waiting
                    } catch (InterruptedException e) {
                        Log.wtf(TAG, e);
                    }
                    if (Debug.isDebuggerConnected()) {
                        debuggerWasConnected = 2;
                    }
                    // Remaining time = checkIntervalMillis minus the uptime elapsed since start.
                    // After a full wait this is about 0 (the captured log shows 0); a value
                    // of 0 or below ends the loop, otherwise we wait again for the remainder.
                    timeout = checkIntervalMillis - (SystemClock.uptimeMillis() - start);
                    Slog.d(TAG, "watchdog run timeout 222 : " + timeout);
                }
                // Combined state of all checkers; the branches below use it to decide
                // whether to keep waiting, dump, or kill.
                final int waitState = evaluateCheckerCompletionLocked();
                Slog.d(TAG, "watchdog run waitState : " + waitState);
                // Every checker completed within the interval: go straight to the next round.
                if (waitState == COMPLETED) {
                    // The monitors have returned; reset
                    waitedHalf = false;
                    continue;
                } else if (waitState == WAITING) {
                    // still waiting but within their configured intervals; back off and recheck
                    ...
                    continue;
                } else if (waitState == WAITED_HALF) {
                    // waitedHalf starts out false and is set to true the first time we get here,
                    // so the half-timeout handling runs once; later WAITED_HALF rounds just
                    // continue. Together with doWaitedHalfDump below this means the first
                    // timeout only logs, and the kill happens only once a checker is overdue.
                    if (!waitedHalf) {
                        Slog.i(TAG, "WAITED_HALF");
                        waitedHalf = true;
                        // We've waited half, but we'd need to do the stack trace dump w/o the lock.
                        blockedCheckers = getCheckersWithStateLocked(WAITED_HALF);
                        subject = describeCheckersLocked(blockedCheckers);
                        pids = new ArrayList<>(mInterestingJavaPids);
                        doWaitedHalfDump = true;
                    } else {
                        continue;
                    }
                } else {
                    // Reaching this branch means we are heading for the kill path below.
                    // something is overdue!
                    blockedCheckers = getCheckersWithStateLocked(OVERDUE);
                    subject = describeCheckersLocked(blockedCheckers);
                    allowRestart = mAllowRestart;
                    pids = new ArrayList<>(mInterestingJavaPids);
                }
            } // END synchronized (mLock)

            // If we got here, that means that the system is most likely hung.
            //
            // First collect stack traces from all threads of the system process.
            //
            // Then, if we reached the full timeout, kill this process so that the system will
            // restart. If we reached half of the timeout, just log some information and continue.
            logWatchog(doWaitedHalfDump, subject, pids);
            // doWaitedHalfDump is true only for the first half-timeout: the logging above still
            // happens, but the kill path below is skipped for this round.
            if (doWaitedHalfDump) {
                // We have waited for only half of the timeout, we continue to wait for the duration
                // of the full timeout before killing the process.
                continue;
            }

            IActivityController controller;
            synchronized (mLock) {
                controller = mController;
            }
            if (controller != null) {
                Slog.i(TAG, "Reporting stuck state to activity controller");
                try {
                    Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
                    // 1 = keep waiting, -1 = kill system
                    int res = controller.systemNotResponding(subject);
                    if (res >= 0) {
                        Slog.i(TAG, "Activity controller requested to coninue to wait");
                        waitedHalf = false;
                        continue;
                    }
                } catch (RemoteException e) {
                }
            }
            // Debugger check: avoids killing the system process while it is being debugged
            // (e.g. stopped at a breakpoint); not taken in normal operation.
            // Only kill the process if the debugger is not attached.
            if (Debug.isDebuggerConnected()) {
                debuggerWasConnected = 2;
            }
            if (debuggerWasConnected >= 2) {
                Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
            } else if (debuggerWasConnected > 0) {
                Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
            } else if (!allowRestart) {
                Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
            } else {
                // This is where the system process actually gets killed.
                Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
                WatchdogDiagnostics.diagnoseCheckers(blockedCheckers);
                Slog.w(TAG, "*** GOODBYE!");
                if (!Build.IS_USER && isCrashLoopFound()
                        && !WatchdogProperties.should_ignore_fatal_count().orElse(false)) {
                    breakCrashLoop();
                }
                Process.killProcess(Process.myPid());
                System.exit(10);
            }

            waitedHalf = false;
        }
    }
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160

整个方法有2个地方需要注意：
一是如何判断系统是否卡住了，二是waitState是如何赋值的。

1、hc.checker().scheduleCheckLocked

/**
 * Arms this checker for one watchdog round.
 *
 * Stores the timeout for this round, merges queued monitors when no check is in flight,
 * and then either marks the checker completed immediately (paused, or no monitors while
 * the target looper is polling) or posts this checker at the front of the handler's queue.
 *
 * @param handlerCheckerTimeoutMillis maximum wait for this checker, stored in mWaitMaxMillis
 */
public void scheduleCheckLocked(long handlerCheckerTimeoutMillis) {
    mWaitMaxMillis = handlerCheckerTimeoutMillis;

    // Queued monitors are only merged while no check is running on the handler,
    // so the list that run() iterates over is never modified underneath it.
    if (mCompleted) {
        mMonitors.addAll(mMonitorQueue);
        mMonitorQueue.clear();
    }

    // Nothing to post when there are no monitors and the looper is already polling
    // (it is evidently not stuck), or when this checker is paused.
    final boolean idleWithoutMonitors =
            mMonitors.isEmpty() && mHandler.getLooper().getQueue().isPolling();
    if (idleWithoutMonitors || mPauseCount > 0) {
        mCompleted = true;
        return;
    }

    // If the previous check is still in flight (mCompleted == false) there is nothing to
    // do: the earlier start time is kept so latency keeps accumulating.
    if (mCompleted) {
        // Mark the check as in progress and remember when it started.
        mCompleted = false;
        mCurrentMonitor = null;
        mStartTimeMillis = SystemClock.uptimeMillis();
        // Captured log shows mWaitMaxMillis = 60000 here.
        Slog.i(TAG, "scheduleCheckLocked mWaitMaxMillis: " + mWaitMaxMillis);
        // mStartTimeMillis is an uptimeMillis() timestamp.
        Slog.i(TAG, "scheduleCheckLocked mStartTimeMillis: " + mStartTimeMillis);
        // Post ourselves to the monitored handler; run() performs the actual check.
        mHandler.postAtFrontOfQueue(this);
    }
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37

HandlerChecker.run（由上面的mHandler.postAtFrontOfQueue(this)投递到被检测的线程上执行）

/**
 * Body of the check that scheduleCheckLocked posts to the monitored handler.
 * Calls monitor() on each registered monitor in order, then marks the check completed.
 */
@Override
public void run() {
    // The monitor count is sampled once: new monitors are first parked in mMonitorQueue and
    // only moved into mMonitors by a later schedule while mCompleted is true, i.e. after
    // this method has finished, so mMonitors is stable for the duration of this call.
    final int monitorCount = mMonitors.size();
    int index = 0;
    while (index < monitorCount) {
        // Publish which monitor is about to run, under the lock.
        synchronized (mLock) {
            mCurrentMonitor = mMonitors.get(index);
        }
        Slog.i(TAG, "HandlerChecker mCurrentMonitor: " + mCurrentMonitor);
        mCurrentMonitor.monitor();
        index++;
    }

    // Every monitor returned: flag the check as completed.
    synchronized (mLock) {
        mCompleted = true;
        mCurrentMonitor = null;
    }
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21

检测的log：
从log的线程号可以看出，watchdog的run（tid 1198）和HandlerChecker的run（tid 1191）不在同一个线程。

S022A1B  09-28 18:41:00.033  1132  1198 I Watchdog: watchdog run watchdogTimeoutMillis: 60000 checkIntervalMillis: 30000
S022A1D  09-28 18:41:00.034  1132  1198 D Watchdog: watchdog run timeout 111 : 30000
S022A1E  09-28 18:41:00.034  1132  1198 I Watchdog: scheduleCheckLocked mWaitMaxMillis: 60000
S022A1F  09-28 18:41:00.035  1132  1198 I Watchdog: scheduleCheckLocked mStartTimeMillis: 4496639
S022A20  09-28 18:41:00.036  1132  1198 I Watchdog: watchdog run start: 4496641
S022A21  09-28 18:41:00.036  1132  1191 I Watchdog: HandlerChecker mCurrentMonitor: com.android.server.Watchdog$BinderThreadMonitor@75334b9
S022A22  09-28 18:41:00.037  1132  1191 I Watchdog: HandlerChecker mCurrentMonitor: com.android.server.am.UnisocActivityManagerServiceImpl@7dfd5fd
S022A24  09-28 18:41:00.038  1132  1191 I Watchdog: HandlerChecker mCurrentMonitor: com.android.server.power.PowerManagerService@877361c
S022A25  09-28 18:41:00.038  1132  1191 I Watchdog: HandlerChecker mCurrentMonitor: com.android.server.wm.UnisocWindowManagerService@ede477e
S022A26  09-28 18:41:00.039  1132  1191 I Watchdog: HandlerChecker mCurrentMonitor: com.android.server.input.InputManagerService@2e6c225
S022A27  09-28 18:41:00.041  1132  1191 I Watchdog: HandlerChecker mCurrentMonitor: com.android.server.StorageManagerService@9061ffa
S022A28  09-28 18:41:00.043  1132  1191 I Watchdog: HandlerChecker mCurrentMonitor: com.android.server.media.MediaSessionService@efcfeeb
S022A29  09-28 18:41:00.043  1132  1191 I Watchdog: HandlerChecker mCurrentMonitor: com.android.server.media.MediaRouterService@c265448
S022A2A  09-28 18:41:00.044  1132  1191 I Watchdog: HandlerChecker mCurrentMonitor: com.android.server.media.projection.MediaProjectionManagerService@1b002e1
S022A2B  09-28 18:41:00.044  1132  1191 I Watchdog: HandlerChecker mCurrentMonitor: com.android.server.am.BatteryStatsService@3d7f806
S022FCD  09-28 18:41:30.037  1132  1198 D Watchdog: watchdog run timeout 222 : 0
S022FCE  09-28 18:41:30.037  1132  1198 D Watchdog: watchdog run waitState : 0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17

monitor里面啥也没有，只是获取一下同步锁再马上释放，看看service的mLock有没有被其它线程长期占用。
如果mLock一直被其它任务持有（例如死锁），monitor()就会一直阻塞拿不到锁，HandlerChecker的run也就无法完成。

frameworks\base\services\core\java\com\android\server\power\PowerManagerService.java

    /**
     * Watchdog monitor callback: acquires mLock and releases it immediately.
     * The method only returns once the lock could be taken, so it blocks for as long as
     * another thread keeps holding mLock.
     */
    public void monitor() {
        // Grab and release lock for watchdog monitor to detect deadlocks.
        synchronized (mLock) {
        }
    }
1
2
3
4
5

2、evaluateCheckerCompletionLocked

    /**
     * Combines the completion states of all registered handler checkers.
     *
     * @return the largest state value reported by any checker, or COMPLETED when no
     *         checker reports a larger one
     */
    private int evaluateCheckerCompletionLocked() {
        int highest = COMPLETED;
        for (HandlerCheckerAndTimeout entry : mHandlerCheckers) {
            final int current = entry.checker().getCompletionStateLocked();
            if (current > highest) {
                highest = current;
            }
        }
        return highest;
    }
/**
 * Reports how far along this checker's current check is.
 *
 * @return COMPLETED when the check has finished; otherwise WAITING, WAITED_HALF or
 *         OVERDUE depending on how the uptime elapsed since mStartTimeMillis compares
 *         with mWaitMaxMillis / 2 and mWaitMaxMillis
 */
public int getCompletionStateLocked() {
    Slog.i(TAG, "getCompletionStateLocked mCompleted: " + mCompleted);
    // run() sets mCompleted once every monitor has returned.
    if (mCompleted) {
        return COMPLETED;
    }

    // Still in flight: classify by how long the check has been running.
    final long elapsed = SystemClock.uptimeMillis() - mStartTimeMillis;
    Slog.i(TAG, "getCompletionStateLocked latency: " + elapsed);
    if (elapsed < mWaitMaxMillis / 2) {
        // Less than half of the allowed time has been used.
        return WAITING;
    }
    if (elapsed < mWaitMaxMillis) {
        // Past the half-way mark but not yet at the full timeout.
        return WAITED_HALF;
    }
    // The full timeout has been reached or exceeded.
    return OVERDUE;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26

相关阅读:
C# 发送邮件
 实验篇(7.2) 10. 扩充物理实验环境 ❀ 远程访问
 数据分析基础之《jupyter notebook工具》
GPT访问跨域如何解决呢？
vscode代码拼写错误检测插件
 Django项目配置
 【深度学习】如果我年少有为，会垃圾分类
 剪贴板劫持--PasteJacker的安装
 丹麦商标申请途径商标注册申请所需文件丹麦商标注册流程
 【Java刷题进阶】基础入门篇⑧
原文地址：https://blog.csdn.net/a396604593/article/details/133390501