1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.server;
18 
19 import static com.android.server.pm.PackageManagerServiceUtils.logCriticalInfo;
20 
21 import android.content.ContentResolver;
22 import android.content.Context;
23 import android.os.Build;
24 import android.os.Environment;
25 import android.os.FileUtils;
26 import android.os.RecoverySystem;
27 import android.os.SystemClock;
28 import android.os.SystemProperties;
29 import android.os.UserHandle;
30 import android.provider.Settings;
31 import android.text.format.DateUtils;
32 import android.util.ExceptionUtils;
33 import android.util.Log;
34 import android.util.MathUtils;
35 import android.util.Slog;
36 import android.util.SparseArray;
37 import android.util.StatsLog;
38 
39 import com.android.internal.annotations.VisibleForTesting;
40 import com.android.internal.util.ArrayUtils;
41 import com.android.server.am.SettingsToPropertiesMapper;
42 import com.android.server.utils.FlagNamespaceUtils;
43 
44 import java.io.File;
45 import java.util.Arrays;
46 
47 /**
48  * Utilities to help rescue the system from crash loops. Callers are expected to
49  * report boot events and persistent app crashes, and if they happen frequently
50  * enough this class will slowly escalate through several rescue operations
51  * before finally rebooting and prompting the user if they want to wipe data as
52  * a last resort.
53  *
54  * @hide
55  */
56 public class RescueParty {
57     @VisibleForTesting
58     static final String PROP_ENABLE_RESCUE = "persist.sys.enable_rescue";
59     @VisibleForTesting
60     static final int TRIGGER_COUNT = 5;
61     @VisibleForTesting
62     static final String PROP_RESCUE_LEVEL = "sys.rescue_level";
63     @VisibleForTesting
64     static final int LEVEL_NONE = 0;
65     @VisibleForTesting
66     static final int LEVEL_RESET_SETTINGS_UNTRUSTED_DEFAULTS = 1;
67     @VisibleForTesting
68     static final int LEVEL_RESET_SETTINGS_UNTRUSTED_CHANGES = 2;
69     @VisibleForTesting
70     static final int LEVEL_RESET_SETTINGS_TRUSTED_DEFAULTS = 3;
71     @VisibleForTesting
72     static final int LEVEL_FACTORY_RESET = 4;
73     @VisibleForTesting
74     static final String PROP_RESCUE_BOOT_COUNT = "sys.rescue_boot_count";
75     /**
76      * The boot trigger window size must always be greater than Watchdog's deadlock timeout
77      * {@link Watchdog#DEFAULT_TIMEOUT}.
78      */
79     @VisibleForTesting
80     static final long BOOT_TRIGGER_WINDOW_MILLIS = 600 * DateUtils.SECOND_IN_MILLIS;
81     @VisibleForTesting
82     static final long PERSISTENT_APP_CRASH_TRIGGER_WINDOW_MILLIS = 30 * DateUtils.SECOND_IN_MILLIS;
83     @VisibleForTesting
84     static final String TAG = "RescueParty";
85 
86     private static final String PROP_DISABLE_RESCUE = "persist.sys.disable_rescue";
87     private static final String PROP_RESCUE_BOOT_START = "sys.rescue_boot_start";
88     private static final String PROP_VIRTUAL_DEVICE = "ro.hardware.virtual_device";
89 
90     /** Threshold for boot loops */
91     private static final Threshold sBoot = new BootThreshold();
92     /** Threshold for app crash loops */
93     private static SparseArray<Threshold> sApps = new SparseArray<>();
94 
isDisabled()95     private static boolean isDisabled() {
96         // Check if we're explicitly enabled for testing
97         if (SystemProperties.getBoolean(PROP_ENABLE_RESCUE, false)) {
98             return false;
99         }
100 
101         // We're disabled on all engineering devices
102         if (Build.IS_ENG) {
103             Slog.v(TAG, "Disabled because of eng build");
104             return true;
105         }
106 
107         // We're disabled on userdebug devices connected over USB, since that's
108         // a decent signal that someone is actively trying to debug the device,
109         // or that it's in a lab environment.
110         if (Build.IS_USERDEBUG && isUsbActive()) {
111             Slog.v(TAG, "Disabled because of active USB connection");
112             return true;
113         }
114 
115         // One last-ditch check
116         if (SystemProperties.getBoolean(PROP_DISABLE_RESCUE, false)) {
117             Slog.v(TAG, "Disabled because of manual property");
118             return true;
119         }
120 
121         return false;
122     }
123 
124     /**
125      * Take note of a boot event. If we notice too many of these events
126      * happening in rapid succession, we'll send out a rescue party.
127      */
noteBoot(Context context)128     public static void noteBoot(Context context) {
129         if (isDisabled()) return;
130         if (sBoot.incrementAndTest()) {
131             sBoot.reset();
132             incrementRescueLevel(sBoot.uid);
133             executeRescueLevel(context);
134         }
135     }
136 
137     /**
138      * Take note of a persistent app or apex module crash. If we notice too many of these
139      * events happening in rapid succession, we'll send out a rescue party.
140      */
noteAppCrash(Context context, int uid)141     public static void noteAppCrash(Context context, int uid) {
142         if (isDisabled()) return;
143         Threshold t = sApps.get(uid);
144         if (t == null) {
145             t = new AppThreshold(uid);
146             sApps.put(uid, t);
147         }
148         if (t.incrementAndTest()) {
149             t.reset();
150             incrementRescueLevel(t.uid);
151             executeRescueLevel(context);
152         }
153     }
154 
155     /**
156      * Check if we're currently attempting to reboot for a factory reset.
157      */
isAttemptingFactoryReset()158     public static boolean isAttemptingFactoryReset() {
159         return SystemProperties.getInt(PROP_RESCUE_LEVEL, LEVEL_NONE) == LEVEL_FACTORY_RESET;
160     }
161 
162     /**
163      * Called when {@code SettingsProvider} has been published, which is a good
164      * opportunity to reset any settings depending on our rescue level.
165      */
onSettingsProviderPublished(Context context)166     public static void onSettingsProviderPublished(Context context) {
167         handleNativeRescuePartyResets();
168         executeRescueLevel(context);
169     }
170 
171     @VisibleForTesting
resetAllThresholds()172     static void resetAllThresholds() {
173         sBoot.reset();
174 
175         for (int i = 0; i < sApps.size(); i++) {
176             Threshold appThreshold = sApps.get(sApps.keyAt(i));
177             appThreshold.reset();
178         }
179     }
180 
181     @VisibleForTesting
getElapsedRealtime()182     static long getElapsedRealtime() {
183         return SystemClock.elapsedRealtime();
184     }
185 
handleNativeRescuePartyResets()186     private static void handleNativeRescuePartyResets() {
187         if (SettingsToPropertiesMapper.isNativeFlagsResetPerformed()) {
188             FlagNamespaceUtils.resetDeviceConfig(Settings.RESET_MODE_TRUSTED_DEFAULTS,
189                     Arrays.asList(SettingsToPropertiesMapper.getResetNativeCategories()));
190         }
191     }
192 
193     /**
194      * Escalate to the next rescue level. After incrementing the level you'll
195      * probably want to call {@link #executeRescueLevel(Context)}.
196      */
incrementRescueLevel(int triggerUid)197     private static void incrementRescueLevel(int triggerUid) {
198         final int level = MathUtils.constrain(
199                 SystemProperties.getInt(PROP_RESCUE_LEVEL, LEVEL_NONE) + 1,
200                 LEVEL_NONE, LEVEL_FACTORY_RESET);
201         SystemProperties.set(PROP_RESCUE_LEVEL, Integer.toString(level));
202 
203         EventLogTags.writeRescueLevel(level, triggerUid);
204         logCriticalInfo(Log.WARN, "Incremented rescue level to "
205                 + levelToString(level) + " triggered by UID " + triggerUid);
206     }
207 
executeRescueLevel(Context context)208     private static void executeRescueLevel(Context context) {
209         final int level = SystemProperties.getInt(PROP_RESCUE_LEVEL, LEVEL_NONE);
210         if (level == LEVEL_NONE) return;
211 
212         Slog.w(TAG, "Attempting rescue level " + levelToString(level));
213         try {
214             executeRescueLevelInternal(context, level);
215             EventLogTags.writeRescueSuccess(level);
216             logCriticalInfo(Log.DEBUG,
217                     "Finished rescue level " + levelToString(level));
218         } catch (Throwable t) {
219             final String msg = ExceptionUtils.getCompleteMessage(t);
220             EventLogTags.writeRescueFailure(level, msg);
221             logCriticalInfo(Log.ERROR,
222                     "Failed rescue level " + levelToString(level) + ": " + msg);
223         }
224     }
225 
executeRescueLevelInternal(Context context, int level)226     private static void executeRescueLevelInternal(Context context, int level) throws Exception {
227         StatsLog.write(StatsLog.RESCUE_PARTY_RESET_REPORTED, level);
228         switch (level) {
229             case LEVEL_RESET_SETTINGS_UNTRUSTED_DEFAULTS:
230                 resetAllSettings(context, Settings.RESET_MODE_UNTRUSTED_DEFAULTS);
231                 break;
232             case LEVEL_RESET_SETTINGS_UNTRUSTED_CHANGES:
233                 resetAllSettings(context, Settings.RESET_MODE_UNTRUSTED_CHANGES);
234                 break;
235             case LEVEL_RESET_SETTINGS_TRUSTED_DEFAULTS:
236                 resetAllSettings(context, Settings.RESET_MODE_TRUSTED_DEFAULTS);
237                 break;
238             case LEVEL_FACTORY_RESET:
239                 RecoverySystem.rebootPromptAndWipeUserData(context, TAG);
240                 break;
241         }
242         FlagNamespaceUtils.addToKnownResetNamespaces(
243                 FlagNamespaceUtils.NAMESPACE_NO_PACKAGE);
244     }
245 
resetAllSettings(Context context, int mode)246     private static void resetAllSettings(Context context, int mode) throws Exception {
247         // Try our best to reset all settings possible, and once finished
248         // rethrow any exception that we encountered
249         Exception res = null;
250         final ContentResolver resolver = context.getContentResolver();
251         try {
252             FlagNamespaceUtils.resetDeviceConfig(mode);
253         } catch (Exception e) {
254             res = new RuntimeException("Failed to reset config settings", e);
255         }
256         try {
257             Settings.Global.resetToDefaultsAsUser(resolver, null, mode, UserHandle.USER_SYSTEM);
258         } catch (Exception e) {
259             res = new RuntimeException("Failed to reset global settings", e);
260         }
261         for (int userId : getAllUserIds()) {
262             try {
263                 Settings.Secure.resetToDefaultsAsUser(resolver, null, mode, userId);
264             } catch (Exception e) {
265                 res = new RuntimeException("Failed to reset secure settings for " + userId, e);
266             }
267         }
268         if (res != null) {
269             throw res;
270         }
271     }
272 
273     /**
274      * Threshold that can be triggered if a number of events occur within a
275      * window of time.
276      */
277     private abstract static class Threshold {
getCount()278         public abstract int getCount();
setCount(int count)279         public abstract void setCount(int count);
getStart()280         public abstract long getStart();
setStart(long start)281         public abstract void setStart(long start);
282 
283         private final int uid;
284         private final int triggerCount;
285         private final long triggerWindow;
286 
Threshold(int uid, int triggerCount, long triggerWindow)287         public Threshold(int uid, int triggerCount, long triggerWindow) {
288             this.uid = uid;
289             this.triggerCount = triggerCount;
290             this.triggerWindow = triggerWindow;
291         }
292 
reset()293         public void reset() {
294             setCount(0);
295             setStart(0);
296         }
297 
298         /**
299          * @return if this threshold has been triggered
300          */
incrementAndTest()301         public boolean incrementAndTest() {
302             final long now = getElapsedRealtime();
303             final long window = now - getStart();
304             if (window > triggerWindow) {
305                 setCount(1);
306                 setStart(now);
307                 return false;
308             } else {
309                 int count = getCount() + 1;
310                 setCount(count);
311                 EventLogTags.writeRescueNote(uid, count, window);
312                 Slog.w(TAG, "Noticed " + count + " events for UID " + uid + " in last "
313                         + (window / 1000) + " sec");
314                 return (count >= triggerCount);
315             }
316         }
317     }
318 
319     /**
320      * Specialization of {@link Threshold} for monitoring boot events. It stores
321      * counters in system properties for robustness.
322      */
323     private static class BootThreshold extends Threshold {
BootThreshold()324         public BootThreshold() {
325             // We're interested in TRIGGER_COUNT events in any
326             // BOOT_TRIGGER_WINDOW_MILLIS second period; this window is super relaxed because
327             // booting can take a long time if forced to dexopt things.
328             super(android.os.Process.ROOT_UID, TRIGGER_COUNT, BOOT_TRIGGER_WINDOW_MILLIS);
329         }
330 
331         @Override
getCount()332         public int getCount() {
333             return SystemProperties.getInt(PROP_RESCUE_BOOT_COUNT, 0);
334         }
335 
336         @Override
setCount(int count)337         public void setCount(int count) {
338             SystemProperties.set(PROP_RESCUE_BOOT_COUNT, Integer.toString(count));
339         }
340 
341         @Override
getStart()342         public long getStart() {
343             return SystemProperties.getLong(PROP_RESCUE_BOOT_START, 0);
344         }
345 
346         @Override
setStart(long start)347         public void setStart(long start) {
348             SystemProperties.set(PROP_RESCUE_BOOT_START, Long.toString(start));
349         }
350     }
351 
352     /**
353      * Specialization of {@link Threshold} for monitoring app crashes. It stores
354      * counters in memory.
355      */
356     private static class AppThreshold extends Threshold {
357         private int count;
358         private long start;
359 
AppThreshold(int uid)360         public AppThreshold(int uid) {
361             // We're interested in TRIGGER_COUNT events in any
362             // PERSISTENT_APP_CRASH_TRIGGER_WINDOW_MILLIS second period; apps crash pretty quickly
363             // so we can keep a tight leash on them.
364             super(uid, TRIGGER_COUNT, PERSISTENT_APP_CRASH_TRIGGER_WINDOW_MILLIS);
365         }
366 
getCount()367         @Override public int getCount() { return count; }
setCount(int count)368         @Override public void setCount(int count) { this.count = count; }
getStart()369         @Override public long getStart() { return start; }
setStart(long start)370         @Override public void setStart(long start) { this.start = start; }
371     }
372 
getAllUserIds()373     private static int[] getAllUserIds() {
374         int[] userIds = { UserHandle.USER_SYSTEM };
375         try {
376             for (File file : FileUtils.listFilesOrEmpty(Environment.getDataSystemDeDirectory())) {
377                 try {
378                     final int userId = Integer.parseInt(file.getName());
379                     if (userId != UserHandle.USER_SYSTEM) {
380                         userIds = ArrayUtils.appendInt(userIds, userId);
381                     }
382                 } catch (NumberFormatException ignored) {
383                 }
384             }
385         } catch (Throwable t) {
386             Slog.w(TAG, "Trouble discovering users", t);
387         }
388         return userIds;
389     }
390 
391     /**
392      * Hacky test to check if the device has an active USB connection, which is
393      * a good proxy for someone doing local development work.
394      */
isUsbActive()395     private static boolean isUsbActive() {
396         if (SystemProperties.getBoolean(PROP_VIRTUAL_DEVICE, false)) {
397             Slog.v(TAG, "Assuming virtual device is connected over USB");
398             return true;
399         }
400         try {
401             final String state = FileUtils
402                     .readTextFile(new File("/sys/class/android_usb/android0/state"), 128, "");
403             return "CONFIGURED".equals(state.trim());
404         } catch (Throwable t) {
405             Slog.w(TAG, "Failed to determine if device was on USB", t);
406             return false;
407         }
408     }
409 
levelToString(int level)410     private static String levelToString(int level) {
411         switch (level) {
412             case LEVEL_NONE: return "NONE";
413             case LEVEL_RESET_SETTINGS_UNTRUSTED_DEFAULTS: return "RESET_SETTINGS_UNTRUSTED_DEFAULTS";
414             case LEVEL_RESET_SETTINGS_UNTRUSTED_CHANGES: return "RESET_SETTINGS_UNTRUSTED_CHANGES";
415             case LEVEL_RESET_SETTINGS_TRUSTED_DEFAULTS: return "RESET_SETTINGS_TRUSTED_DEFAULTS";
416             case LEVEL_FACTORY_RESET: return "FACTORY_RESET";
417             default: return Integer.toString(level);
418         }
419     }
420 }
421