1 /* 2 * Copyright 2017, The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #pragma once 17 18 #include "config/ConfigKey.h" 19 #include "statslog.h" 20 21 #include <gtest/gtest_prod.h> 22 #include <log/log_time.h> 23 #include <list> 24 #include <mutex> 25 #include <string> 26 #include <vector> 27 #include <unordered_map> 28 29 namespace android { 30 namespace os { 31 namespace statsd { 32 33 struct ConfigStats { 34 int32_t uid; 35 int64_t id; 36 int32_t creation_time_sec; 37 int32_t deletion_time_sec = 0; 38 int32_t reset_time_sec = 0; 39 int32_t metric_count; 40 int32_t condition_count; 41 int32_t matcher_count; 42 int32_t alert_count; 43 bool is_valid; 44 45 std::list<int32_t> broadcast_sent_time_sec; 46 47 // Times at which this config is activated. 48 std::list<int32_t> activation_time_sec; 49 50 // Times at which this config is deactivated. 51 std::list<int32_t> deactivation_time_sec; 52 53 std::list<int32_t> data_drop_time_sec; 54 // Number of bytes dropped at corresponding time. 55 std::list<int64_t> data_drop_bytes; 56 std::list<std::pair<int32_t, int64_t>> dump_report_stats; 57 58 // Stores how many times a matcher have been matched. The map size is capped by kMaxConfigCount. 59 std::map<const int64_t, int> matcher_stats; 60 61 // Stores the number of output tuple of condition trackers when it's bigger than 62 // kDimensionKeySizeSoftLimit. When you see the number is kDimensionKeySizeHardLimit +1, 63 // it means some data has been dropped. The map size is capped by kMaxConfigCount. 64 std::map<const int64_t, int> condition_stats; 65 66 // Stores the number of output tuple of metric producers when it's bigger than 67 // kDimensionKeySizeSoftLimit. When you see the number is kDimensionKeySizeHardLimit +1, 68 // it means some data has been dropped. The map size is capped by kMaxConfigCount. 69 std::map<const int64_t, int> metric_stats; 70 71 // Stores the max number of output tuple of dimensions in condition across dimensions in what 72 // when it's bigger than kDimensionKeySizeSoftLimit. When you see the number is 73 // kDimensionKeySizeHardLimit +1, it means some data has been dropped. The map size is capped by 74 // kMaxConfigCount. 75 std::map<const int64_t, int> metric_dimension_in_condition_stats; 76 77 // Stores the number of times an anomaly detection alert has been declared. 78 // The map size is capped by kMaxConfigCount. 79 std::map<const int64_t, int> alert_stats; 80 81 // Stores the config ID for each sub-config used. 82 std::list<std::pair<const int64_t, const int32_t>> annotations; 83 }; 84 85 struct UidMapStats { 86 int32_t changes; 87 int32_t bytes_used; 88 int32_t dropped_changes; 89 int32_t deleted_apps = 0; 90 }; 91 92 // Keeps track of stats of statsd. 93 // Single instance shared across the process. All public methods are thread safe. 94 class StatsdStats { 95 public: 96 static StatsdStats& getInstance(); ~StatsdStats()97 ~StatsdStats(){}; 98 99 const static int kDimensionKeySizeSoftLimit = 500; 100 const static int kDimensionKeySizeHardLimit = 800; 101 102 // Per atom dimension key size limit 103 static const std::map<int, std::pair<size_t, size_t>> kAtomDimensionKeySizeLimitMap; 104 105 const static int kMaxConfigCountPerUid = 10; 106 const static int kMaxAlertCountPerConfig = 100; 107 const static int kMaxConditionCountPerConfig = 300; 108 const static int kMaxMetricCountPerConfig = 1000; 109 const static int kMaxMatcherCountPerConfig = 800; 110 111 // The max number of old config stats we keep. 112 const static int kMaxIceBoxSize = 20; 113 114 const static int kMaxLoggerErrors = 20; 115 116 const static int kMaxSystemServerRestarts = 20; 117 118 const static int kMaxTimestampCount = 20; 119 120 const static int kMaxLogSourceCount = 50; 121 122 // Max memory allowed for storing metrics per configuration. If this limit is exceeded, statsd 123 // drops the metrics data in memory. 124 static const size_t kMaxMetricsBytesPerConfig = 2 * 1024 * 1024; 125 126 // Soft memory limit per configuration. Once this limit is exceeded, we begin notifying the 127 // data subscriber that it's time to call getData. 128 static const size_t kBytesPerConfigTriggerGetData = 192 * 1024; 129 130 // Cap the UID map's memory usage to this. This should be fairly high since the UID information 131 // is critical for understanding the metrics. 132 const static size_t kMaxBytesUsedUidMap = 50 * 1024; 133 134 // The number of deleted apps that are stored in the uid map. 135 const static int kMaxDeletedAppsInUidMap = 100; 136 137 /* Minimum period between two broadcasts in nanoseconds. */ 138 static const int64_t kMinBroadcastPeriodNs = 60 * NS_PER_SEC; 139 140 /* Min period between two checks of byte size per config key in nanoseconds. */ 141 static const int64_t kMinByteSizeCheckPeriodNs = 60 * NS_PER_SEC; 142 143 /* Minimum period between two activation broadcasts in nanoseconds. */ 144 static const int64_t kMinActivationBroadcastPeriodNs = 10 * NS_PER_SEC; 145 146 // Maximum age (30 days) that files on disk can exist in seconds. 147 static const int kMaxAgeSecond = 60 * 60 * 24 * 30; 148 149 // Maximum age (2 days) that local history files on disk can exist in seconds. 150 static const int kMaxLocalHistoryAgeSecond = 60 * 60 * 24 * 2; 151 152 // Maximum number of files (1000) that can be in stats directory on disk. 153 static const int kMaxFileNumber = 1000; 154 155 // Maximum size of all files that can be written to stats directory on disk. 156 static const int kMaxFileSize = 50 * 1024 * 1024; 157 158 // How long to try to clear puller cache from last time 159 static const long kPullerCacheClearIntervalSec = 1; 160 161 // Max time to do a pull. 162 static const int64_t kPullMaxDelayNs = 10 * NS_PER_SEC; 163 164 // Maximum number of pushed atoms statsd stats will track above kMaxPushedAtomId. 165 static const int kMaxNonPlatformPushedAtoms = 100; 166 167 // Max platform atom tag number. 168 static const int32_t kMaxPlatformAtomTag = 100000; 169 170 // Vendor pulled atom start id. 171 static const int32_t kVendorPulledAtomStartTag = 150000; 172 173 // Beginning of range for timestamp truncation. 174 static const int32_t kTimestampTruncationStartTag = 300000; 175 176 // End of range for timestamp truncation. 177 static const int32_t kTimestampTruncationEndTag = 304999; 178 179 // Max accepted atom id. 180 static const int32_t kMaxAtomTag = 200000; 181 182 static const int64_t kInt64Max = 0x7fffffffffffffffLL; 183 184 /** 185 * Report a new config has been received and report the static stats about the config. 186 * 187 * The static stats include: the count of metrics, conditions, matchers, and alerts. 188 * If the config is not valid, this config stats will be put into icebox immediately. 189 */ 190 void noteConfigReceived(const ConfigKey& key, int metricsCount, int conditionsCount, 191 int matchersCount, int alertCount, 192 const std::list<std::pair<const int64_t, const int32_t>>& annotations, 193 bool isValid); 194 /** 195 * Report a config has been removed. 196 */ 197 void noteConfigRemoved(const ConfigKey& key); 198 /** 199 * Report a config has been reset when ttl expires. 200 */ 201 void noteConfigReset(const ConfigKey& key); 202 203 /** 204 * Report a broadcast has been sent to a config owner to collect the data. 205 */ 206 void noteBroadcastSent(const ConfigKey& key); 207 208 /** 209 * Report that a config has become activated or deactivated. 210 * This can be different from whether or not a broadcast is sent if the 211 * guardrail prevented the broadcast from being sent. 212 */ 213 void noteActiveStatusChanged(const ConfigKey& key, bool activate); 214 215 /** 216 * Report a config's metrics data has been dropped. 217 */ 218 void noteDataDropped(const ConfigKey& key, const size_t totalBytes); 219 220 /** 221 * Report metrics data report has been sent. 222 * 223 * The report may be requested via StatsManager API, or through adb cmd. 224 */ 225 void noteMetricsReportSent(const ConfigKey& key, const size_t num_bytes); 226 227 /** 228 * Report the size of output tuple of a condition. 229 * 230 * Note: only report when the condition has an output dimension, and the tuple 231 * count > kDimensionKeySizeSoftLimit. 232 * 233 * [key]: The config key that this condition belongs to. 234 * [id]: The id of the condition. 235 * [size]: The output tuple size. 236 */ 237 void noteConditionDimensionSize(const ConfigKey& key, const int64_t& id, int size); 238 239 /** 240 * Report the size of output tuple of a metric. 241 * 242 * Note: only report when the metric has an output dimension, and the tuple 243 * count > kDimensionKeySizeSoftLimit. 244 * 245 * [key]: The config key that this metric belongs to. 246 * [id]: The id of the metric. 247 * [size]: The output tuple size. 248 */ 249 void noteMetricDimensionSize(const ConfigKey& key, const int64_t& id, int size); 250 251 /** 252 * Report the max size of output tuple of dimension in condition across dimensions in what. 253 * 254 * Note: only report when the metric has an output dimension in condition, and the max tuple 255 * count > kDimensionKeySizeSoftLimit. 256 * 257 * [key]: The config key that this metric belongs to. 258 * [id]: The id of the metric. 259 * [size]: The output tuple size. 260 */ 261 void noteMetricDimensionInConditionSize(const ConfigKey& key, const int64_t& id, int size); 262 263 /** 264 * Report a matcher has been matched. 265 * 266 * [key]: The config key that this matcher belongs to. 267 * [id]: The id of the matcher. 268 */ 269 void noteMatcherMatched(const ConfigKey& key, const int64_t& id); 270 271 /** 272 * Report that an anomaly detection alert has been declared. 273 * 274 * [key]: The config key that this alert belongs to. 275 * [id]: The id of the alert. 276 */ 277 void noteAnomalyDeclared(const ConfigKey& key, const int64_t& id); 278 279 /** 280 * Report an atom event has been logged. 281 */ 282 void noteAtomLogged(int atomId, int32_t timeSec); 283 284 /** 285 * Report that statsd modified the anomaly alarm registered with StatsCompanionService. 286 */ 287 void noteRegisteredAnomalyAlarmChanged(); 288 289 /** 290 * Report that statsd modified the periodic alarm registered with StatsCompanionService. 291 */ 292 void noteRegisteredPeriodicAlarmChanged(); 293 294 /** 295 * Records the number of delta entries that are being dropped from the uid map. 296 */ 297 void noteUidMapDropped(int deltas); 298 299 /** 300 * Records that an app was deleted (from statsd's map). 301 */ 302 void noteUidMapAppDeletionDropped(); 303 304 /** 305 * Updates the number of changes currently stored in the uid map. 306 */ 307 void setUidMapChanges(int changes); 308 void setCurrentUidMapMemory(int bytes); 309 310 /* 311 * Updates minimum interval between pulls for an pulled atom. 312 */ 313 void updateMinPullIntervalSec(int pullAtomId, long intervalSec); 314 315 /* 316 * Notes an atom is pulled. 317 */ 318 void notePull(int pullAtomId); 319 320 /* 321 * Notes an atom is served from puller cache. 322 */ 323 void notePullFromCache(int pullAtomId); 324 325 /* 326 * Notify data error for pulled atom. 327 */ 328 void notePullDataError(int pullAtomId); 329 330 /* 331 * Records time for actual pulling, not including those served from cache and not including 332 * statsd processing delays. 333 */ 334 void notePullTime(int pullAtomId, int64_t pullTimeNs); 335 336 /* 337 * Records pull delay for a pulled atom, including those served from cache and including statsd 338 * processing delays. 339 */ 340 void notePullDelay(int pullAtomId, int64_t pullDelayNs); 341 342 /* 343 * Records pull exceeds timeout for the puller. 344 */ 345 void notePullTimeout(int pullAtomId); 346 347 /* 348 * Records pull exceeds max delay for a metric. 349 */ 350 void notePullExceedMaxDelay(int pullAtomId); 351 352 /* 353 * Records when system server restarts. 354 */ 355 void noteSystemServerRestart(int32_t timeSec); 356 357 /** 358 * Records statsd skipped an event. 359 */ 360 void noteLogLost(int32_t wallClockTimeSec, int32_t count, int32_t lastError, 361 int32_t lastAtomTag, int32_t uid, int32_t pid); 362 363 /** 364 * Records that the pull of an atom has failed 365 */ 366 void notePullFailed(int atomId); 367 368 /** 369 * Records that the pull of StatsCompanionService atom has failed 370 */ 371 void noteStatsCompanionPullFailed(int atomId); 372 373 /** 374 * Records that the pull of a StatsCompanionService atom has failed due to a failed binder 375 * transaction. This can happen when StatsCompanionService returns too 376 * much data (the max Binder parcel size is 1MB) 377 */ 378 void noteStatsCompanionPullBinderTransactionFailed(int atomId); 379 380 /** 381 * A pull with no data occurred 382 */ 383 void noteEmptyData(int atomId); 384 385 /** 386 * Records that a puller callback for the given atomId was registered or unregistered. 387 * 388 * @param registered True if the callback was registered, false if was unregistered. 389 */ 390 void notePullerCallbackRegistrationChanged(int atomId, bool registered); 391 392 /** 393 * Hard limit was reached in the cardinality of an atom 394 */ 395 void noteHardDimensionLimitReached(int64_t metricId); 396 397 /** 398 * A log event was too late, arrived in the wrong bucket and was skipped 399 */ 400 void noteLateLogEventSkipped(int64_t metricId); 401 402 /** 403 * Buckets were skipped as time elapsed without any data for them 404 */ 405 void noteSkippedForwardBuckets(int64_t metricId); 406 407 /** 408 * An unsupported value type was received 409 */ 410 void noteBadValueType(int64_t metricId); 411 412 /** 413 * Buckets were dropped due to reclaim memory. 414 */ 415 void noteBucketDropped(int64_t metricId); 416 417 /** 418 * A condition change was too late, arrived in the wrong bucket and was skipped 419 */ 420 void noteConditionChangeInNextBucket(int64_t metricId); 421 422 /** 423 * A bucket has been tagged as invalid. 424 */ 425 void noteInvalidatedBucket(int64_t metricId); 426 427 /** 428 * Tracks the total number of buckets (include skipped/invalid buckets). 429 */ 430 void noteBucketCount(int64_t metricId); 431 432 /** 433 * For pulls at bucket boundaries, it represents the misalignment between the real timestamp and 434 * the end of the bucket. 435 */ 436 void noteBucketBoundaryDelayNs(int64_t metricId, int64_t timeDelayNs); 437 438 /** 439 * Number of buckets with unknown condition. 440 */ 441 void noteBucketUnknownCondition(int64_t metricId); 442 443 /* Reports one event has been dropped due to queue overflow, and the oldest event timestamp in 444 * the queue */ 445 void noteEventQueueOverflow(int64_t oldestEventTimestampNs); 446 447 /** 448 * Reports that the activation broadcast guardrail was hit for this uid. Namely, the broadcast 449 * should have been sent, but instead was skipped due to hitting the guardrail. 450 */ 451 void noteActivationBroadcastGuardrailHit(const int uid); 452 453 /** 454 * Reset the historical stats. Including all stats in icebox, and the tracked stats about 455 * metrics, matchers, and atoms. The active configs will be kept and StatsdStats will continue 456 * to collect stats after reset() has been called. 457 */ 458 void reset(); 459 460 /** 461 * Output the stats in protobuf binary format to [buffer]. 462 * 463 * [reset]: whether to clear the historical stats after the call. 464 */ 465 void dumpStats(std::vector<uint8_t>* buffer, bool reset); 466 467 /** 468 * Output statsd stats in human readable format to [out] file descriptor. 469 */ 470 void dumpStats(int outFd) const; 471 472 typedef struct { 473 long totalPull = 0; 474 long totalPullFromCache = 0; 475 long minPullIntervalSec = LONG_MAX; 476 int64_t avgPullTimeNs = 0; 477 int64_t maxPullTimeNs = 0; 478 long numPullTime = 0; 479 int64_t avgPullDelayNs = 0; 480 int64_t maxPullDelayNs = 0; 481 long numPullDelay = 0; 482 long dataError = 0; 483 long pullTimeout = 0; 484 long pullExceedMaxDelay = 0; 485 long pullFailed = 0; 486 long statsCompanionPullFailed = 0; 487 long statsCompanionPullBinderTransactionFailed = 0; 488 long emptyData = 0; 489 long registeredCount = 0; 490 long unregisteredCount = 0; 491 } PulledAtomStats; 492 493 typedef struct { 494 long hardDimensionLimitReached = 0; 495 long lateLogEventSkipped = 0; 496 long skippedForwardBuckets = 0; 497 long badValueType = 0; 498 long conditionChangeInNextBucket = 0; 499 long invalidatedBucket = 0; 500 long bucketDropped = 0; 501 int64_t minBucketBoundaryDelayNs = 0; 502 int64_t maxBucketBoundaryDelayNs = 0; 503 long bucketUnknownCondition = 0; 504 long bucketCount = 0; 505 } AtomMetricStats; 506 507 private: 508 StatsdStats(); 509 510 mutable std::mutex mLock; 511 512 int32_t mStartTimeSec; 513 514 // Track the number of dropped entries used by the uid map. 515 UidMapStats mUidMapStats; 516 517 // The stats about the configs that are still in use. 518 // The map size is capped by kMaxConfigCount. 519 std::map<const ConfigKey, std::shared_ptr<ConfigStats>> mConfigStats; 520 521 // Stores the stats for the configs that are no longer in use. 522 // The size of the vector is capped by kMaxIceBoxSize. 523 std::list<const std::shared_ptr<ConfigStats>> mIceBox; 524 525 // Stores the number of times a pushed atom is logged. 526 // The size of the vector is the largest pushed atom id in atoms.proto + 1. Atoms 527 // out of that range will be put in mNonPlatformPushedAtomStats. 528 // This is a vector, not a map because it will be accessed A LOT -- for each stats log. 529 std::vector<int> mPushedAtomStats; 530 531 // Stores the number of times a pushed atom is logged for atom ids above kMaxPushedAtomId. 532 // The max size of the map is kMaxNonPlatformPushedAtoms. 533 std::unordered_map<int, int> mNonPlatformPushedAtomStats; 534 535 // Maps PullAtomId to its stats. The size is capped by the puller atom counts. 536 std::map<int, PulledAtomStats> mPulledAtomStats; 537 538 // Maps metric ID to its stats. The size is capped by the number of metrics. 539 std::map<int64_t, AtomMetricStats> mAtomMetricStats; 540 541 // Maps uids to times when the activation changed broadcast not sent due to hitting the 542 // guardrail. The size is capped by the number of configs, and up to 20 times per uid. 543 std::map<int, std::list<int32_t>> mActivationBroadcastGuardrailStats; 544 545 struct LogLossStats { LogLossStatsLogLossStats546 LogLossStats(int32_t sec, int32_t count, int32_t error, int32_t tag, int32_t uid, 547 int32_t pid) 548 : mWallClockSec(sec), 549 mCount(count), 550 mLastError(error), 551 mLastTag(tag), 552 mUid(uid), 553 mPid(pid) { 554 } 555 int32_t mWallClockSec; 556 int32_t mCount; 557 // error code defined in linux/errno.h 558 int32_t mLastError; 559 int32_t mLastTag; 560 int32_t mUid; 561 int32_t mPid; 562 }; 563 564 // Max of {(now - oldestEventTimestamp) when overflow happens}. 565 // This number is helpful to understand how SLOW statsd can be. 566 int64_t mMaxQueueHistoryNs = 0; 567 568 // Min of {(now - oldestEventTimestamp) when overflow happens}. 569 // This number is helpful to understand how FAST the events floods to statsd. 570 int64_t mMinQueueHistoryNs = kInt64Max; 571 572 // Total number of events that are lost due to queue overflow. 573 int32_t mOverflowCount = 0; 574 575 // Timestamps when we detect log loss, and the number of logs lost. 576 std::list<LogLossStats> mLogLossStats; 577 578 std::list<int32_t> mSystemServerRestartSec; 579 580 // Stores the number of times statsd modified the anomaly alarm registered with 581 // StatsCompanionService. 582 int mAnomalyAlarmRegisteredStats = 0; 583 584 // Stores the number of times statsd registers the periodic alarm changes 585 int mPeriodicAlarmRegisteredStats = 0; 586 587 void noteConfigResetInternalLocked(const ConfigKey& key); 588 589 void noteConfigRemovedInternalLocked(const ConfigKey& key); 590 591 void resetInternalLocked(); 592 593 void noteDataDropped(const ConfigKey& key, const size_t totalBytes, int32_t timeSec); 594 595 void noteMetricsReportSent(const ConfigKey& key, const size_t num_bytes, int32_t timeSec); 596 597 void noteBroadcastSent(const ConfigKey& key, int32_t timeSec); 598 599 void noteActiveStatusChanged(const ConfigKey& key, bool activate, int32_t timeSec); 600 601 void noteActivationBroadcastGuardrailHit(const int uid, int32_t timeSec); 602 603 void addToIceBoxLocked(std::shared_ptr<ConfigStats>& stats); 604 605 /** 606 * Get a reference to AtomMetricStats for a metric. If none exists, create it. The reference 607 * will live as long as `this`. 608 */ 609 StatsdStats::AtomMetricStats& getAtomMetricStats(int64_t metricId); 610 611 FRIEND_TEST(StatsdStatsTest, TestValidConfigAdd); 612 FRIEND_TEST(StatsdStatsTest, TestInvalidConfigAdd); 613 FRIEND_TEST(StatsdStatsTest, TestConfigRemove); 614 FRIEND_TEST(StatsdStatsTest, TestSubStats); 615 FRIEND_TEST(StatsdStatsTest, TestAtomLog); 616 FRIEND_TEST(StatsdStatsTest, TestNonPlatformAtomLog); 617 FRIEND_TEST(StatsdStatsTest, TestTimestampThreshold); 618 FRIEND_TEST(StatsdStatsTest, TestAnomalyMonitor); 619 FRIEND_TEST(StatsdStatsTest, TestSystemServerCrash); 620 FRIEND_TEST(StatsdStatsTest, TestPullAtomStats); 621 FRIEND_TEST(StatsdStatsTest, TestAtomMetricsStats); 622 FRIEND_TEST(StatsdStatsTest, TestActivationBroadcastGuardrailHit); 623 }; 624 625 } // namespace statsd 626 } // namespace os 627 } // namespace android 628