% Conference paper in the IEEE Big Data 2019 proceedings.
% Funding, copyright and event details live in their own fields (not in note)
% so that they are not printed verbatim in the rendered reference list.
@inproceedings{030801f0f44c42bc9fee8f53a254189f,
  author     = {Kang, Qiao and Agrawal, Ankit and Choudhary, Alok and Sim, Alex and Wu, Kesheng and Kettimuthu, Rajkumar and Beckman, Peter H. and Liu, Zhengchun and Liao, Wei Keng},
  title      = {Spatiotemporal Real-Time Anomaly Detection for Supercomputing Systems},
  booktitle  = {Proceedings - 2019 {IEEE} International Conference on Big Data, Big Data 2019},
  editor     = {Baru, Chaitanya and Huan, Jun and Khan, Latifur and Hu, Xiaohua Tony and Ak, Ronay and Tian, Yuanyuan and Barga, Roger and Zaniolo, Carlo and Lee, Kisung and Ye, Yanfang Fanny},
  publisher  = {Institute of Electrical and Electronics Engineers Inc.},
  address    = {United States},
  pages      = {4381--4389},
  year       = {2019},
  month      = dec,
  doi        = {10.1109/BigData47090.2019.9006046},
  keywords   = {Blue Gene/Q, RAS, system anomaly detection},
  language   = {English (US)},
  abstract   = {The demands of increasingly large scientific application workflows lead to the need for more powerful supercomputers. As the scale of supercomputing systems have grown, the prediction of fault tolerance has become an increasingly critical area of study, since the prediction of system failures can improve performance by saving checkpoints in advance. We propose a real-time failure detection algorithm that adopts an event-based prediction model. The prediction model is a convolutional neural network that utilizes both traditional event attributes and additional spatio-temporal features. We present a case study using our proposed method with six years of reliability, availability, and serviceability event logs recorded by Mira, a Blue Gene/Q supercomputer at Argonne National Laboratory. In the case study, we have shown that our failure prediction model is not limited to predict the occurrence of failures in general. It is capable of accurately detecting specific types of critical failures such as coolant and power problems within reasonable lead time ranges. Our case study shows that the proposed method can achieve a F1 score of 0.56 for general failures, 0.97 for coolant failures, and 0.86 for power failures.},
  eventtitle = {2019 {IEEE} International Conference on Big Data, Big Data 2019},
  eventdate  = {2019-12-09/2019-12-12},
  funding    = {This work was supported in part by the Office of Advanced Scientific Computing Research, Office of Science, of the U.S. Department of Energy under Contract No. DE-AC02-06CH11357, DE-AC02-05CH11231, DE-SC0014330 and DE-SC0019358. The RAS event data we used in this paper was generated from resources of the Argonne Leadership Computing Facility, which is a DOE Office of Science User Facility supported under Contract DE-AC02-06CH11357. This research also used resources of the National Energy Research Scientific Computing Center.},
  copyright  = {{\textcopyright} 2019 IEEE.},
}