33title: Publications
44---
55< section class ="section-margin ">
6- < div class ="container ">
6+ < div class ="container ">
77 < h2 id ="publications "> 2025</ h2 >
88 < ul class ="publications ">
99 < li >
10- < a target ="_blank " href ="paper/traincheck-osdi25-preprint.pdf "> Training with Confidence: Catching Silent Errors in Deep Learning Training with Automated Proactive Checks</ a > < br >
10+ < a target ="_blank " href ="# "> Verifying Distributed Deep Learning Training via Parallelization Equivalence</ a > < br >
11+ < span class ="authorlist "> < i > < a href ="https://mercury-browser-ede.notion.site/yunchi " class ="nodec "> Yunchi Lu</ a > , </ i > < i > < a href ="https://naizhengtan.github.io " class ="nodec "> Cheng Tan</ a > , </ i > < i > < a href ="https://www.microsoft.com/en-us/research/people/yomia " class ="nodec "> Youshan Miao</ a > , </ i > < i > < a href ="https://web.eecs.umich.edu/~ryanph " class ="nodec "> Peng Huang</ a > , </ i > < i > < a href ="https://www.microsoft.com/en-us/research/people/yizhu1 " class ="nodec "> Yi Zhu</ a > , </ i > < i > < a href ="https://www.microsoft.com/en-us/research/people/zhxian " class ="nodec "> Xian Zhang</ a > , </ i > < i > < a href ="https://www.microsoft.com/en-us/research/people/fanyang " class ="nodec "> Fan Yang</ a > < br > </ i > </ span >
12+ < a target ="_blank " href ="https://sigops.org/s/conferences/sosp/2025/ " class ="conf "> < b > SOSP 2025</ b > </ a >
13+ </ li >
14+ < li >
15+ < a target ="_blank " href ="# "> Optimistic Recovery for High-Availability Software via Partial Process State Preservation</ a > < br >
16+ < span class ="authorlist "> < i > < a href ="https://osdi.dev " class ="nodec "> Yuzhuo Jing</ a > , </ i > < i > Yuqi Mai, </ i > < i > Angting Cai, </ i > < i > < a href ="https://chenyi.world " class ="nodec "> Yi Chen</ a > , </ i > < i > < a href ="https://hwanning.netlify.app " class ="nodec "> Wanning He</ a > , </ i > < i > Xiaoyang Qian, </ i > < i > < a href ="https://web.eecs.umich.edu/~pmchen " class ="nodec "> Peter M. Chen</ a > , </ i > < i > < a href ="https://web.eecs.umich.edu/~ryanph " class ="nodec "> Peng Huang</ a > < br > </ i > </ span >
17+ < a target ="_blank " href ="https://sigops.org/s/conferences/sosp/2025/ " class ="conf "> < b > SOSP 2025</ b > </ a >
18+ </ li >
19+ < li >
20+ < a target ="_blank " href ="# "> Mitigating Application Resource Overload with Targeted Task Cancellation</ a > < br >
21+ < span class ="authorlist "> < i > < a href ="https://yigonghu.github.io " class ="nodec "> Yigong Hu</ a > , </ i > < i > Zeyin Zhang, </ i > < i > Yicheng Liu, </ i > < i > Yile Gu, </ i > < i > Shuangyu Lei, </ i > < i > < a href ="https://homes.cs.washington.edu/~baris " class ="nodec "> Baris Kasikci</ a > , </ i > < i > < a href ="https://web.eecs.umich.edu/~ryanph " class ="nodec "> Peng Huang</ a > < br > </ i > </ span >
22+ < a target ="_blank " href ="https://sigops.org/s/conferences/sosp/2025/ " class ="conf "> < b > SOSP 2025</ b > </ a >
23+ </ li >
24+ < li >
25+ < a target ="_blank " href ="paper/traincheck-osdi25-preprint.pdf "> Training with Confidence: Catching Silent Errors in Deep Learning Training with Automated Proactive Checks</ a > < br >
1126 < span class ="authorlist "> < i > < a href ="https://essoz.github.io " class ="nodec "> Yuxuan Jiang</ a > , </ i > < i > Ziming Zhou, </ i > < i > Boyu Xu, </ i > < i > Beijie Liu, </ i > < i > Runhui Xu, </ i > < i > < a href ="https://web.eecs.umich.edu/~ryanph " class ="nodec "> Peng Huang</ a > < br > </ i > </ span >
12- < a target ="_blank " href ="https://www.usenix.org/conference/osdi25 " class ="conf "> < b > OSDI 2025</ b > </ a > < a target ="_blank " class ="btn btn-outline-primary publinkitem " href ="paper/traincheck-osdi25.bib "> BibTeX</ a >
13- < a target ="_blank " class ="btn btn-outline-primary publinkitem " href ="https://github.com/OrderLab/TrainCheck "> Software</ a >
27+ < a target ="_blank " href ="https://www.usenix.org/conference/osdi25 " class ="conf "> < b > OSDI 2025</ b > </ a > < a target ="_blank " class ="btn btn-outline-primary publinkitem " href ="paper/traincheck-osdi25.bib "> BibTeX</ a > < a target ="_blank " class ="btn btn-outline-primary publinkitem " href ="https://github.com/OrderLab/TrainCheck "> Software</ a > < a target ="_blank " role ="button " class ="btn btn-outline-primary publinkitem " href ="https://www.arxiv.org/abs/2506.14813 "> [ArXiv]</ a >
1428 </ li >
1529 < li >
16- < a target ="_blank " href ="# "> Deriving Semantic Checkers from Tests to Detect Silent Failures in Production Distributed Systems</ a > < br >
30+ < a target ="_blank " href ="paper/t2c-osdi25-preprint.pdf "> Deriving Semantic Checkers from Tests to Detect Silent Failures in Production Distributed Systems</ a > < br >
1731 < span class ="authorlist "> < i > < a href ="https://www.cs.jhu.edu/~chlou/about " class ="nodec "> Chang Lou</ a > , </ i > < i > Dimas Shidqi Parikesit, </ i > < i > Yujin Huang, </ i > < i > Zhewen Yang, </ i > < i > Senapati Diwangkara, </ i > < i > < a href ="https://osdi.dev " class ="nodec "> Yuzhuo Jing</ a > , </ i > < i > Achmad Imam Kistijantoro, </ i > < i > < a href ="http://www.eecg.toronto.edu/~yuan " class ="nodec "> Ding Yuan</ a > , </ i > < i > < a href ="https://www.microsoft.com/en-us/research/people/sumann " class ="nodec "> Suman Nath</ a > , </ i > < i > < a href ="https://web.eecs.umich.edu/~ryanph " class ="nodec "> Peng Huang</ a > < br > </ i > </ span >
18- < a target ="_blank " href ="https://www.usenix.org/conference/osdi25 " class ="conf "> < b > OSDI 2025</ b > </ a > < a target ="_blank " class ="btn btn-outline-primary publinkitem " href ="https://github.com/OrderLab/T2C "> Software</ a >
32+ < a target ="_blank " href ="https://www.usenix.org/conference/osdi25 " class ="conf "> < b > OSDI 2025</ b > </ a > < a target ="_blank " class ="btn btn-outline-primary publinkitem " href ="paper/t2c-osdi25.bib "> BibTeX</ a > < a target ="_blank " class ="btn btn-outline-primary publinkitem " href ="https://github.com/OrderLab/T2C "> Software</ a >
1933 </ li >
2034 < li >
2135 < a target ="_blank " href ="paper/xinda-nsdi25-preprint.pdf "> One-Size-Fits-None: Understanding and Enhancing Slow-Fault Tolerance in Modern Distributed Systems</ a > < br >
@@ -50,13 +64,13 @@ <h2 id="publications">2023</h2>
5064 </ li >
5165 < li >
5266 < a target ="_blank " href ="paper/pbox-sosp23.pdf "> Pushing Performance Isolation Boundaries into Application with pBox</ a > < br >
53- < span class ="authorlist "> < i > < a href ="https://www.cs.jhu.edu/~yigonghu " class ="nodec "> Yigong Hu</ a > , </ i > < i > < a href ="https://gongqihuang.com " class ="nodec "> Gongqi Huang</ a > , </ i > < i > < a href ="https://web.eecs.umich.edu/~ryanph " class ="nodec "> Peng Huang</ a > < br > </ i > </ span >
67+ < span class ="authorlist "> < i > < a href ="https://yigonghu.github.io " class ="nodec "> Yigong Hu</ a > , </ i > < i > < a href ="https://gongqihuang.com " class ="nodec "> Gongqi Huang</ a > , </ i > < i > < a href ="https://web.eecs.umich.edu/~ryanph " class ="nodec "> Peng Huang</ a > < br > </ i > </ span >
5468 < a target ="_blank " href ="https://sosp2023.mpi-sws.org " class ="conf "> < b > SOSP 2023</ b > </ a > < a target ="_blank " class ="btn btn-outline-primary publinkitem " href ="paper/pbox-sosp23.bib "> BibTeX</ a >
5569 < a target ="_blank " role ="button " class ="btn btn-outline-primary publinkitem " href ="slides/pbox_sosp23_slides.pdf "> Slides</ a > < a target ="_blank " class ="btn btn-outline-primary publinkitem " href ="https://github.com/OrderLab/pBox "> Software</ a >
5670 </ li >
5771 < li >
5872 < a target ="_blank " href ="paper/vprof-eurosys23.pdf "> Effective Performance Issue Diagnosis with Value-Assisted Cost Profiling</ a > < br >
59- < span class ="authorlist "> < i > Lingmei Weng, </ i > < i > < a href ="https://www.cs.jhu.edu/~yigonghu " class ="nodec "> Yigong Hu</ a > , </ i > < i > < a href ="https://web.eecs.umich.edu/~ryanph " class ="nodec "> Peng Huang</ a > , </ i > < i > < a href ="http://www.cs.columbia.edu/~nieh " class ="nodec "> Jason Nieh</ a > , </ i > < i > < a href ="http://www.cs.columbia.edu/~junfeng " class ="nodec "> Junfeng Yang</ a > < br > </ i > </ span >
73+ < span class ="authorlist "> < i > Lingmei Weng, </ i > < i > < a href ="https://yigonghu.github.io " class ="nodec "> Yigong Hu</ a > , </ i > < i > < a href ="https://web.eecs.umich.edu/~ryanph " class ="nodec "> Peng Huang</ a > , </ i > < i > < a href ="http://www.cs.columbia.edu/~nieh " class ="nodec "> Jason Nieh</ a > , </ i > < i > < a href ="http://www.cs.columbia.edu/~junfeng " class ="nodec "> Junfeng Yang</ a > < br > </ i > </ span >
6074 < a target ="_blank " href ="https://2023.eurosys.org " class ="conf "> < b > EuroSys 2023</ b > </ a > < a target ="_blank " class ="btn btn-outline-primary publinkitem " href ="paper/vprof-eurosys23.bib "> BibTeX</ a >
6175 < a target ="_blank " role ="button " class ="btn btn-outline-primary publinkitem " href ="slides/vprof_eurosys23_slides.pdf "> Slides</ a > < a target ="_blank " class ="btn btn-outline-primary publinkitem " href ="https://github.com/wenglingmei/vprofAE "> Software</ a >
6276 </ li >
@@ -104,7 +118,7 @@ <h2 id="publications">2020</h2>
104118 < ul class ="publications ">
105119 < li >
106120 < a target ="_blank " href ="paper/violet-osdi20.pdf "> Automated Reasoning and Detection of Specious Configuration in Large Systems with Symbolic Execution</ a > < br >
107- < span class ="authorlist "> < i > < a href ="https://www.cs.jhu.edu/~yigonghu " class ="nodec "> Yigong Hu</ a > , </ i > < i > < a href ="https://gongqihuang.com " class ="nodec "> Gongqi Huang</ a > , </ i > < i > < a href ="https://web.eecs.umich.edu/~ryanph " class ="nodec "> Peng Huang</ a > < br > </ i > </ span >
121+ < span class ="authorlist "> < i > < a href ="https://yigonghu.github.io " class ="nodec "> Yigong Hu</ a > , </ i > < i > < a href ="https://gongqihuang.com " class ="nodec "> Gongqi Huang</ a > , </ i > < i > < a href ="https://web.eecs.umich.edu/~ryanph " class ="nodec "> Peng Huang</ a > < br > </ i > </ span >
108122 < a target ="_blank " href ="https://www.usenix.org/conference/osdi20 " class ="conf "> < b > OSDI 2020</ b > </ a > < a target ="_blank " class ="btn btn-outline-primary publinkitem " href ="paper/violet-osdi20.bib "> BibTeX</ a >
109123 < a target ="_blank " role ="button " class ="btn btn-outline-primary publinkitem " href ="slides/violet_osdi20_slides.pdf "> Slides</ a > < a target ="_blank " class ="btn btn-outline-primary publinkitem " href ="https://github.com/OrderLab/violet "> Software</ a > < a target ="_blank " role ="button " class ="btn btn-outline-primary publinkitem " href ="paper/violet-tech-report.pdf "> TechReport</ a >
110124 </ li >
@@ -126,7 +140,7 @@ <h2 id="publications">2020</h2>
126140 </ li >
127141 < li >
128142 < a target ="_blank " href ="paper/sdig-aaai20-workshop.pdf "> Scaling Performance Issue Detection and Diagnosis in Cloud Infrastructures</ a > < br >
129- < span class ="authorlist "> < i > < a href ="https://www.cs.jhu.edu/~yigonghu " class ="nodec "> Yigong Hu</ a > , </ i > < i > Ze Li, </ i > < i > < a href ="https://web.eecs.umich.edu/~ryanph " class ="nodec "> Peng Huang</ a > , </ i > < i > Suhas Pinnamaneni, </ i > < i > Francis David, </ i > < i > Yingnong Dang, </ i > < i > Murali Chintalapati< br > </ i > </ span >
143+ < span class ="authorlist "> < i > < a href ="https://yigonghu.github.io " class ="nodec "> Yigong Hu</ a > , </ i > < i > Ze Li, </ i > < i > < a href ="https://web.eecs.umich.edu/~ryanph " class ="nodec "> Peng Huang</ a > , </ i > < i > Suhas Pinnamaneni, </ i > < i > Francis David, </ i > < i > Yingnong Dang, </ i > < i > Murali Chintalapati< br > </ i > </ span >
130144 < a target ="_blank " href ="https://cloudintelligenceworkshop.org " class ="conf "> < b > AAAI-20 Workshop on Cloud Intelligence</ b > </ a > < a target ="_blank " class ="btn btn-outline-primary publinkitem " href ="paper/sdig-aaai20.bib "> BibTeX</ a >
131145 </ li >
132146
@@ -146,7 +160,7 @@ <h2 id="publications">2019</h2>
146160 </ li >
147161 < li >
148162 < a target ="_blank " href ="paper/leaseos-asplos19.pdf "> A Case for Lease-Based, Utilitarian Resource Management on Mobile Devices</ a > < b style ="color:green "> [Best Paper Award]</ b > < br >
149- < span class ="authorlist "> < i > < a href ="https://www.cs.jhu.edu/~yigonghu " class ="nodec "> Yigong Hu</ a > , </ i > < i > < a href ="https://sylll.github.io " class ="nodec "> Suyi Liu</ a > , </ i > < i > < a href ="https://web.eecs.umich.edu/~ryanph " class ="nodec "> Peng Huang</ a > < br > </ i > </ span >
163+ < span class ="authorlist "> < i > < a href ="https://yigonghu.github.io " class ="nodec "> Yigong Hu</ a > , </ i > < i > < a href ="https://sylll.github.io " class ="nodec "> Suyi Liu</ a > , </ i > < i > < a href ="https://web.eecs.umich.edu/~ryanph " class ="nodec "> Peng Huang</ a > < br > </ i > </ span >
150164 < a target ="_blank " href ="https://asplos-conference.org " class ="conf "> < b > ASPLOS 2019</ b > </ a > < a target ="_blank " class ="btn btn-outline-primary publinkitem " href ="paper/leaseos.bib "> BibTeX</ a >
151165 < a target ="_blank " role ="button " class ="btn btn-outline-primary publinkitem " href ="slides/leaseos_asplos19_slides.pptx "> Slides</ a > < a target ="_blank " class ="btn btn-outline-primary publinkitem " href ="https://orderlab.io/LeaseOS "> Software</ a > < br > < div class ="press "> < b > Coverage:</ b > < a target ="_blank " href ="https://blog.acolyer.org/2019/05/31/lease-os "> The Morning Paper</ a > </ div >
152166 </ li >
@@ -280,5 +294,5 @@ <h2 id="publications">2010</h2>
280294 </ li >
281295
282296 </ ul >
283- </ div >
297+ </ div >
284298</ section >
0 commit comments