% papers.bib — publication database.
% (Removed scraped GitHub web-UI chrome and line-number gutter artifacts that
% were captured along with the file content; BibTeX ignores text outside
% entries, but the residue was pure export junk.)
@string{aps = {American Physical Society}}
@inproceedings{mehta2020rex,
  title     = {{Rex}: Preventing Bugs and Misconfiguration in Large Services Using Correlated Change Analysis},
  author    = {Mehta, Sonu and Bhagwan, Ranjita and Kumar, Rahul and Bansal, Chetan and Maddila, Chandra and Ashok, Balasubramanyan and Asthana, Sumit and Bird, Christian and Kumar, Aditya},
  booktitle = {17th {USENIX} Symposium on Networked Systems Design and Implementation ({NSDI} 20)},
  pages     = {435--448},
  year      = {2020}
}
@inproceedings{10.1145/3338906.3340449,
  author    = {Asthana, Sumit and Kumar, Rahul and Bhagwan, Ranjita and Bird, Christian and Bansal, Chetan and Maddila, Chandra and Mehta, Sonu and Ashok, B.},
  title     = {{WhoDo}: Automating Reviewer Suggestions at Scale},
  year      = {2019},
  isbn      = {9781450355728},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  url       = {https://doi.org/10.1145/3338906.3340449},
  doi       = {10.1145/3338906.3340449},
  abstract  = {Today's software development is distributed and involves continuous changes for new features and yet, their development cycle has to be fast and agile. An important component of enabling this agility is selecting the right reviewers for every code-change - the smallest unit of the development cycle. Modern tool-based code review is proven to be an effective way to achieve appropriate code review of software changes. However, the selection of reviewers in these code review systems is at best manual. As software and teams scale, this poses the challenge of selecting the right reviewers, which in turn determines software quality over time. While previous work has suggested automatic approaches to code reviewer recommendations, it has been limited to retrospective analysis. We not only deploy a reviewer suggestions algorithm - WhoDo - and evaluate its effect but also incorporate load balancing as part of it to address one of its major shortcomings: of recommending experienced developers very frequently. We evaluate the effect of this hybrid recommendation + load balancing system on five repositories within Microsoft. Our results are based around various aspects of a commit and how code review affects that. We attempt to quantitatively answer questions which are supposed to play a vital role in effective code review through our data and substantiate it through qualitative feedback of partner repositories.},
  booktitle = {Proceedings of the 2019 27th ACM Joint Meeting on European Software Engineering Conference and Symposium on the Foundations of Software Engineering},
  pages     = {937--945},
  numpages  = {9},
  keywords  = {software-engineering, recommendation, code-review},
  location  = {Tallinn, Estonia},
  series    = {ESEC/FSE 2019}
}
@inproceedings{10.1145/3581641.3584033,
  author    = {Prabhudesai, Snehal and Yang, Leyao and Asthana, Sumit and Huan, Xun and Liao, Q. Vera and Banovic, Nikola},
  title     = {Understanding Uncertainty: How Lay Decision-makers Perceive and Interpret Uncertainty in Human-{AI} Decision Making},
  year      = {2023},
  isbn      = {9798400701061},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  url       = {https://doi.org/10.1145/3581641.3584033},
  doi       = {10.1145/3581641.3584033},
  abstract  = {Decision Support Systems (DSS) based on Machine Learning (ML) often aim to assist lay decision-makers, who are not math-savvy, in making high-stakes decisions. However, existing ML-based DSS are not always transparent about the probabilistic nature of ML predictions and how uncertain each prediction is. This lack of transparency could give lay decision-makers a false sense of reliability. Growing calls for AI transparency have led to increasing efforts to quantify and communicate model uncertainty. However, there are still gaps in knowledge regarding how and why the decision-makers utilize ML uncertainty information in their decision process. Here, we conducted a qualitative, think-aloud user study with 17 lay decision-makers who interacted with three different DSS: 1) interactive visualization, 2) DSS based on an ML model that provides predictions without uncertainty information, and 3) the same DSS with uncertainty information. Our qualitative analysis found that communicating uncertainty about ML predictions forced participants to slow down and think analytically about their decisions. This in turn made participants more vigilant, resulting in reduction in over-reliance on ML-based DSS. Our work contributes empirical knowledge on how lay decision-makers perceive, interpret, and make use of uncertainty information when interacting with DSS. Such foundational knowledge informs the design of future ML-based DSS that embrace transparent uncertainty communication.},
  booktitle = {Proceedings of the 28th International Conference on Intelligent User Interfaces},
  pages     = {379--396},
  numpages  = {18},
  keywords  = {Decision-making, Machine Learning, Uncertainty},
  location  = {Sydney, NSW, Australia},
  series    = {IUI '23}
}
@article{10.1145/3274290,
  author     = {Asthana, Sumit and Halfaker, Aaron},
  title      = {With Few Eyes, All Hoaxes are Deep},
  journal    = {Proc. ACM Hum.-Comput. Interact.},
  volume     = {2},
  number     = {CSCW},
  articleno  = {21},
  numpages   = {18},
  month      = nov,
  year       = {2018},
  issue_date = {November 2018},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  url        = {https://doi.org/10.1145/3274290},
  doi        = {10.1145/3274290},
  abstract   = {Quality control is critical to open production communities like Wikipedia. Wikipedia editors enact border quality control with edits (counter-vandalism) and new article creations (new page patrolling) shortly after they are saved. In this paper, we describe a long-standing set of inefficiencies that have plagued new page patrolling by drawing a contrast to the more efficient, distributed processes for counter-vandalism. Further, to address this issue, we demonstrate an effective automated topic model based on a labeling strategy that leverages a folksonomy developed by subject specific working groups in Wikipedia (WikiProject tags) and a flexible ontology (WikiProjects Directory) to arrive at a hierarchical and uniform label set. We are able to attain very high fitness measures (macro ROC-AUC: 95.2\%, macro PR-AUC: 74.5\%) and real-time performance using word2vec-based features. Finally, we present a proposal for how incorporating this model into current tools will shift the dynamics of new article review positively.},
  keywords   = {collaborative review, social recommendation, topic modeling, wikipedia}
}
@article{asthana2023summaries,
  title      = {Summaries, Highlights, and Action items: Design, implementation and evaluation of an {LLM}-powered meeting recap system},
  author     = {Asthana, Sumit and Hilleli, Sagih and He, Pengcheng and Halfaker, Aaron},
  journal    = {Proc. ACM Hum.-Comput. Interact.},
  volume     = {9},
  number     = {CSCW2},
  articleno  = {176},
  month      = apr,
  year       = {2025},
  issue_date = {April 2025},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  url        = {https://arxiv.org/abs/2307.15793},
  doi        = {10.48550/arXiv.2307.15793},
  abstract   = {Meetings play a critical infrastructural role in coordinating work. The recent surge of hybrid and remote meetings in computer-mediated spaces has led to new problems (e.g., more time spent in less engaging meetings) and new opportunities (e.g., automated transcription/captioning and recap support). Advances in dialogue summarization offer the potential for improving post-meeting experiences, but fixed-length summaries often fail to meet diverse needs, such as quick overviews or detailed insights. To address these gaps, we use cognitive science and discourse theories to conceptualize two recap designs: important highlights and a structured, hierarchical minutes view, targeting complementary recap needs. We operationalize these representations into high-fidelity prototypes using dialogue summarization. Finally, we evaluate the representations' effectiveness with seven users in the context of their work meetings at Microsoft. Our results show both recap types are valuable in different contexts, enabling collaboration through discussions and consensus-building. Exploring the meaning of users adding, editing, and deleting from recaps suggests varying alignment for using these actions to improve AI-recap. Our design implications, such as incorporating organizational artifacts (e.g., linking presentations) in recaps and personalizing context, advance the discourse of effective recap designs for organizational work and support past results from cognition studies.},
  selected   = {true}
}
@inproceedings{10.1145/3613904.3642180,
  author    = {Asthana, Sumit and Im, Jane and Chen, Zhe and Banovic, Nikola},
  title     = {{``I know even if you don't tell me''}: Understanding Users' Privacy Preferences Regarding {AI}-based Inferences of Sensitive Information for Personalization},
  year      = {2024},
  isbn      = {9798400703300},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  url       = {https://doi.org/10.1145/3613904.3642180},
  doi       = {10.1145/3613904.3642180},
  abstract  = {Personalization improves user experience by tailoring interactions relevant to each user’s background and preferences. However, personalization requires information about users that platforms often collect without their awareness or their enthusiastic consent. Here, we study how the transparency of AI inferences on users’ personal data affects their privacy decisions and sentiments when sharing data for personalization. We conducted two experiments where participants (N=877) answered questions about themselves for personalized public arts recommendations. Participants indicated their consent to let the system use their inferred data and explicitly provided data after awareness of inferences. Our results show that participants chose restrictive consent decisions for sensitive and incorrect inferences about them and for their answers that led to such inferences. Our findings expand existing privacy discourse to inferences and inform future directions for shaping existing consent mechanisms in light of increasingly pervasive AI inferences.},
  booktitle = {Proceedings of the 2024 CHI Conference on Human Factors in Computing Systems},
  articleno = {782},
  numpages  = {21},
  keywords  = {Personalization, consent, inference, privacy},
  location  = {Honolulu, HI, USA},
  series    = {CHI '24},
  selected  = {true}
}
@inproceedings{asthana2023field,
  title     = {Field Experiences and Reflections on Using {LLMs} to Generate Comprehensive Lecture Metadata},
  author    = {Asthana, Sumit and Arif, Taimoor and Collins-Thompson, Kevyn},
  booktitle = {{NeurIPS}'23 Workshop on Generative {AI} for Education ({GAIED})},
  year      = {2023}
}
@article{10.1145/3479503,
  author     = {Asthana, Sumit and Tobar Thommel, Sabrina and Halfaker, Aaron Lee and Banovic, Nikola},
  title      = {Automatically Labeling Low Quality Content on {Wikipedia} By Leveraging Patterns in Editing Behaviors},
  year       = {2021},
  issue_date = {October 2021},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  volume     = {5},
  number     = {CSCW2},
  url        = {https://doi.org/10.1145/3479503},
  doi        = {10.1145/3479503},
  abstract   = {Wikipedia articles aim to be definitive sources of encyclopedic content. Yet, only 0.6\% of Wikipedia articles have high quality according to its quality scale due to insufficient number of Wikipedia editors and enormous number of articles. Supervised Machine Learning (ML) quality improvement approaches that can automatically identify and fix content issues rely on manual labels of individual Wikipedia sentence quality. However, current labeling approaches are tedious and produce noisy labels. Here, we propose an automated labeling approach that identifies the semantic category (e.g., adding citations, clarifications) of historic Wikipedia edits and uses the modified sentences prior to the edit as examples that require that semantic improvement. Highest-rated article sentences are examples that no longer need semantic improvements. We show that training existing sentence quality classification algorithms on our labels improves their performance compared to training them on existing labels. Our work shows that editing behaviors of Wikipedia editors provide better labels than labels generated by crowdworkers who lack the context to make judgments that the editors would agree with.},
  journal    = {Proc. ACM Hum.-Comput. Interact.},
  month      = oct,
  articleno  = {359},
  numpages   = {23},
  keywords   = {content labeling, machine learning, wikipedia}
}
@inproceedings{10.1145/3657604.3664714,
  author    = {Arif, Taimoor and Asthana, Sumit and Collins-Thompson, Kevyn},
  title     = {Generation and Assessment of Multiple-Choice Questions from Video Transcripts using Large Language Models},
  year      = {2024},
  isbn      = {9798400706332},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  url       = {https://doi.org/10.1145/3657604.3664714},
  doi       = {10.1145/3657604.3664714},
  abstract  = {We present an empirical study evaluating the quality of multiple-choice questions (MCQs) generated by Large Language Models (LLMs) from a corpus of video transcripts of course lectures in an online data science degree program. With our database of thousands of generated questions, we conducted both human and automated judging of question quality on a representative sample using a broad set of criteria, including well-established Item Writing Flaw (IWF) categories. We found the number of average IWFs per MCQ ranged from 1.6 (rule-based verification) to 2.18 (LLM-based). Among the most frequently identified MCQ flaws were lack of enough context (17\%) or answer choices with at least one implausible distractor (57\%). Both human and automated assessment identified implausible distractors as one of the most frequent flaw categories. Results from our human annotation study were generally more positive (51--65\% good items) compared to our automated assessment study results, which tended toward greater flaw identification (15--25\% good items), depending on evaluation method.},
  booktitle = {Proceedings of the Eleventh ACM Conference on Learning @ Scale},
  pages     = {530--534},
  numpages  = {5},
  keywords  = {educational video, large language models, question generation},
  location  = {Atlanta, GA, USA},
  series    = {L@S '24}
}
@inproceedings{10.1145/3611643.3616248,
  author    = {Asthana, Sumit and Sajnani, Hitesh and Voyloshnikova, Elena and Acharya, Birendra and Herzig, Kim},
  title     = {A Case Study of Developer Bots: Motivations, Perceptions, and Challenges},
  year      = {2023},
  isbn      = {9798400703270},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  url       = {https://doi.org/10.1145/3611643.3616248},
  doi       = {10.1145/3611643.3616248},
  abstract  = {Continuous integration and deployment (CI/CD) is now a widely adopted development model in practice as it reduces the time from ideas to customers. This adoption has also revived the idea of "shifting left" during software development -- a practice intended to find and prevent defects early in the software delivery process. To assist with that, engineering systems integrate developer bots in the development workflow to improve developer productivity and help them identify issues early in the software delivery process.
In this paper, we present a case study of developer bots in Microsoft. We identify and analyze 23 developer bots that are deployed across 13,000 repositories and assist about 6,000 developers daily in their CI/CD software development workflows. We classify these bots across five major categories: Config Violation, Security, Data-privacy, Developer Productivity, and Code Quality. By conducting interviews and surveys with bot developers and bot users and by analyzing about half a million historical bot actions spanning over one and a half years, we present software workflows that motivate bot instrumentation, factors impacting their usefulness as perceived by bot users, and challenges associated with their use. Our findings echo existing issues with bots, such as noise, and illustrate new benefits (e.g., cross-team communication) and challenges (e.g., too many bots) for large software teams.},
  booktitle = {Proceedings of the 31st ACM Joint European Software Engineering Conference and Symposium on the Foundations of Software Engineering},
  pages     = {1268--1280},
  numpages  = {13},
  keywords  = {devbots, developer productivity, recommendations, software maintenance, software quality},
  location  = {San Francisco, CA, USA},
  series    = {ESEC/FSE 2023}
}
@inproceedings{asthana-etal-2024-evaluating,
  author    = {Asthana, Sumit and Rashkin, Hannah and Clark, Elizabeth and Huot, Fantine and Lapata, Mirella},
  title     = {Evaluating {LLM}s for Targeted Concept Simplification for Domain-Specific Texts},
  editor    = {Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.emnlp-main.357/},
  doi       = {10.18653/v1/2024.emnlp-main.357},
  pages     = {6208--6226},
  abstract  = {One useful application of NLP models is to support people in reading complex text from unfamiliar domains (e.g., scientific articles). Simplifying the entire text makes it understandable but sometimes removes important details. On the contrary, helping adult readers understand difficult concepts in context can enhance their vocabulary and knowledge. In a preliminary human study, we first identify that lack of context and unfamiliarity with difficult concepts is a major reason for adult readers' difficulty with domain-specific text. We then introduce targeted concept simplification, a simplification task for rewriting text to help readers comprehend text containing unfamiliar concepts. We also introduce WikiDomains, a new dataset of 22k definitions from 13 academic domains paired with a difficult concept within each definition. We benchmark the performance of open-source and commercial LLMs and a simple dictionary baseline on this task across human judgments of ease of understanding and meaning preservation. Interestingly, our human judges preferred explanations about the difficult concept more than simplifications of the concept phrase. Further, no single model achieved superior performance across all quality dimensions, and automated metrics also show low correlations with human evaluations of concept simplification ({\textasciitilde}0.2), opening up rich avenues for research on personalized human reading comprehension support.},
  selected  = {true}
}
@misc{bhagwan2023detecting,
  title     = {Detecting Misconfiguration and/or Bug(s) in Large Service(s) Using Correlated Change Analysis},
  author    = {Bhagwan, Ranjita and Maddila, Chandra Sekhar and Kumar, Aditya and Asthana, Sumit and Kumar, Rahul and Mehta, Sonu and Bansal, Chetan and Ashok, Balasubramanyan and Bird, Christian Alma},
  year      = {2023},
  month     = mar # "~7",
  publisher = {Google Patents},
  note      = {US Patent 11,599,354}
}
@article{ion2025adaptive,
  author  = {Ion, Michael and Asthana, Sumit and Jiao, Fengquan and Wang, Tianyi and Collins-Thompson, Kevyn},
  title   = {Adaptive Knowledge Assessment in Simulated Coding Interviews},
  journal = {Proceedings of Machine Learning Research},
  volume  = {273},
  pages   = {1--3},
  year    = {2025}
}