351 lines
60 KiB
HTML
351 lines
60 KiB
HTML
<!DOCTYPE html> <html style lang=en><!--
|
|
Page saved with SingleFile
|
|
url: https://arxiv.org/abs/2406.09155
|
|
saved date: Sun Jan 11 2026 22:52:54 GMT+0100 (Central European Standard Time)
|
|
--><meta charset=utf-8>
|
|
<title>[2406.09155] DefAn: Definitive Answer Dataset for LLMs Hallucination Evaluation</title>
|
|
<meta name=viewport content="width=device-width, initial-scale=1">
|
|
<meta name=msapplication-TileColor content=#da532c>
|
|
<meta name=theme-color content=#ffffff>
|
|
<style media=screen>body{margin:0;padding:0;background-color:#fff;color:#000;font-family:"Lucida Grande",helvetica,arial,verdana,sans-serif}a:link,a:visited,a:active{text-decoration:none;font-weight:normal}a:hover{text-decoration:underline}img{border:0}em{font-weight:bold;font-style:normal}.primary-subject{font-weight:bold}main{flex-grow:1}.flex-wrap-footer{display:flex;min-height:100vh;flex-direction:column}footer ul li{display:flex;align-items:center;font-size:14px}footer ul li a{font-size:13.5px}footer{background-color:hsl(0,0%,95%);color:#000;padding:1em 2em;font-size:0.9rem;-webkit-font-smoothing:antialiased;margin-top:6rem}footer a,footer a:visited{color:#000;text-decoration:none;border-bottom:1px solid transparent;line-height:1.75em}footer a:hover,footer a:active{color:#005e9d;border-bottom:1px dotted #005e9d;text-decoration:none}footer ul{padding:0;margin:0}footer .sorry-app-links .help{font-size:0.75rem;margin-bottom:0;line-height:1.75em}footer .sorry-app-links .help a,footer .sorry-app-links .help a:visited{border-bottom:1px dotted #000}footer .sorry-app-links .help a:hover,footer .sorry-app-links .help a:active{border-bottom:1px dotted #005e9d}footer .sorry-app-links svg.icon{margin-bottom:-2px!important}footer .sorry-app-links .a11y-main-link{font-size:110%;border-bottom:1px solid transparent!important;padding:0;margin:0}@media screen and (max-width:768px){footer .sorry-app-links.column{padding:0}}@media screen and (min-width:769px){.columns{display:flex;flex-direction:row}}.icon{width:.9rem;margin-right:.45em;margin-top:-.15rem}.help{font-family:"Lucida Grande","Helvetica Neue",Helvetica,Arial,sans-serif;display:block;margin-top:0.25rem}#content,#content-inner{margin:.7em;font-size:90%}#abs-outer,#abs{margin:-0.7em}#abs-outer .leftcolumn{margin:0 0 1em 0;padding:0px;width:calc(100% - 18em);float:left}#abs-outer .mobile-submission-download{display:none}#abs-outer .extra-services{float:right;margin:0;width:18em}#abs-outer .extra-services 
span.bib-cite-button{color:rgb(0,0,238);font-weight:bold;padding:.35em;display:block;text-transform:capitalize}#abs-outer .extra-services span.bib-cite-button:hover,#abs-outer .extra-services span.bib-cite-button:focus{text-decoration:underline;cursor:pointer}#labstabs{clear:both;margin:1em 1.5em}#labstabs .labstabs{display:flex;flex-wrap:wrap;white-space:normal;justify-content:flex-start}#labstabs .labstabs>label{order:1;display:block;padding:.75em 1.5em;font-size:.85em;color:#4a4a4a;margin-right:0.2rem;cursor:pointer;font-weight:bold;transition:background ease 0.2s;border-radius:6px 6px 0 0;background-color:#eef5f9}#labstabs .labstabs .tab{order:99;flex-grow:1;width:100%;display:none;padding:1rem;border:1px solid #828282;z-index:1;top:-1px;position:relative}#labstabs .labstabs .tab h1{font-size:1.25em;font-weight:normal}#labstabs .labstabs input[type="radio"]:checked+label{background:#ffffff;border:1px solid #828282;border-bottom:0;z-index:2;color:black}#labstabs .labstabs input[type="radio"]:checked+label+.tab{display:block}@media (max-width:45em){#labstabs{padding-bottom:1em;margin:1em}#labstabs .labstabs .tab,#labstabs .labstabs label{order:initial;width:100%;margin-right:0;margin-top:0.2rem;border-radius:0}#labstabs .labstabs>label{font-size:.8em}#labstabs .labstabs .tab{margin-top:-10px!important;border:2px solid #b8b8b8;border-top:0px;background-color:#ffffff}#labstabs .labstabs label{background-color:#eef5f9;color:black;padding:1em .5em}#labstabs .labstabs .labs-display-bib label{background:transparent;color:black;padding:0}#labstabs .labstabs input[type="radio"]:checked+label{background-color:#acd2e8;color:black;border:2px solid #b8b8b8;border-bottom:0px;padding:1em .5em}}#labstabs .toggle{border:1px dotted #c7d3db;padding:.5em;background-color:#eef5f9}#labstabs .toggle .columns.lab-row{align-items:center}#labstabs .toggle .columns.lab-row .column{padding:.5rem .25rem}#labstabs .column.lab-name{flex-basis:auto;flex-grow:0;font-size:.75rem}#labstabs .tab 
a{display:inline}#labstabs .column.lab-name em{font-size:11px;color:black}#labstabs .column.lab-name em a{line-height:18px;text-decoration:none;padding:0;border:0;font-style:normal;font-size:11px}#labstabs .column.lab-switch{flex-grow:0}#labstabs .toggle .lab-switch label.switch{position:relative;display:inline-block;width:38px;height:20px;padding:0;background-color:transparent;margin:0}#labstabs .toggle .lab-switch .slider{position:absolute;cursor:pointer;top:0;left:0;right:0;bottom:0;border-radius:24px;background-color:#ccc;-webkit-transition:.4s;transition:.4s}#labstabs .toggle .lab-switch .slider:before{position:absolute;content:"";height:16px;width:16px;left:2px;bottom:2px;border-radius:50%;background-color:white;-webkit-transition:.4s;transition:.4s}#abs-outer .subheader{background-color:#eee;color:#000;padding:.25em 0;border-bottom:1px solid #ccc}#abs-outer .subheader h1{margin:0;font-size:1.1em;padding:0 0 .2em 20px;font-weight:bold;font-style:normal}#abs h1.title{margin:.5em 0 .5em 20px;font-size:x-large;font-weight:bold;line-height:120%}#abs .authors{margin:.5em 0 .5em 20px;font-size:medium;line-height:150%}#abs .authors a{font-size:medium}#abs .dateline{margin:.5em 0 .5em 20px;font-style:italic;font-size:small}#abs blockquote.abstract{line-height:1.55;font-size:1.05em;margin-bottom:1.5em}#abs .metatable{font-size:0.92em;line-height:1.5;margin:0 0 1.5em 20px}#abs .tablecell{padding:.1em .5em 0em 0em;vertical-align:top}#abs .arxivid a{font-weight:normal}#abs-outer .submission-history{margin:1.5em 0 1.5em 20px;font-size:90%;line-height:1.5em}#abs-outer .submission-history h2{font-size:120%;margin:0 0 .25em 0;font-weight:bold}#abs-outer .endorsers{margin:1em 0 1.5em 20px;font-size:small;font-style:italic;clear:both}#abs-outer .header-breadcrumbs-mobile{display:none}#abs-outer #abs .dateline{margin-top:15px;margin-bottom:0}#abs-outer #abs h1.title{margin-top:.25em}@media screen and (max-width:768px){#abs-outer 
.header-breadcrumbs-mobile{display:block;color:black;font-size:.85em;margin:.25em 0 .5em 1em}#abs-outer #abs a.mobile-submission-download{display:flex;justify-content:center;margin:.7em .25em;border-radius:10px;background-color:#408bd0;padding:.25em 1em;font-weight:800;color:white;text-decoration:none;font-size:20px;text-align:center}#abs-outer .extra-services h2{font-size:15px;margin-bottom:.5em}#abs-outer .extra-services h3{font-size:14px}#abs-outer .leftcolumn,#abs-outer .extra-services{width:100%;float:left}#abs-outer .extra-services,#abs-outer .extra-services .full-text,#abs-outer .extra-services .browse,#abs-outer .extra-services .extra-ref-cite{border:0px;font-size:12px}#abs-outer .extra-services{-webkit-box-shadow:inset 0px 8px 15px 0px rgba(173,173,173,1);-moz-box-shadow:inset 0px 8px 15px 0px rgba(173,173,173,1);box-shadow:inset 0px 8px 15px 0px rgba(173,173,173,1);background-color:#E6E6E6;margin:0 0 1em 0;padding:1em 0}#abs-outer .extra-ref-cite ul li{float:left}#abs-outer .extra-services .full-text ul,#abs-outer .extra-services .extra-ref-cite ul{list-style:none;margin:0;padding:0}#abs-outer .extra-services .full-text ul li,#abs-outer .extra-services .extra-ref-cite ul li{display:inline-block;margin:0 0 .25em 0;padding:0}#abs-outer .extra-services .bookmarks{margin:1em 0 0 0;border-left:0;padding:.25em .5em 0 1em;border-top:2px solid #cccccc;font-size:1em}#abs-outer .extra-services .bookmarks .abs-button-small{margin-top:.25em}#abs-outer .extra-services .browse{margin-top:.5em;border-top:2px solid #cccccc;padding-top:1em}#abs-outer .extra-services .prevnext{margin-top:.5em}#abs-outer .extra-services .browse .current{color:#AB4B02;display:inline}.abs-switch-cat{margin:0 0 1em 0}.browse .abs-switch-cat .switch{display:inline}.browse .abs-switch-cat .switch a{font-weight:bold}.browse .abs-switch-cat .switch .subclass{padding:0;margin-left:-3px}.abs-switch-cat .subclass:before{content:", "}.abs-button{display:inline-block;border-radius:5px;border:1px solid 
#046BAF;font-size:1.25em;color:#046BAF!important;padding:.5em;background:#E6E6E6;margin-right:.3em}.abs-button-small{font-size:1em;padding:.25em 1em;margin:.75em .5em 0 0}.abs-button-grey{border:1px solid #666666;color:#666666!important}#abs-outer .extra-services span.bib-cite-button{margin:.5em .5em 0 0;display:inline-block!important;border-radius:5px;background:#E6E6E6;border:1px solid #046BAF;font-size:1em!important;padding:.25em 1em;font-weight:normal;text-transform:capitalize}#abs-outer .extra-services .extra-ref-cite ul li{margin:.5em 0;padding:0;height:auto}#abs-outer .extra-services .extra-ref-cite ul li a{margin:.25em .5em .25em 0}#abs-outer .subheader{background-color:#fefefe;padding:.25em 0;border-bottom:1px solid #ccc}#abs-outer .subheader h1{margin:0;font-size:.75em;padding:.2em 0 .2em 1em;font-weight:normal;font-style:normal;color:#b55c06}#abs-outer #abs .dateline{color:#767676;font-size:.85em;font-style:normal;margin:2em 0 0 1em}#abs-outer .submission-history{padding:1em;margin:0;background-color:#f5f5f5;-webkit-box-shadow:inset 0px -6px 15px 0px rgba(219,219,219,1);-moz-box-shadow:inset 0px -6px 15px 0px rgba(219,219,219,1);box-shadow:inset 0px -6px 15px 0px rgba(219,219,219,1)}#abs-outer #abs h1.title{margin:0 .25em 0 .5em;font-size:1.5em}#abs-outer #abs .authors{margin:1em .25em 0 1em;font-size:.9em;line-height:1.5em}#abs-outer #abs .authors a{font-size:inherit}#abs-outer #abs blockquote.abstract{margin:0 1em}#abs-outer #abs .metatable{margin:.75em 0 1.5em 1.5em}#abs-outer #abs a,#abs-outer a,#abs-outer .endorsers a{color:#1777bc}#abs-outer .submission-history a,#abs-outer .abs-switch-cat .switch a,#abs-outer .extra-services .full-text .abs-license a{color:#046BAF}#abs-outer .endorsers{display:block;float:left;border:1px solid #eee;padding:1em;margin:1em}}.mobile-header{background-color:#b31b1b}.mobile-header .columns{height:65px;align-items:center}.mobile-header .column{border-left:1px solid #fc5554;border-right:1px solid 
#731515;height:65px;padding:0 1em;display:flex;align-items:center}.mobile-header .column:first-child{border-left:0}.mobile-header .column:last-child{border-right:0}.mobile-header .column.logo-arxiv{width:100px;flex:none}.mobile-header .column.logo-cornell{display:flex}.mobile-header .column.logo-cornell img{height:45px}.mobile-header .column.nav{justify-content:flex-end;align-self:flex-end}.mobile-header #toggle-container button.toggle-control{background-color:transparent;border-radius:0;border:0;font-size:25px;padding:3px;margin-left:.5em}.mobile-header #toggle-container button.toggle-control svg.icon{width:1.25rem;margin:0}@media screen and (min-width:769px){.mobile-header{display:none}}@media screen and (min-width:426px){.mobile-header .column.nav{flex:none;width:100px}}@media screen and (max-width:500px){.mobile-header .columns{height:80px}.mobile-header .column{height:80px;padding:0 .5em}.mobile-header .column.logo-arxiv{border-right:0!important}.mobile-header .column.logo-cornell{justify-content:flex-end;border-left:0!important}.mobile-header .column.logo-cornell img{height:73px}.mobile-header .column.nav{width:65px;flex:none}}@media screen and (min-width:501px){.mobile-header .column{height:65px}.mobile-header .column.logo-cornell img{height:45px}.mobile-header .column.nav{width:65px}}.extra-services{border-bottom:.35em solid #ddd}.extra-services h3{font-size:medium;font-weight:normal;margin:0 0 0.3em 0;padding-top:0.3em}.full-text{margin:0;padding:.5em 1em .5em 1em;font-size:110%;font-weight:normal;border-bottom:medium solid #ddd;border-left:.35em solid #ddd}.full-text h2{font-size:140%;font-weight:bold;margin:0.1em 0 0 0}.full-text ul{margin:.3em 0 0 1em;padding:0;list-style-type:none}.extra-ref-cite{margin:0;padding:0 1em 0 1em;font-size:90%;border-bottom:medium solid #ddd;border-left:.35em solid #ddd}.extra-ref-cite ul{font-weight:normal;margin:0.3em 0 0 20px;padding-left:0;padding-bottom:0.3em;list-style-type:none}.browse{padding:0 1em 0 
1em;font-size:90%;border-bottom:medium solid #ddd;border-left:.35em solid #ddd}.browse .current{padding:0;font-weight:bold}.browse .prevnext{padding:0.2em 0 0 0}.browse .list{padding:0.2em 0 0.5em 0;font-weight:normal}.browse .switch{font-weight:normal;padding:.2em 0em .7em 0em}.browse .switch .subclass{padding-left:1.5em}.bookmarks{clear:both;margin:0;padding:0 1em .5em 1em;font-size:90%;border-left:.35em solid #ddd}.abs-license{font-size:xx-small;padding-top:0.3em}.abs-license .has_license{display:flex;align-items:center;gap:5px}@media screen and (max-width:768px){.columns.is-mobile{display:flex}}.is-sr-only{border:none!important;clip:rect(0,0,0,0)!important;height:0.01em!important;overflow:hidden!important;padding:0!important;position:absolute!important;white-space:nowrap!important;width:0.01em!important}.column{display:block;flex-basis:0;flex-grow:1;flex-shrink:1;padding:0.75rem}@media screen and (min-width:769px),print{.columns:not(.is-desktop){display:flex}}@media screen and (min-width:1024px){.columns.is-desktop{display:flex}}svg.icon{height:1em!important}.icon.filter-white{fill:#FFFFFF}.icon.filter-black{fill:#000000}.filter-dark_grey{fill:#cccccc}a .icon{transition:fill 0.3s ease}</style>
|
|
<!-- Canonical URL declared for this abstract page. -->
<link rel="canonical" href="https://arxiv.org/abs/2406.09155">
|
|
<!-- Page description, followed by the Open Graph object type. -->
<meta name="description" content="Abstract page for arXiv paper 2406.09155: DefAn: Definitive Answer Dataset for LLMs Hallucination Evaluation">
<meta property="og:type" content="website">
|
|
<meta property=og:site_name content=arXiv.org>
|
|
<meta property=og:title content="DefAn: Definitive Answer Dataset for LLMs Hallucination Evaluation">
|
|
<meta property=og:url content=https://arxiv.org/abs/2406.09155v1>
|
|
<!-- og:image must be an absolute URL: link-preview scrapers do not reliably
     resolve root-relative paths, and a saved copy of this page has no origin
     to resolve them against. -->
<meta property="og:image" content="https://arxiv.org/static/browse/0.3.4/images/arxiv-logo-fb.png">
|
|
<!-- og:image:secure_url is defined as an HTTPS URL for the image, so it needs
     an explicit https:// origin rather than a root-relative path. -->
<meta property="og:image:secure_url" content="https://arxiv.org/static/browse/0.3.4/images/arxiv-logo-fb.png">
|
|
<meta property=og:image:width content=1200>
|
|
<meta property=og:image:height content=700>
|
|
<meta property=og:image:alt content="arXiv logo">
|
|
<!-- Social-preview description. The abstract's trailing LaTeX \href{url}{url}
     macro is written as a plain URL here, because preview cards display this
     text verbatim and do not interpret LaTeX. -->
<meta property="og:description" content="Large Language Models (LLMs) have demonstrated remarkable capabilities, revolutionizing the integration of AI in daily life applications. However, they are prone to hallucinations, generating claims that contradict established facts, deviating from prompts, and producing inconsistent responses when the same prompt is presented multiple times. Addressing these issues is challenging due to the lack of comprehensive and easily assessable benchmark datasets. Most existing datasets are small and rely on multiple-choice questions, which are inadequate for evaluating the generative prowess of LLMs. To measure hallucination in LLMs, this paper introduces a comprehensive benchmark dataset comprising over 75,000 prompts across eight domains. These prompts are designed to elicit definitive, concise, and informative answers. The dataset is divided into two segments: one publicly available for testing and assessing LLM performance and a hidden segment for benchmarking various LLMs. In our experiments, we tested six LLMs-GPT-3.5, LLama 2, LLama 3, Gemini, Mixtral, and Zephyr-revealing that overall factual hallucination ranges from 59% to 82% on the public dataset and 57% to 76% in the hidden benchmark. Prompt misalignment hallucination ranges from 6% to 95% in the public dataset and 17% to 94% in the hidden counterpart. Average consistency ranges from 21% to 61% and 22% to 63%, respectively. Domain-wise analysis shows that LLM performance significantly deteriorates when asked for specific numeric information while performing moderately with person, location, and date queries. Our dataset demonstrates its efficacy and serves as a comprehensive benchmark for LLM performance evaluation. Our dataset and LLMs responses are available at https://github.com/ashikiut/DefAn.">
|
|
<meta name=twitter:site content=@arxiv>
|
|
<meta name=twitter:card content=summary>
|
|
<meta name=twitter:title content="DefAn: Definitive Answer Dataset for LLMs Hallucination Evaluation">
|
|
<meta name=twitter:description content="Large Language Models (LLMs) have demonstrated remarkable capabilities, revolutionizing the integration of AI in daily life applications. However, they are prone to hallucinations, generating...">
|
|
<meta name=twitter:image content=https://static.arxiv.org/icons/twitter/arxiv-logo-twitter-square.png>
|
|
<meta name=twitter:image:alt content="arXiv logo">
|
|
<style media="screen">
  /* Clips the element to a 1px box and takes it out of normal flow. */
  .visually-hidden {
    clip-path: inset(100%);
    clip: rect(1px, 1px, 1px, 1px);
    height: 1px;
    overflow: hidden;
    position: absolute;
    white-space: nowrap;
    width: 1px;
  }

  /* Inline, relatively positioned wrapper around a button. */
  .button-and-tooltip {
    position: relative;
    display: inline;
  }

  /* Strips the default button chrome; leaves 6px of left padding. */
  .button-and-tooltip button {
    background: transparent;
    border: none;
    box-shadow: none;
    position: relative;
    padding: 0 0 0 6px;
  }

  /* Lays the cell's contents out as a vertically centred inline flex row. */
  td.tablecell.arxivdoi {
    display: inline-flex;
    align-items: center;
  }

  /* Zero-size box with transparent 8px side borders. */
  .arrow {
    width: 0;
    height: 0;
    border-left: 8px solid transparent;
    border-right: 8px solid transparent;
  }

  /* "pulsate": scale .1 -> .8 while fading in (30%) and back out (60%).
     Prefixed copy first, standard definition second. */
  @-webkit-keyframes pulsate {
    0% {
      -webkit-transform: scale(.1);
      transform: scale(.1);
      opacity: 0;
    }
    30% {
      opacity: 1;
    }
    60% {
      -webkit-transform: scale(.8);
      transform: scale(.8);
      opacity: 0;
    }
  }

  @keyframes pulsate {
    0% {
      -webkit-transform: scale(.1);
      transform: scale(.1);
      opacity: 0;
    }
    30% {
      opacity: 1;
    }
    60% {
      -webkit-transform: scale(.8);
      transform: scale(.8);
      opacity: 0;
    }
  }
</style>
|
|
<meta name=citation_title content="DefAn: Definitive Answer Dataset for LLMs Hallucination Evaluation"><meta name=citation_author content="Rahman, A B M Ashikur"><meta name=citation_author content="Anwar, Saeed"><meta name=citation_author content="Usman, Muhammad"><meta name=citation_author content="Mian, Ajmal"><meta name=citation_date content=2024/06/13><meta name=citation_online_date content=2024/06/13><meta name=citation_pdf_url content=https://arxiv.org/pdf/2406.09155><meta name=citation_arxiv_id content=2406.09155><meta name=citation_abstract content="Large Language Models (LLMs) have demonstrated remarkable capabilities, revolutionizing the integration of AI in daily life applications. However, they are prone to hallucinations, generating claims that contradict established facts, deviating from prompts, and producing inconsistent responses when the same prompt is presented multiple times. Addressing these issues is challenging due to the lack of comprehensive and easily assessable benchmark datasets. Most existing datasets are small and rely on multiple-choice questions, which are inadequate for evaluating the generative prowess of LLMs. To measure hallucination in LLMs, this paper introduces a comprehensive benchmark dataset comprising over 75,000 prompts across eight domains. These prompts are designed to elicit definitive, concise, and informative answers. The dataset is divided into two segments: one publicly available for testing and assessing LLM performance and a hidden segment for benchmarking various LLMs. In our experiments, we tested six LLMs-GPT-3.5, LLama 2, LLama 3, Gemini, Mixtral, and Zephyr-revealing that overall factual hallucination ranges from 59% to 82% on the public dataset and 57% to 76% in the hidden benchmark. Prompt misalignment hallucination ranges from 6% to 95% in the public dataset and 17% to 94% in the hidden counterpart. Average consistency ranges from 21% to 61% and 22% to 63%, respectively. 
Domain-wise analysis shows that LLM performance significantly deteriorates when asked for specific numeric information while performing moderately with person, location, and date queries. Our dataset demonstrates its efficacy and serves as a comprehensive benchmark for LLM performance evaluation. Our dataset and LLMs responses are available at \href{https://github.com/ashikiut/DefAn}{https://github.com/ashikiut/DefAn}.">
|
|
<style>#MathJax_Message{position:fixed;left:1px;bottom:2px;background-color:#E6E6E6;border:1px solid #959595;margin:0px;padding:2px 8px;z-index:102;color:black;font-size:80%;width:auto;white-space:nowrap}</style><link rel=icon type=image/png sizes=32x32 href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAMAAABEpIrGAAAABGdBTUEAALGPC/xhBQAAACBjSFJNAAB6JgAAgIQAAPoAAACA6AAAdTAAAOpgAAA6mAAAF3CculE8AAACr1BMVEUAAACzICWzICSyrKazrKWzqaKzraazR0mzTE6zaGizBQiz49uzZmSzEBazHySzsam0ICmzICWzICWzICWzICWzICWzICWzICWzICWzICWzICWzICWzICWzICWzICWzICWzrKWzrKWzICWzICWzICWzICWzICWzrKWzrKWzrKWzrKWzICWzICWzICWzICWzICWzrKWzrKWzrKWzrKWzICWzICWzICWzICWzrKWzrKWzrKWzrKWzICWzICWzICWzrKWzrKWzrKWzrKWzICWzICWzICWzICWzrKWzrKWzrKWzICWzICWzICWzHySz4dWzrKWzrKWzrKWzICWzICWzHySzgX2zrqezrKWzrKWzICWzPD+zrKWzrKWzrKWzICWzHySzjYmzr6izraazT1CzLjKzMDSzQUOzrKWzrqezgH2zKi6zHySzICWzrKWzrKWzcW+zICWzICWzrKWzrKWzYWGzICWzrKWzUVKzICWzrKWzrKWznZezQkSzICWzICWzrKWzraazkY2zHiOzICWzICWzjYmzm5WzhIGzODuzAACzHySzHSKzOz6zrqazs6yzICWzICWzODuzrKWzrKWzrKWzICWzICWzODuzo5yzrKWzrKWzICWzICWzICWzGyCzr6ezrKWzrKWzICWzICWzrKWzrKWzrKWzrKWzICWzICWzICWzrKWzrKWzrKWzrKWzICWzICWzICWzrKWzrKWzrKWzrKWzrKWzICWzICWzICWzrKWzrKWzrKWzrKWzICWzICWzICWzrKWzrKWzrKWzICWzICWzrKWzrKWzrKWzrKWzrKWzrKWzrKWzrKWzrKWzrKWzrKWzrKWzrKWzrKWzICWzrKWzHySzkYyzrqezHiOzODuzJCmzqaOzISazraazpZ6zNTizkIuzi4ezraX///8A/VYjAAAA1HRSTlMAAAAAAAAAAAAAAAAAAAAAAAGA95kOB7X6kQ9W8fyaEQIBA3z9oRUllnUKDJz+qBgwxuwxGbquHD3SpQwt1CBL3cEeR+e7JFvm1mX0wSgEbO5HBYXFkfPyYaX++n0EHsH9lwtO8/NRJrr8/ttFGb7+3jpk/v6ecv6sK9j+/vVcQNf+8nUGSfD7dQMHivyVCmr2/vl3A0np28zvWS7V0joht+HJMR2vzCcLvygYp7IVc7QhFJ/9lQkSzhoRl/t2lu0UDY9XDSoLhvjgOwh89a4GB4L2i8A61lMAAAABYktHROQvYjspAAAAB3RJTUUH5wETDS455INCLAAAAgZJREFUOMt10/dfTXEYB/Dz6Ilst3lFueWmjDIysmdkF0Kyyd7Ze5WZcc2ErIhKISops6zs73Pda1zrH/EcoXPqe8+vn/fre57z+T5HURQwuHt4eoHi9AFvH2Fs6gu1nAJo1lz4+beAqkNc0BSAGhHY0ijMQa3+C8TgkNZttKJtOyFCw9r/FYgdOlKncC2Azl2EEF27/RGIEd2JevRE3Ri9erPw6cMCA/r2I+o/AFE/6MBBLCIHg6tpSBTR0HB9rophw1mMGDlqdDRRVEz1nMWYsX4sxsWOJ5owsWbOwjeOgXXSR1v85CmSnF8ydZ
pVCPun6TNm1pY2Wsdt1mwW1oQ5devJcsS58+YvUMXCRSC7Oly8hGjpZxZi2XKJQFyRSPRl5VcGYpVBBlYTkePbGvWItYGSd+C69arYsNEuNm2WDYG4ZauDyLbt+/YdO6X7Vb9BUjILx67dexpKe2i0d9/+HzYWKQcay3IwHDxkP/yT57AcOSq7Cjh2nIs+cfIXUeopyWVC2mn+vjNn08/xGecjaojKpbtwEVwvZbC4fKWagEx/zq9eA0DMyuZKc67rN65JrlGIvBtqP4g3b+VT4u07uqUtKBTmsKLK/hDvFt8jS4lJu/fepfc9H1T9OA8fxVPqY80YAE/KMjX9Y/nTZ/T8hWwz/4mXFa/o9RvnQMG3795/UKf4DUwCyzJ9eBcsAAAAJXRFWHRkYXRlOmNyZWF0ZQAyMDIzLTAxLTE5VDEzOjQ2OjU3KzAwOjAwqJ4w4AAAACV0RVh0ZGF0ZTptb2RpZnkAMjAyMy0wMS0xOVQxMzo0Njo1NyswMDowMNnDiFwAAABXelRYdFJhdyBwcm9maWxlIHR5cGUgaXB0YwAAeJzj8gwIcVYoKMpPy8xJ5VIAAyMLLmMLEyMTS5MUAxMgRIA0w2QDI7NUIMvY1MjEzMQcxAfLgEigSi4A6hcRdPJCNZUAAAAASUVORK5CYII="><style>.sf-hidden{display:none!important}</style><meta http-equiv=content-security-policy content="default-src 'none'; font-src 'self' data:; img-src 'self' data:; style-src 'unsafe-inline'; media-src 'self' data:; script-src 'unsafe-inline' data:; object-src 'self' data:; frame-src 'self' data:;"><style>img[src="data:,"],source[src="data:,"]{display:none!important}</style></head>
|
|
<body class=with-cu-identity><div id=MathJax_Message style=display:none></div>
|
|
|
|
|
|
<div class=flex-wrap-footer>
|
|
<header>
|
|
<!-- Skip link: points at the #content fragment; the is-sr-only class keeps it off-screen. -->
<a class="is-sr-only" href="#content">Skip to main content</a>
|
|
|
|
<div class="columns is-vcentered is-hidden-mobile sf-hidden" id=cu-identity>
|
|
|
|
</div>
|
|
<div id=header class="is-hidden-mobile sf-hidden">
|
|
|
|
|
|
|
|
</div>
|
|
<div class=mobile-header>
|
|
<div class="columns is-mobile">
|
|
<div class="column logo-arxiv"><a href=https://arxiv.org/><img src="data:image/svg+xml;base64,PHN2ZyBpZD0ibG9nb21hcmsiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgdmlld0JveD0iMCAwIDc0LjQ5MiAxMDAuMjUiPjxnIGlkPSJ0aW55Xy1fd2hpdGUiIGRhdGEtbmFtZT0idGlueSAtIHdoaXRlIj48cGF0aCBkPSJNNTg2LjcyLDI1NS42MTZhMy4zNzcsMy4zNzcsMCwwLDEsLjQ0OC4wMzEsNS45MTcsNS45MTcsMCwwLDEsMy41ODEsMi43OWMuNDU0LDEuMTE2LjMxNCwyLjAyMy0xLjMxNSw0LjE0MUw1NjMuMTY4LDI5My42bC04LjU1OC0xMC4wNDcsMjkuMzQ4LTI2LjYxNmE0LjQwNiw0LjQwNiwwLDAsMSwyLjc2Mi0xLjMyMW0wLTEuNWE1Ljc2Niw1Ljc2NiwwLDAsMC0zLjY5LDEuNjQzbC0uMDQxLjAzMi0uMDM4LjAzNUw1NTMuNiwyODIuNDQybC0xLjA3Ny45NzcuOTQzLDEuMTA3LDguNTU4LDEwLjA0NywxLjE0NSwxLjM0NCwxLjE0MS0xLjM0OCwyNi4yNjctMzEuMDIyLjAyMi0uMDI3LjAyMi0uMDI4YzEuNTc0LTIuMDQ2LDIuMzI3LTMuNjIyLDEuNTE2LTUuNjE5YTcuMzA5LDcuMzA5LDAsMCwwLTQuNzc5LTMuNzE0LDUuMDgzLDUuMDgzLDAsMCwwLS42NC0uMDQzWiIgdHJhbnNmb3JtPSJ0cmFuc2xhdGUoLTUyNi4wODYgLTI0NS41NTkpIiBmaWxsPSIjZmZmIi8+PHBhdGggZD0iTTU1My40MjMsMjg0LjU5M2w4Ljk3NywxMC41NThMNTk3LjkxMSwzMzcuOWMuODczLDEuMDkzLDEuNDE5LDIuMTg2LDEuMDQ3LDMuNDE4YTQuMDkyLDQuMDkyLDAsMCwxLTIuNzIxLDIuODM3LDMuNTU3LDMuNTU3LDAsMCwxLTEuMDQ1LjE1OSw0LDQsMCwwLDEtMi42ODctMS4xMjRMNTQ4LjAxLDMwMC44MDhjLTMuNS0zLjUtMi45NzEtOC4xNTEuNDM2LTExLjU1OGw0Ljk3Ny00LjY1N20uMTI0LTIuMTdMNTUyLjQsMjgzLjVsLTQuOTc2LDQuNjU2Yy00LjE5Miw0LjE5MS00LjM3Miw5LjgxNi0uNDczLDEzLjcxNGw0NC41MjEsNDIuNGE1LjQ4NSw1LjQ4NSwwLDAsMCwzLjcyMiwxLjUzOCw1LjEsNS4xLDAsMCwwLDEuNDgzLS4yMjQsNS41OSw1LjU5LDAsMCwwLDMuNzE5LTMuODM4LDUuMTc2LDUuMTc2LDAsMCwwLTEuMzEtNC43ODhsLTM1LjUzLTQyLjc2Ny04Ljk4OC0xMC41NzEtMS4wMTktMS4yWiIgdHJhbnNmb3JtPSJ0cmFuc2xhdGUoLTUyNi4wODYgLTI0NS41NTkpIiBmaWxsPSIjZmZmIi8+PHBhdGggZD0iTTU2Mi40LDI5NS4xNTFsOS41NTYsMTEuNSw1Ljc2MS01LjM1NmE3LjkyNiw3LjkyNiwwLDAsMCwuMDQxLTExLjc0M2wtNDMuNy00MS45MjNzLTEuNjcxLTIuMDI5LTMuNDM3LTIuMDcxYTQuNDksNC40OSwwLDAsMC00LjIzLDIuNzE4Yy0uNjg4LDEuNjUxLS4xOTQsMi44MDksMS4zMTUsNC45N2wyOS4zMDYsMzUuNTY1WiIgdHJhbnNmb3JtPSJ0cmFuc2xhdGUoLTUyNi4wODYgLTI0NS41NTkpIiBmaWxsPSIjZmZmIi8+PHBhdGggZD0iTTU1My43LDMwNi4yMjNsLTE3LjExNiwyMS4wMjRjLTEuMjU1LD
EuMzM3LTIuMDMyLDMuNjgzLTEuMzMxLDUuMzY3YTQuNTg3LDQuNTg3LDAsMCwwLDQuMjg3LDIuODQxLDQuMDg3LDQuMDg3LDAsMCwwLDMuMDgyLTEuNTIzbDIwLjMyOC0xOC45WiIgdHJhbnNmb3JtPSJ0cmFuc2xhdGUoLTUyNi4wODYgLTI0NS41NTkpIiBmaWxsPSIjZmZmIi8+PHBhdGggZD0iTTU5Mi4wNzQsMjUwLjU0NyIgdHJhbnNmb3JtPSJ0cmFuc2xhdGUoLTUyNi4wODYgLTI0NS41NTkpIiBmaWxsPSIjZmZmIiBzdHJva2U9IiMwMDAiIHN0cm9rZS1taXRlcmxpbWl0PSIxMCIgc3Ryb2tlLXdpZHRoPSIwLjI1Ii8+PC9nPjwvc3ZnPg==" alt="arXiv logo" style=height:60px></a></div>
|
|
<div class="column logo-cornell"><a href=https://www.cornell.edu/>
|
|
<picture>
|
|
|
|
|
|
<img src=data:image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0idXRmLTgiPz4KPCEtLSBHZW5lcmF0b3I6IEFkb2JlIElsbHVzdHJhdG9yIDE5LjIuMSwgU1ZHIEV4cG9ydCBQbHVnLUluIC4gU1ZHIFZlcnNpb246IDYuMDAgQnVpbGQgMCkgIC0tPgo8c3ZnIHZlcnNpb249IjEuMSIgaWQ9IkxheWVyXzEiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgeG1sbnM6eGxpbms9Imh0dHA6Ly93d3cudzMub3JnLzE5OTkveGxpbmsiIHg9IjBweCIgeT0iMHB4IgoJIHZpZXdCb3g9IjAgMCAxMjAgMTIwIiBzdHlsZT0iZW5hYmxlLWJhY2tncm91bmQ6bmV3IDAgMCAxMjAgMTIwOyIgeG1sOnNwYWNlPSJwcmVzZXJ2ZSI+CjxnPgoJPHBhdGggZD0iTTQ1LjksOTEuN2M2LjgsNS4zLDEzLjUsOC4yLDEzLjgsOC4zbDAuNSwwLjJsMC41LTAuMmMwLjMtMC4xLDYuOS0zLDEzLjctOC4zYzkuMS03LjEsMTMuOS0xNS4yLDEzLjktMjMuNWwwLTM3LjZsLTU2LjQsMAoJCWwwLDM3QzMxLjksNzYuMSwzNi43LDg0LjUsNDUuOSw5MS43eiBNNTguOCw4MC4xYy0wLjMtMC4xLTAuNi0wLjItMS0wLjJjMCwwLDAsMC0wLjEsMGMtMS40LDAtMi40LDAuMS0zLjUsMC4yCgkJYy0xLjEsMC4xLTIuMiwwLjItMy44LDAuMmMtMC43LDAtMS40LDAtMS45LDBWNjIuN2MwLjEsMCwwLjIsMCwwLjIsMGMwLjcsMCwxLjUsMCwyLjYsMGMwLjksMCwxLjktMC4xLDMtMC4yCgkJYzAuNi0wLjEsMS4zLTAuMSwyLTAuMmMxLjItMC4xLDIuMiwwLjEsMi4yLDAuMWMwLjEsMCwwLjIsMC4xLDAuMywwLjFDNTguOCw2Mi40LDU4LjgsODAuMSw1OC44LDgwLjF6IE04NS42LDY4LjIKCQljMCwxNS40LTE4LjksMjUuOC0yNC41LDI4LjZ2LTE0YzAuMS0wLjEsMC4yLTAuMiwwLjMtMC4zYzAuMy0wLjMsMC41LTAuNSwxLTAuNWMxLjMsMCwyLjMsMC4xLDMuNCwwLjJjMS4xLDAuMSwyLjIsMC4yLDMuOSwwLjIKCQljMS43LDAsMywwLDMsMGwwLjksMGwwLTUuMmgyLjJ2LTIuOGgtMi4ybDAtNS4xaDIuMnYtMi44aC0yLjJsMC01LjdsLTEsMGMtMC40LDAtMC44LDAtMS4zLDBjLTAuNywwLTEuNSwwLTIuNSwwCgkJYy0wLjgsMC0xLjctMC4xLTIuOC0wLjJjLTAuNi0wLjEtMS4zLTAuMS0yLTAuMmMtMS41LTAuMi0yLjgsMC4xLTIuOSwwLjFjMCwwLDAsMC0wLjEsMFY1NWgyNC41TDg1LjYsNjguMnogTTYxLjEsODAuMlY2Mi41CgkJYzAuMS0wLjEsMC4zLTAuMSwwLjUtMC4yYzAsMCwxLjEtMC4yLDIuMy0wLjFjMC43LDAuMSwxLjQsMC4xLDIsMC4yYzEuMSwwLjEsMi4xLDAuMiwzLDAuMmMwLjksMCwxLjYsMCwyLjMsMGMwLjIsMCwwLjQsMCwwLjUsMAoJCWwwLDE3LjdjLTAuNSwwLTEuMiwwLTEuOSwwYy0xLjYsMC0yLjctMC4xLTMuOC0wLjJjLTEuMS0wLjEtMi4xLTAuMi0zLjUtMC4yQzYxLjksODAsNjEuNSw4MC4xLDYxLjEsODAuMnogTTM0LjYsMzMuM2w1MSwwbDAsMTkuMQoJCWgtNTFMMzQuNiwzMy4zeiBNMzQuNiw1NWgyNC4zdjUuM2MtMC40LTAuM
S0xLjUtMC4yLTIuNy0wLjFjLTAuNywwLjEtMS40LDAuMS0yLDAuMmMtMS4xLDAuMS0yLDAuMi0yLjgsMC4yYy0xLDAtMS44LDAtMi41LDAKCQljLTAuNSwwLTAuOSwwLTEuMywwbC0xLDB2NS4xaC0yLjR2Mi44aDIuNHY1LjFoLTIuNHYyLjhoMi40djUuN2wwLjksMGMwLjEsMCwxLjQsMC4xLDMsMGMxLjcsMCwyLjktMC4xLDMuOS0wLjIKCQljMS4xLTAuMSwyLTAuMiwzLjMtMC4yYzAuNSwwLDAuNywwLjIsMC45LDAuNWMwLDAsMC4xLDAuMSwwLjEsMC4xdjE0LjFDNTMsOTMuOCwzNC42LDgzLjUsMzQuNiw2Ny42TDM0LjYsNTV6Ii8+Cgk8cGF0aCBkPSJNNTUuOSwzNS4ySDQxLjZ2MWMwLDMuMSwwLDguNCwwLDguNmMwLjEsMS4xLDAuOCwyLjEsMi4yLDNjMS43LDEuMiw0LjMsMi43LDQuNCwyLjdsMC41LDAuM2wwLjUtMC4zCgkJYzAuMS0wLjEsMi45LTEuNiw0LjYtMi43YzItMS4zLDIuMS0yLjYsMi4xLTNjMC0wLjIsMC00LjgsMC04LjZWMzUuMnogTTQzLjYsMzcuMmgxMC4yYzAsMC44LDAsMS43LDAsMi42SDQzLjYKCQlDNDMuNiwzOC45LDQzLjYsMzgsNDMuNiwzNy4yeiBNNTMuOSw0NC43YzAsMC41LTAuNywxLTEuMiwxLjRjLTEuMiwwLjgtMy4xLDEuOS00LDIuNGMtMC44LTAuNS0yLjYtMS41LTMuOC0yLjQKCQljLTAuOC0wLjUtMS4zLTEuMS0xLjMtMS41YzAtMC4xLDAtMS4zLDAtMi45aDEwLjJDNTMuOSw0My4zLDUzLjksNDQuNSw1My45LDQ0Ljd6Ii8+Cgk8cGF0aCBkPSJNODAuMSwzNC44SDY1LjZ2MWMwLDMuOSwwLDguNiwwLDguOWMwLjEsMS4xLDAuOCwyLjEsMi4yLDNjMS44LDEuMiw0LjQsMi43LDQuNSwyLjhsMC41LDAuM2wwLjUtMC4zCgkJYzAuMS0wLjEsMy0xLjYsNC43LTIuOGMyLjEtMS40LDIuMS0yLjYsMi4xLTNjMC0wLjIsMC00LjksMC04LjhMODAuMSwzNC44eiBNNzguMSw0NC41YzAsMC41LTAuNywxLjEtMS4yLDEuNAoJCWMtMS4zLDAuOC0zLjIsMS45LTQuMSwyLjRjLTAuOS0wLjUtMi42LTEuNi0zLjktMi40Yy0wLjQtMC4zLTEuMy0wLjktMS4zLTEuNWMwLTAuMSwwLTAuMiwwLTAuNWw1LTMuNmw1LjUsMy43CgkJQzc4LjEsNDQuMyw3OC4xLDQ0LjUsNzguMSw0NC41eiBNNzIuNiwzNy44bC01LDMuNmMwLTEuNCwwLTMuMiwwLTQuN2gxMC42YzAsMS42LDAsMy4zLDAsNC44TDcyLjYsMzcuOHoiLz4KCTxwYXRoIGQ9Ik02My43LDY2LjVsMSwwLjFjMC42LDAuMSwxLjIsMC4xLDEuNywwLjFsMi45LDAuMWwwLTJsLTIuOS0wLjFjLTAuNCwwLTAuOS0wLjEtMS41LTAuMWwtMS0wLjFjLTAuMywwLTAuNSwwLTAuNywwbDAuMSwyCgkJQzYzLjQsNjYuNCw2My42LDY2LjQsNjMuNyw2Ni41eiIvPgoJPHBhdGggZD0iTTY2LjQsNzAuMmMtMC40LDAtMC45LTAuMS0xLjUtMC4xYy0wLjMsMC0wLjctMC4xLTEtMC4xYy0wLjMsMC0wLjUsMC0wLjcsMGwwLDJjMC4xLDAsMC4zLDAsMC41LDBjMC40LDAsMC43LDAuMSwxLDAuMQoJCWMwLjYsMC4xLDEuMSwwLjEsMS42LDAuMWMwLjYsMCwxLjUsMCwyLjIsMGwwLjgsMGwwLTJsLTAuO
CwwQzY3LjksNzAuMyw2Nyw3MC4yLDY2LjQsNzAuMnoiLz4KCTxwYXRoIGQ9Ik02Ni40LDc1LjhjLTAuNCwwLTAuOS0wLjEtMS41LTAuMWMtMC4zLDAtMC43LTAuMS0xLTAuMWMtMC4zLDAtMC41LDAtMC43LDBsMC4xLDJjMC4xLDAsMC4zLDAsMC41LDBjMC40LDAsMC43LDAuMSwxLDAuMQoJCWMwLjYsMC4xLDEuMiwwLjEsMS42LDAuMWwwLjgsMGMwLjgsMCwxLjgsMCwyLjEsMC4xbDAtMmMtMC4zLDAtMS4zLDAtMi4xLTAuMUw2Ni40LDc1Ljh6Ii8+Cgk8cGF0aCBkPSJNNTMuOCw2Ni43YzAuNSwwLDEtMC4xLDEuNy0wLjFjMC4zLDAsMC42LTAuMSwxLTAuMWMwLjIsMCwwLjMsMCwwLjUsMGwwLTJjLTAuMiwwLTAuNCwwLTAuNywwYy0wLjQsMC0wLjcsMC4xLTEsMC4xCgkJYy0wLjYsMC4xLTEuMSwwLjEtMS41LDAuMWwtMi45LDAuMWwwLDJMNTMuOCw2Ni43eiIvPgoJPHBhdGggZD0iTTU1LjMsNzAuMWMtMC42LDAuMS0xLjEsMC4xLTEuNSwwLjFsLTIuOSwwLjFsMCwybDIuOS0wLjFjMC41LDAsMS0wLjEsMS43LTAuMWwwLjktMC4xYzAuMiwwLDAuMywwLDAuNCwwbDAuMS0yCgkJYy0wLjIsMC0wLjQsMC0wLjcsMEw1NS4zLDcwLjF6Ii8+Cgk8cGF0aCBkPSJNNTUuMyw3NS42Yy0wLjYsMC4xLTEuMSwwLjEtMS42LDAuMWMtMC45LDAtMi40LDAuMS0yLjgsMC4xbDAsMmMwLjQsMCwxLjktMC4xLDIuOS0wLjFjMC41LDAsMS0wLjEsMS43LTAuMWwxLTAuMQoJCWMwLjEsMCwwLjMsMCwwLjQsMGwwLTJjLTAuMiwwLTAuNCwwLTAuNywwTDU1LjMsNzUuNnoiLz4KCTxwYXRoIGQ9Ik05LjMsNjAuM2MyLjUsMCw0LTEuNiw0LjItNC40YzAtMC41LDAtMS4yLTAuMS0xLjlsMC0wLjJsLTIuMSwwbDAuMiwwLjRjMC4xLDAuMywwLjIsMSwwLjIsMS42Yy0wLjEsMS40LTEuMiwyLjMtMi43LDIuMgoJCWMtMS42LTAuMS0yLjUtMS4xLTIuNS0yLjVjMC0wLjUsMC4xLTEsMC41LTEuNWwwLjMtMC40bC0yLTAuM2wtMC4xLDAuMmMtMC4zLDAuNi0wLjQsMS4zLTAuNSwxLjljLTAuMiwyLjksMS40LDQuOCw0LjMsNC45CgkJQzkuMSw2MC4zLDkuMiw2MC4zLDkuMyw2MC4zeiIvPgoJPHBhdGggZD0iTTkuNyw1MC41YzAuNiwwLjIsMS4xLDAuMiwxLjYsMC4yYzEuOSwwLDMuMy0xLjEsMy45LTNjMC4zLTEuMiwwLjItMi4yLTAuMy0zLjJjLTAuNi0xLTEuNi0xLjctMi45LTIuMQoJCWMtMC41LTAuMi0xLjEtMC4yLTEuNS0wLjJjLTEuOSwwLTMuMywxLjEtMy45LDNDNS44LDQ3LjcsNyw0OS43LDkuNyw1MC41eiBNOC4zLDQ1LjhjMC4zLTEuMSwxLjItMS4zLDEuOS0xLjMKCQljMC4zLDAsMC43LDAuMSwxLjEsMC4yYzAuOSwwLjMsMS42LDAuNywxLjksMS4zYzAuMiwwLjQsMC4zLDAuOCwwLjEsMS4zYy0wLjMsMS4xLTEuMiwxLjMtMS45LDEuM2MtMC40LDAtMC43LTAuMS0xLjEtMC4yCgkJQzkuMSw0OCw3LjksNDcuMSw4LjMsNDUuOHoiLz4KCTxwYXRoIGQ9Ik0xOC4zLDM5LjNsLTIuOS0xLjZsMC4xLTAuMmMwLjMtMC41LDAuNS0wLjUsMC45LTAuNGwyLjksMC40bDEuMS0yL
jFsLTMuOC0wLjRjLTAuMywwLTAuNSwwLTAuNywwYzAtMC44LTAuNC0xLjYtMS4yLTIKCQljLTAuNC0wLjItMC44LTAuMy0xLjItMC4zYy0xLjQsMC0yLjEsMS40LTIuOCwyLjdsLTEsMS45bDcuNiw0LjFMMTguMywzOS4zeiBNMTMuOCwzNi42bC0wLjEsMC4ybC0xLjQtMC43bDAuMS0wLjIKCQljMC4zLTAuNiwwLjctMS4yLDEuMy0wLjljMC4yLDAuMSwwLjMsMC4yLDAuNCwwLjRDMTQuMiwzNS44LDE0LDM2LjMsMTMuOCwzNi42eiIvPgoJPHBvbHlnb24gcG9pbnRzPSIyMywzMS43IDE5LjEsMjguNSAyNC43LDI5LjggMjYuMiwyNy45IDE5LjYsMjIuNCAxOC4yLDI0IDIyLjIsMjcuMyAxNi42LDI2IDE1LDI3LjkgMjEuNywzMy40IAkiLz4KCTxwb2x5Z29uIHBvaW50cz0iMzIuNywyMi40IDMxLjUsMjEgMjkuMSwyMi45IDI4LDIxLjYgMzAuMiwxOS44IDI5LDE4LjQgMjYuOCwyMC4yIDI2LDE5LjEgMjguNCwxNy4yIDI3LjIsMTUuNyAyMy4xLDE5IDI4LjUsMjUuNyAKCQkJIi8+Cgk8cG9seWdvbiBwb2ludHM9IjM5LjgsMTguMSAzOC45LDE2LjUgMzYuMiwxOCAzMi45LDEyLjEgMzEsMTMuMiAzNS4yLDIwLjcgCSIvPgoJPHBvbHlnb24gcG9pbnRzPSI0Ny4xLDE1LjMgNDYuNCwxMy42IDQzLjYsMTQuNiA0MS4zLDguMyAzOS4yLDkgNDIuMSwxNy4xIAkiLz4KCTxwYXRoIGQ9Ik01NS44LDEzLjljMC4yLDAsMC4zLDAsMC41LDBjMS4yLTAuMSwyLjEtMC41LDIuNi0xLjFjMC41LTAuNiwwLjctMS41LDAuNi0yLjVsLTAuNC01LjRMNTYuOSw1bDAuNCw0LjkKCQljMC4xLDEuNy0wLjYsMi0xLjIsMi4xYy0wLjEsMC0wLjEsMC0wLjIsMGMtMC41LDAtMS4yLTAuMi0xLjQtMS45bC0wLjQtNC45bC0yLjIsMC4ybDAuNCw1LjRDNTIuNywxMi44LDUzLjgsMTMuOSw1NS44LDEzLjl6Ii8+Cgk8cG9seWdvbiBwb2ludHM9IjY1LjcsOC44IDY3LjUsMTQuMSA3MCwxNC41IDcxLjEsNS45IDY5LDUuNiA2OC4zLDEwLjcgNjYuNSw1LjMgNjQuMSw1IDYyLjksMTMuNSA2NSwxMy44IAkiLz4KCTxwb2x5Z29uIHBvaW50cz0iNzcuNiw3LjggNzUuNSw3LjEgNzIuOSwxNS4zIDc1LDE2IAkiLz4KCTxwb2x5Z29uIHBvaW50cz0iODguNCwxMi42IDg2LjUsMTEuNiA4Mi4zLDE1LjkgODMuMiwxMCA4MS4xLDguOSA3OS44LDE4IDgyLDE5IAkiLz4KCTxwb2x5Z29uIHBvaW50cz0iOTAuNiwyNSA5MS43LDIzLjUgODkuMywyMS42IDkwLjMsMjAuMyA5Mi41LDIyIDkzLjcsMjAuNSA5MS41LDE4LjggOTIuMywxNy43IDk0LjcsMTkuNiA5NS45LDE4LjEgOTEuNywxNC45IAoJCTg2LjQsMjEuNyAJIi8+Cgk8cGF0aCBkPSJNOTQuMiwyOC42bDIuNS0yLjJsMC4xLDAuMmMwLjMsMC40LDAuMywwLjUsMC4xLDFsLTEuNCwyLjZsMS42LDEuOGwxLjctMy40YzAuMS0wLjEsMC4yLTAuNCwwLjItMC43CgkJYzAuNywwLjMsMS42LDAuMiwyLjMtMC40YzEuOS0xLjYsMC40LTMuMy0wLjctNC42bC0xLjQtMS42TDkyLjcsMjdMOTQuMiwyOC42eiBNOTkuMywyNC4xbDAuMSwwLjJjMC4zLDAuN
CwwLjYsMC44LDAuNiwxLjEKCQljMCwwLjItMC4xLDAuMy0wLjMsMC40Qzk5LjYsMjYsOTkuNCwyNiw5OS4zLDI2Yy0wLjQsMC0wLjgtMC40LTEtMC43bC0wLjEtMC4yTDk5LjMsMjQuMXoiLz4KCTxwYXRoIGQ9Ik0xMDIuMiwzNS41Yy0wLjQsMC4zLTEtMC4yLTEuMi0wLjZjLTAuMy0wLjUtMC40LTEtMC40LTEuN2wwLTAuNWwtMiwwLjlsMC4xLDAuNGMwLjIsMC43LDAuMywxLjEsMC43LDEuOAoJCWMwLjcsMS4yLDEuNiwxLjgsMi42LDEuOGMwLjUsMCwwLjktMC4xLDEuNC0wLjRjMS41LTAuOSwxLjMtMi4zLDEuMS0zLjRjLTAuMS0wLjctMC4yLTEuMiwwLjEtMS40YzAuMi0wLjEsMC4zLTAuMSwwLjQtMC4xCgkJYzAuNCwwLDAuNywwLjQsMC44LDAuNWMwLjMsMC40LDAuNCwwLjksMC41LDEuNGwwLjEsMC40bDEuOC0wLjhsMC0wLjJjLTAuMS0wLjYtMC40LTEuMi0wLjgtMS45Yy0wLjktMS42LTIuNS0yLjEtMy45LTEuMgoJCWMtMS41LDAuOS0xLjMsMi4zLTEuMSwzLjNDMTAyLjUsMzQuOCwxMDIuNiwzNS4zLDEwMi4yLDM1LjV6Ii8+Cgk8cmVjdCB4PSIxMDIuNSIgeT0iMzkuNiIgdHJhbnNmb3JtPSJtYXRyaXgoMC45MjU5IC0wLjM3NzcgMC4zNzc3IDAuOTI1OSAtNy40NTU5IDQzLjM1NjgpIiB3aWR0aD0iOC42IiBoZWlnaHQ9IjIuMiIvPgoJPHBvbHlnb24gcG9pbnRzPSIxMTEuMiw0Ni4yIDEwNC43LDQ3LjggMTA1LjIsNDkuOSAxMTEuOCw0OC4zIDExMi4zLDUwLjUgMTE0LjEsNTAuMSAxMTIuNSw0My41IDExMC43LDQ0IAkiLz4KCTxwb2x5Z29uIHBvaW50cz0iMTA2LjIsNTYuNiAxMDYuMyw1OC44IDEwOS43LDU4LjYgMTE1LjEsNjEuNCAxMTUsNTkgMTExLjgsNTcuNSAxMTQuOCw1NS42IDExNC42LDUzIDEwOS42LDU2LjUgCSIvPgoJPHBvbHlnb24gcG9pbnRzPSIxMSw3Ni40IDEyLjgsNzUuOSAxMi4xLDczLjMgMTMuNCw3Mi45IDE0LjIsNzUuNyAxNiw3NS4xIDE0LjYsNzAuMyA2LjMsNzIuNyA2LjksNzQuOCAxMC4zLDczLjggCSIvPgoJPHBhdGggZD0iTTE4LjcsODMuNmMwLjMtMSwwLjItMi4xLTAuMy0zLjJjLTAuOC0xLjYtMi0yLjQtMy42LTIuNGMtMC43LDAtMS41LDAuMi0yLjMsMC42Yy0xLjMsMC42LTIuMSwxLjUtMi41LDIuNgoJCWMtMC4zLDEtMC4yLDIuMSwwLjMsMy4yYzAuOCwxLjUsMiwyLjQsMy42LDIuNGMwLjcsMCwxLjUtMC4yLDIuMy0wLjZDMTcuNSw4NS41LDE4LjQsODQuNywxOC43LDgzLjZ6IE0xNS4zLDg0LjEKCQljLTAuNSwwLjMtMS4xLDAuNC0xLjYsMC40Yy0wLjgsMC0xLjMtMC4zLTEuNi0xYy0wLjItMC41LTAuMy0wLjktMC4xLTEuM2MwLjItMC42LDAuOC0xLjIsMS42LTEuNmMwLjUtMC4yLDEtMC40LDEuNS0wLjQKCQljMC44LDAsMS4zLDAuMywxLjYsMWMwLjIsMC41LDAuMywwLjksMC4xLDEuM0MxNi42LDgzLjEsMTYuMSw4My43LDE1LjMsODQuMXoiLz4KCTxwYXRoIGQ9Ik0yNC43LDkwLjNsLTMuOSwzLjFjLTEsMC44LTEuOCwwLjktMi40LDAuMmMtMC4yLTAuMy0wLjMtMC42LTAuMy0wLjljMC4xL
TAuNCwwLjQtMC45LDEtMS40bDMuOS0zLjFsLTEuNC0xLjdsLTQuMywzLjQKCQljLTAuOSwwLjctMS40LDEuNS0xLjQsMi40YzAsMC44LDAuMywxLjYsMSwyLjVjMC44LDEuMSwxLjcsMS42LDIuNywxLjZjMC43LDAsMS40LTAuMywyLjEtMC45TDI2LDkyTDI0LjcsOTAuM3oiLz4KCTxwb2x5Z29uIHBvaW50cz0iMzIsOTcuMyAyOC44LDEwMS4zIDMwLDk1LjcgMjguMSw5NC4xIDIyLjYsMTAwLjggMjQuMywxMDIuMiAyNy41LDk4LjIgMjYuMywxMDMuOCAyOC4yLDEwNS40IDMzLjcsOTguNyAJIi8+Cgk8cGF0aCBkPSJNMzguNCwxMDEuNGwtMi4yLTEuMWwtMy45LDcuN2wyLjIsMS4xYzEuMSwwLjUsMiwwLjgsMi45LDAuOGMxLjUsMCwyLjctMC44LDMuNS0yLjRDNDIuMiwxMDQuOSw0MS4zLDEwMi45LDM4LjQsMTAxLjR6CgkJIE0zOC44LDEwNi41Yy0wLjQsMC45LTEuMiwxLjQtMi4xLDEuNGMtMC40LDAtMC43LTAuMS0xLjEtMC4zbC0wLjUtMC4zbDIuMi00LjRsMC41LDAuM2MwLjYsMC4zLDEsMC44LDEuMiwxLjMKCQlDMzkuMiwxMDUuMiwzOS4xLDEwNS44LDM4LjgsMTA2LjV6Ii8+Cgk8cG9seWdvbiBwb2ludHM9IjQzLjIsMTEyLjggNDguMywxMTQuMiA0OC44LDExMi40IDQ1LjgsMTExLjYgNDYuMiwxMTAgNDksMTEwLjcgNDkuNSwxMDguOSA0Ni43LDEwOC4yIDQ3LDEwNi44IDUwLjEsMTA3LjYgCgkJNTAuNSwxMDUuOCA0NS40LDEwNC41IAkiLz4KCTxwYXRoIGQ9Ik01NS45LDEwNi41bC0yLjUtMC4xbC0wLjUsOC42bDIuNSwwLjFjMC4yLDAsMC40LDAsMC42LDBjMi45LDAsNC40LTEuNCw0LjYtNC4xYzAuMS0xLjMtMC4yLTIuNC0wLjktMy4yCgkJQzU4LjksMTA3LjEsNTcuNywxMDYuNiw1NS45LDEwNi41eiBNNTUuOCwxMTMuM2wtMC42LDBsMC4zLTQuOWwwLjYsMGMxLjIsMC4xLDIuMywxLDIuMiwyLjZDNTguMiwxMTIuNiw1NywxMTMuMyw1NS44LDExMy4zeiIvPgoJPHBhdGggZD0iTTY4LjIsMTA1LjhsLTEuNSw5LjFsMi4zLTAuNWwwLjMtMS45bDIuOC0wLjZsMS4xLDEuN2wyLjQtMC41bC01LjEtNy43TDY4LjIsMTA1Ljh6IE02OS42LDExMC42bDAuMy0yLjNsMS4yLDEuOQoJCUw2OS42LDExMC42eiIvPgoJPHBvbHlnb24gcG9pbnRzPSI3Ni40LDExMC4zIDc3LjIsMTEyLjQgNzkuMiwxMTEuNiA3OC41LDEwOS42IAkiLz4KCTxwYXRoIGQ9Ik04MC42LDEwMS42bC0yLjIsMS4ybDQuMSw3LjZsMi4yLTEuMmMyLjktMS42LDMuNi0zLjYsMi4zLTYuMUM4NS43LDEwMC41LDgzLjUsMTAwLDgwLjYsMTAxLjZ6IE04NC4xLDEwNy40bC0wLjUsMC4zCgkJbC0yLjMtNC4zbDAuNS0wLjNjMC4zLTAuMiwwLjctMC4zLDEuMS0wLjNjMC45LDAsMS42LDAuNSwyLjEsMS4zQzg1LjgsMTA1LjUsODUuMSwxMDYuOCw4NC4xLDEwNy40eiIvPgoJPHBvbHlnb24gcG9pbnRzPSI4OC43LDEwNC4zIDg5LjksMTA2LjEgOTEuNywxMDQuOCA5MC41LDEwMyAJIi8+Cgk8cG9seWdvbiBwb2ludHM9IjkyLjEsOTMuMiA5MS43LDk2LjYgOTMuNiw5Ni45IDkzLjgsO
TUuNSA5Ny45LDk5LjYgOTkuNSw5OC4xIDkzLjQsOTEuOSAJIi8+Cgk8cGF0aCBkPSJNMTAzLjgsODguN2MtMC43LTAuNS0xLjUtMC42LTIuNC0wLjNjMC0wLjgtMC40LTEuNi0xLTJjLTEuMS0wLjgtMi41LTAuMy0zLjYsMS4xYy0xLDEuNC0wLjksMi44LDAuMywzLjcKCQljMC42LDAuNCwxLjQsMC41LDIuMSwwLjNjLTAuMSwwLjksMC4yLDEuNiwwLjksMi4xYzAuNSwwLjQsMSwwLjUsMS41LDAuNWMwLjksMCwxLjctMC41LDIuNS0xLjVjMC42LTAuOCwwLjktMS42LDAuOC0yLjQKCQlDMTA0LjgsODkuNywxMDQuNCw4OS4yLDEwMy44LDg4Ljd6IE05OS43LDg5LjNjLTAuMywwLjItMC42LDAuNC0wLjksMC40Yy0wLjEsMC0wLjMsMC0wLjQtMC4xYy0wLjQtMC4zLTAuMi0wLjgsMC0xLjEKCQljMC4xLTAuMiwwLjMtMC4zLDAuNS0wLjNjMC4xLDAsMC4zLDAuMSwwLjUsMC4yQzk5LjcsODguNiw5OS44LDg5LDk5LjcsODkuM3ogTTEwMi43LDkxLjdjLTAuMywwLjQtMC44LDAuNS0xLjIsMC4yCgkJYy0wLjMtMC4zLTAuNS0wLjYtMC40LTEuMmMwLjYtMC40LDEuMS0wLjUsMS41LTAuMmMwLjIsMC4xLDAuMywwLjMsMC4zLDAuNUMxMDIuOSw5MS4yLDEwMi44LDkxLjUsMTAyLjcsOTEuN3oiLz4KCTxwYXRoIGQ9Ik0xMDcuNyw4MC4zYy0xLjUtMC43LTIuOS0wLjMtMy42LDEuMWMtMC4yLDAuNC0wLjMsMC44LTAuMiwxLjJjLTAuNS0wLjQtMS0xLjEtMC42LTJjMC4yLTAuNCwwLjUtMC43LDAuOC0xbDAuMy0wLjIKCQlsLTEuNi0xLjFsLTAuMiwwLjJjLTAuNCwwLjQtMC44LDAuOS0xLjEsMS41Yy0wLjQsMC45LTAuNSwxLjctMC4yLDIuNWMwLjUsMS40LDEuOSwyLjQsMywyLjljMSwwLjUsMS44LDAuNywyLjUsMC43CgkJYzEuMSwwLDEuOS0wLjUsMi41LTEuNkMxMTAuMSw4Mi43LDEwOS41LDgxLjIsMTA3LjcsODAuM3ogTTEwNy43LDgzLjVjLTAuMSwwLjMtMC40LDAuNC0wLjcsMC40Yy0wLjIsMC0wLjUtMC4xLTAuNy0wLjIKCQljLTAuNC0wLjItMC43LTAuNS0wLjgtMC43Yy0wLjEtMC4yLDAtMC4zLDAtMC41YzAuMS0wLjMsMC4zLTAuNCwwLjctMC40YzAuMiwwLDAuNSwwLjEsMC44LDAuMmMwLjQsMC4yLDAuNiwwLjQsMC43LDAuNwoJCUMxMDcuOCw4My4yLDEwNy43LDgzLjQsMTA3LjcsODMuNXoiLz4KCTxwYXRoIGQ9Ik0xMTAuOCw3MS41Yy0xLjctMC41LTMuMSwwLjMtMy42LDEuOWMwLDAuMi0wLjEsMC4zLTAuMSwwLjRsLTEtMC4zbDAuOS0zbC0xLjctMC41bC0xLjUsNWw0LjgsMS41bDAtMC40CgkJYzAtMC41LDAuMS0wLjksMC4yLTEuNGMwLjEtMC40LDAuNi0xLjQsMS41LTEuMmMxLDAuMywwLjgsMS40LDAuNywxLjdjLTAuMiwwLjYtMC41LDEtMC44LDEuNGwtMC4zLDAuM2wyLDAuN2wwLjEtMC4yCgkJYzAuMy0wLjYsMC42LTEuMywwLjgtMS44YzAuMy0xLDAuMy0xLjktMC4xLTIuN0MxMTIuMyw3Mi4zLDExMS42LDcxLjgsMTEwLjgsNzEuNXoiLz4KCTxwYXRoIGQ9Ik0xNi4zLDYwYzAsMjQsMTkuNSw0My42LDQzLjUsNDMuNmMyNCwwLDQzL
jUtMTkuNSw0My41LTQzLjZjMC0yNC0xOS41LTQzLjUtNDMuNS00My41QzM1LjksMTYuNCwxNi4zLDM2LDE2LjMsNjB6CgkJIE01OS45LDE3LjljMjMuMiwwLDQyLDE4LjksNDIsNDJjMCwyMy4yLTE4LjksNDItNDIsNDJjLTIzLjIsMC00Mi0xOC45LTQyLTQyQzE3LjgsMzYuOCwzNi43LDE3LjksNTkuOSwxNy45eiIvPgoJPHBhdGggZD0iTTU5LjksMC40QzI3LDAuNCwwLjMsMjcuMSwwLjMsNjBjMCwzMi44LDI2LjcsNTkuNiw1OS42LDU5LjZjMzIuOCwwLDU5LjYtMjYuNyw1OS42LTU5LjZDMTE5LjQsMjcuMSw5Mi43LDAuNCw1OS45LDAuNHoKCQkgTTU5LjksMTE4Yy0zMiwwLTU4LTI2LTU4LTU4YzAtMzIsMjYtNTgsNTgtNThjMzIsMCw1OCwyNiw1OCw1OEMxMTcuOSw5Miw5MS45LDExOCw1OS45LDExOHoiLz4KPC9nPgo8L3N2Zz4K alt="Cornell University Logo" srcset sizes>
|
|
</picture>
|
|
</a></div>
|
|
<div class="column nav" id=toggle-container role=menubar>
|
|
<button class=toggle-control><svg xmlns=http://www.w3.org/2000/svg viewBox="0 0 512 512" class="icon filter-white"><title>open search</title><path d="M505 442.7L405.3 343c-4.5-4.5-10.6-7-17-7H372c27.6-35.3 44-79.7 44-128C416 93.1 322.9 0 208 0S0 93.1 0 208s93.1 208 208 208c48.3 0 92.7-16.4 128-44v16.3c0 6.4 2.5 12.5 7 17l99.7 99.7c9.4 9.4 24.6 9.4 33.9 0l28.3-28.3c9.4-9.4 9.4-24.6.1-34zM208 336c-70.7 0-128-57.2-128-128 0-70.7 57.2-128 128-128 70.7 0 128 57.2 128 128 0 70.7-57.2 128-128 128z"></path></svg></button>
|
|
<div class="mobile-toggle-block toggle-target sf-hidden">
|
|
|
|
</div>
|
|
<button class=toggle-control><svg xmlns=http://www.w3.org/2000/svg viewBox="0 0 448 512" class="icon filter-white" role=img><title>open navigation menu</title><path d="M16 132h416c8.837 0 16-7.163 16-16V76c0-8.837-7.163-16-16-16H16C7.163 60 0 67.163 0 76v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16z"></path></svg></button>
|
|
<div class="mobile-toggle-block toggle-target sf-hidden">
|
|
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</header>
|
|
<main>
|
|
<div id=content>
|
|
<div id=abs-outer>
|
|
<div class=leftcolumn>
|
|
<div class=subheader>
|
|
<h1>Computer Science &gt; Computation and Language</h1>
|
|
</div>
|
|
<div class=header-breadcrumbs-mobile>
|
|
<strong>arXiv:2406.09155</strong> (cs)
|
|
</div>
|
|
<style>#abs{font-family:"Lucida Grande",Helvetica,Arial,sans-serif!important}#abs h1.title{display:block;font-size:1.8em!important;font-weight:700;margin-block-end:12px;margin-block-start:12px;margin-bottom:12px;margin-inline-end:0px;margin-inline-start:20px;margin-left:20px;margin-right:0px;margin-top:12px}#abs div.authors{font-size:1.2em;line-height:24px;margin-bottom:8px;margin-left:20px;margin-right:0px;margin-top:8px}#abs div.dateline{font-size:0.9em;font-style:italic;margin-bottom:6.5px;margin-left:20px;margin-right:0px;margin-top:6.5px}#abs blockquote.abstract{font-size:1.05em;margin-block-end:21.6px;margin-block-start:14.4px;margin-bottom:21.6px;margin-inline-end:40px;margin-inline-start:40px;background-color:white;border-left:0px;padding:0px}#abs div.metatable{font-size:0.95em!important;margin-bottom:19px;margin-left:20px;margin-right:0px;margin-top:0px;border:0px;padding:0px}#abs div.metatable tbody{vertical-align:middle}#abs tr{margin-top:0px;margin-bottom:0px}#abs td.tablecell{padding-top:0px;padding-bottom:0px;padding-right:6.5px;padding-left:0px;vertical-align:top;font-size:0.95em!important;margin-top:0px;margin-bottom:0px;border:0px}#abs td.tablecell.label{font-weight:400!important}#abs span.primary-subject{font-weight:700}#abs span.arxivid{font-weight:700}</style>
|
|
<div id=content-inner>
|
|
<div id=abs>
|
|
<div class=dateline>
|
|
[Submitted on 13 Jun 2024]</div>
|
|
<h1 class="title mathjax"><span class="descriptor sf-hidden">Title:</span>DefAn: Definitive Answer Dataset for LLMs Hallucination Evaluation</h1>
|
|
<div class=authors><span class="descriptor sf-hidden">Authors:</span><a href="https://arxiv.org/search/cs?searchtype=author&query=Rahman,+A+B+M+A" rel=nofollow>A B M Ashikur Rahman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Anwar,+S" rel=nofollow>Saeed Anwar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Usman,+M" rel=nofollow>Muhammad Usman</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Mian,+A" rel=nofollow>Ajmal Mian</a></div> <div id=download-button-info class=sf-hidden hidden>View a PDF of the paper titled DefAn: Definitive Answer Dataset for LLMs Hallucination Evaluation, by A B M Ashikur Rahman and 3 other authors</div>
|
|
<a class=mobile-submission-download href=https://arxiv.org/pdf/2406.09155>View PDF</a>
|
|
<a class=mobile-submission-download href=https://arxiv.org/html/2406.09155v1>HTML (experimental)</a>
|
|
<blockquote class="abstract mathjax">
|
|
<span class="descriptor sf-hidden">Abstract:</span>Large Language Models (LLMs) have demonstrated remarkable capabilities, revolutionizing the integration of AI in daily life applications. However, they are prone to hallucinations, generating claims that contradict established facts, deviating from prompts, and producing inconsistent responses when the same prompt is presented multiple times. Addressing these issues is challenging due to the lack of comprehensive and easily assessable benchmark datasets. Most existing datasets are small and rely on multiple-choice questions, which are inadequate for evaluating the generative prowess of LLMs. To measure hallucination in LLMs, this paper introduces a comprehensive benchmark dataset comprising over 75,000 prompts across eight domains. These prompts are designed to elicit definitive, concise, and informative answers. The dataset is divided into two segments: one publicly available for testing and assessing LLM performance and a hidden segment for benchmarking various LLMs. In our experiments, we tested six LLMs—GPT-3.5, Llama 2, Llama 3, Gemini, Mixtral, and Zephyr—revealing that overall factual hallucination ranges from 59% to 82% on the public dataset and 57% to 76% in the hidden benchmark. Prompt misalignment hallucination ranges from 6% to 95% in the public dataset and 17% to 94% in the hidden counterpart. Average consistency ranges from 21% to 61% and 22% to 63%, respectively. Domain-wise analysis shows that LLM performance significantly deteriorates when asked for specific numeric information while performing moderately with person, location, and date queries. Our dataset demonstrates its efficacy and serves as a comprehensive benchmark for LLM performance evaluation. 
Our dataset and LLM responses are available at <a href=https://github.com/ashikiut/DefAn rel="external noopener nofollow" class="link-external link-https">this https URL</a>.
|
|
</blockquote>
|
|
|
|
<div class=metatable>
|
|
<table summary="Additional metadata"><tbody><tr>
|
|
<td class="tablecell label">Subjects:</td>
|
|
<td class="tablecell subjects">
|
|
<span class=primary-subject>Computation and Language (cs.CL)</span>; Artificial Intelligence (cs.AI); Computer Vision and Pattern Recognition (cs.CV); Machine Learning (cs.LG)</td>
|
|
<tr>
|
|
<td class="tablecell label">Cite as:</td>
|
|
<td class="tablecell arxivid"><span class=arxivid><a href=https://arxiv.org/abs/2406.09155>arXiv:2406.09155</a> [cs.CL]</span></td>
|
|
</tr>
|
|
<tr>
|
|
<td class="tablecell label"> </td>
|
|
<td class="tablecell arxividv">(or <span class=arxivid>
|
|
<a href=https://arxiv.org/abs/2406.09155v1>arXiv:2406.09155v1</a> [cs.CL]</span> for this version)
|
|
</td>
|
|
</tr>
|
|
<tr>
|
|
<td class="tablecell label"> </td>
|
|
<td class="tablecell arxivdoi"> <a href=https://doi.org/10.48550/arXiv.2406.09155 id=arxiv-doi-link>https://doi.org/10.48550/arXiv.2406.09155</a><div class=button-and-tooltip>
|
|
<button class=more-info aria-describedby=more-info-desc-1>
|
|
<svg height=15 role=presentation xmlns=http://www.w3.org/2000/svg viewBox="0 0 512 512"><path fill=currentColor d="M256 8C119.043 8 8 119.083 8 256c0 136.997 111.043 248 248 248s248-111.003 248-248C504 119.083 392.957 8 256 8zm0 110c23.196 0 42 18.804 42 42s-18.804 42-42 42-42-18.804-42-42 18.804-42 42-42zm56 254c0 6.627-5.373 12-12 12h-88c-6.627 0-12-5.373-12-12v-24c0-6.627 5.373-12 12-12h12v-64h-12c-6.627 0-12-5.373-12-12v-24c0-6.627 5.373-12 12-12h64c6.627 0 12 5.373 12 12v100h12c6.627 0 12 5.373 12 12v24z"></path></svg>
|
|
<span class=visually-hidden>Focus to learn more</span>
|
|
</button>
|
|
|
|
<div role=tooltip id=more-info-desc-1 class=sf-hidden>
|
|
arXiv-issued DOI via DataCite</div>
|
|
</div>
|
|
</td>
|
|
</table>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
<div class=submission-history>
|
|
<h2>Submission history</h2> From: Saeed Anwar [<a href=https://arxiv.org/show-email/2ead08a5/2406.09155 rel=nofollow>view email</a>] <br> <strong>[v1]</strong>
|
|
Thu, 13 Jun 2024 14:18:13 UTC (2,046 KB)<br>
|
|
</div>
|
|
</div>
|
|
|
|
<div class=extra-services> <div class=full-text>
|
|
<a name=other></a>
|
|
<span class="descriptor sf-hidden">Full-text links:</span>
|
|
<h2>Access Paper:</h2>
|
|
<ul>
|
|
<div class=sf-hidden hidden>
|
|
View a PDF of the paper titled DefAn: Definitive Answer Dataset for LLMs Hallucination Evaluation, by A B M Ashikur Rahman and 3 other authors</div><li><a href=https://arxiv.org/pdf/2406.09155 aria-describedby=download-button-info accesskey=f class="abs-button download-pdf">View PDF</a><li><a href=https://arxiv.org/html/2406.09155v1 class=abs-button id=latexml-download-link>HTML (experimental)</a><li><a href=https://arxiv.org/src/2406.09155 class="abs-button download-eprint">TeX Source
|
|
</a></ul>
|
|
<div class=abs-license><a href=http://creativecommons.org/licenses/by/4.0/ title="Rights to this article" class=has_license>
|
|
<img alt="license icon" role=presentation src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAFAAAAAPCAIAAAD8q9/YAAAABGdBTUEAANbY1E9YMgAAABl0RVh0U29mdHdhcmUAQWRvYmUgSW1hZ2VSZWFkeXHJZTwAAAISSURBVHjaYmAYYYARiP///z9SfMvIyAJhrdm0Glliz669M6fO7OjoKC8vx9Tm6up66tSptu5WcQlxIFdeVsHVyfXjh4+D3LeQeGXClHj54uXyxctx+RYIdu/ebWZmNrl/CoT74eOHwoLCoRLJWDy8fMkKYWFhXL6FAGBw3Lxxc8PajUD2x08fPHw8vH28h6qHL128lJ6eDmScPXtWWVkZmO6Bafj9+/dAEsgGigDFjY2NXVxcrl29BjWFiQnIRUs/EICcnDDZA+/hSxcvf/70GegfIDssLAzIALoPSHZ2dgL9+e7dO2DM37t3DygLFLxz+w48VQMTOWYJAQQQ78EZQBLIHkQe/vrlC5AUFBQERinQYxCfAxMwkK2kpAQUT0tLCw0NhRXxUKf//PmDkYURs5BA9h7EzwPrWywe5ubhAZJA3wL9BvQhMFYhxTKQC/QzULyiomLWrFloiZOdneP/n/94Ynjw5mE9fV1eXl6IP4ERu2fPHqCjgT4HsoGxLSQkBBSBZNczZ84oKitCdAnwCwArKvwxPLgaHsj1cF9X//Onz+/fv49HGzBETExMouOiA4L9gVxZabnmhuatW7YO8noYGPpYSunouKi3b98AoxSP5rKyMhVVZYhv+fkEdmzZMch9i69aArafUjNTKysr29vbsepxcna6fv16QUkBPD33T+gfbUsP0rb0SOssMQAEGAB0zfzj5yHE/QAAAABJRU5ErkJggg==">
|
|
<span>view license</span>
|
|
</a></div>
|
|
</div>
|
|
<div class=browse>
|
|
Current browse context: <div class=current>cs.CL</div>
|
|
<div class=prevnext>
|
|
<span class=arrow>
|
|
<a class="abs-button prev-url" href="https://arxiv.org/prevnext?id=2406.09155&function=prev&context=cs.CL" accesskey=p title="previous in cs.CL (accesskey p)" rel=nofollow>&lt; prev</a>
|
|
</span>
|
|
<span class="is-hidden-mobile sf-hidden"> | </span> <span class=arrow>
|
|
<a class="abs-button next-url" href="https://arxiv.org/prevnext?id=2406.09155&function=next&context=cs.CL" accesskey=n title="next in cs.CL (accesskey n)" rel=nofollow>next &gt;</a>
|
|
</span><br>
|
|
</div><div class=list>
|
|
<a class="abs-button abs-button-grey abs-button-small context-new" href=https://arxiv.org/list/cs.CL/new rel=nofollow>new</a>
|
|
<span class="is-hidden-mobile sf-hidden"> | </span>
|
|
<a class="abs-button abs-button-grey abs-button-small context-recent" href=https://arxiv.org/list/cs.CL/recent rel=nofollow>recent</a>
|
|
<span class="is-hidden-mobile sf-hidden"> | </span><a class="abs-button abs-button-grey abs-button-small context-id" href=https://arxiv.org/list/cs.CL/2024-06 rel=nofollow>2024-06</a>
|
|
</div><div class=abs-switch-cat>
|
|
Change to browse by:
|
|
<div class="switch context-change">
|
|
<a href="https://arxiv.org/abs/2406.09155?context=cs" rel=nofollow>cs</a><br class="is-hidden-mobile sf-hidden">
|
|
<a class=subclass href="https://arxiv.org/abs/2406.09155?context=cs.AI" rel=nofollow>cs.AI</a><br class="is-hidden-mobile sf-hidden">
|
|
<a class=subclass href="https://arxiv.org/abs/2406.09155?context=cs.CV" rel=nofollow>cs.CV</a><br class="is-hidden-mobile sf-hidden">
|
|
<a class=subclass href="https://arxiv.org/abs/2406.09155?context=cs.LG" rel=nofollow>cs.LG</a><br class="is-hidden-mobile sf-hidden">
|
|
</div>
|
|
</div>
|
|
</div>
|
|
<div class=extra-ref-cite>
|
|
<h3>References & Citations</h3>
|
|
<ul>
|
|
<li><a class="abs-button abs-button-small cite-ads" href=https://ui.adsabs.harvard.edu/abs/arXiv:2406.09155>NASA ADS</a><li><a class="abs-button abs-button-small cite-google-scholar" href="https://scholar.google.com/scholar_lookup?arxiv_id=2406.09155" target=_blank rel=noopener>Google Scholar</a></li>
|
|
<li><a class="abs-button abs-button-small cite-semantic-scholar" href=https://api.semanticscholar.org/arXiv:2406.09155 target=_blank rel=noopener>Semantic Scholar</a></li>
|
|
</ul>
|
|
<div style=clear:both></div>
|
|
</div>
|
|
<div class=extra-ref-cite>
|
|
<span id=bib-cite-trigger class="bib-cite-button abs-button">export BibTeX citation</span>
|
|
<span id=bib-cite-loading class=sf-hidden hidden>Loading...</span>
|
|
</div>
|
|
<div id=bib-cite-modal class="bib-modal sf-hidden" hidden>
|
|
|
|
</div><div class=bookmarks>
|
|
<div><h3>Bookmark</h3></div><a class="abs-button abs-button-grey abs-button-small" href="http://www.bibsonomy.org/BibtexHandler?requTask=upload&url=https://arxiv.org/abs/2406.09155&description=DefAn:%20Definitive%20Answer%20Dataset%20for%20LLMs%20Hallucination%20Evaluation" title="Bookmark on BibSonomy">
|
|
<img src=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAMAAAAoLQ9TAAABTVBMVEXn5ubf3t6/H2OioaKgn6DLysrHxsa4tre3traysLGxsLCqqKmmpKWioKGenJ2cmpuamJmYlpeWlJWVlJSUkpOQjo+Ib3qMiov+/v78/Pz6+vr5+Pn4+Pj29vb09PSrqal1cnTu7u7i4uLe3t7V1NWJh4fJyMnHxsfDwsO/vr+9vL3y8fG3treDXW3e3d2oqKilpKWkoqSjoqPY19fS0dGXlpfMy8uUkpTGxcXEw8ONjI28u7u4t7ezsbKxr7CjH1etq6yrqaqnpaampaWko6OjoaKioaGhn6Cgn5+fnZ6em52dm5ybmZqVk5SIWW2Rj5CPjY6Oi42Ni4z////9/f37+/t/fX75+floSlfv7+/p6eno5+jl5eXk4+Tj4+Pd3d3b29vZ2dnX19fT09PS0dLR0dGEa3XOzc7Nzc3KycrIx8i+vb53dHW6ubrr6up9Unu9AAAA0klEQVQYlWMIRgJRgeJsDMFSxnEwAYcsRXOGYBcLp5TgYMlsu2RPW2ZVBoZgfS1hs+BgS83oMLlgZbtghuDQhCSRmOBYEXet4OBU1xAGkF5tAePgBNvgYBU+LgewQLCMm5oee7CsEDcrL0QgONQgTZeHL9E4QwYqEJxrzyQmAWJABELVfSP8vIJhAiEqgmo6WSGO6WABE3kOb+ssIwXZ4FABRpBAjjSLXrCoJGdicLCsfx5QwNmQPzw408NABigbGaQRD3QpknetlEwZgpGBj00AAMhwRAoMDs/uAAAAAElFTkSuQmCC alt="BibSonomy logo">
|
|
</a>
|
|
<a class="abs-button abs-button-grey abs-button-small" href="https://reddit.com/submit?url=https://arxiv.org/abs/2406.09155&title=DefAn:%20Definitive%20Answer%20Dataset%20for%20LLMs%20Hallucination%20Evaluation" title="Bookmark on Reddit">
|
|
<img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABIAAAASCAMAAABhEH5lAAAAclBMVEWtra3/IQClpaX/hFJ7e3tzc3Nra2tjY2P/Yyn/597/zr05OTkxMTHe3t4pKSnW1tbOzs7Gxsb/vaX/GACcnJyUlJSMjIyEhIT/rYxaWlpSUlL///9KSkr39/dCQkL/jFrv7+//hFrn5+f/jGO9vb21tbWaFPpZAAAAxUlEQVQYlU2Q7VrDMAhGEcMyO6thNsH0ta5m9f5vUVxbJz/ycQLkPBD2aMY91TCD/lCRQU5zoDuapAGSo97Rq/gys2JF05KtjpbT7ZFakHAQSq3pIGIxT2SD1H6vXj65MAXzJufrE9B9dWjRMi0vnnN8eAce3y4An3pCxJ517QAt/qMk/Is8ONLaMGVt7Zvdzi/uNVSFmgv04DGtqilW1pQ0f4RfnZt9HNmKZX4u2BCPee0dDsuGjMs6DKOyou3kcwhqvv0AYpYbpE15FCsAAAAASUVORK5CYII=" alt="Reddit logo">
|
|
</a>
|
|
</div> </div>
|
|
|
|
<div id=labstabs>
|
|
<div class=labstabs><input type=radio name=tabs id=tabone checked class=sf-hidden>
|
|
<label for=tabone>Bibliographic Tools</label>
|
|
<div class="tab labs-display-bib">
|
|
<h1>Bibliographic and Citation Tools</h1>
|
|
<div class=toggle>
|
|
<div class="columns is-mobile lab-row">
|
|
<div class="column lab-switch">
|
|
<label class=switch>
|
|
<input id=bibex-toggle type=checkbox class="lab-toggle sf-hidden" data-script-url=/static/browse/0.3.4/bibex/bibex.js?20241202 aria-labelledby=label-for-bibex>
|
|
<span class=slider></span>
|
|
<span class=is-sr-only>Bibliographic Explorer Toggle</span>
|
|
</label>
|
|
</div>
|
|
<div class="column lab-name">
|
|
<span id=label-for-bibex>Bibliographic Explorer</span> <em>(<a href=https://info.arxiv.org/labs/showcase.html#arxiv-bibliographic-explorer>What is the Explorer?</a>)</em>
|
|
</div>
|
|
</div>
|
|
<div class="columns is-mobile lab-row">
|
|
<div class="column lab-switch">
|
|
<label class=switch>
|
|
<input id=connectedpapers-toggle type=checkbox class="lab-toggle sf-hidden" data-script-url=/static/browse/0.3.4/js/connectedpapers.js aria-labelledby=label-for-connected-papers>
|
|
<span class=slider></span>
|
|
<span class=is-sr-only>Connected Papers Toggle</span>
|
|
</label>
|
|
</div>
|
|
<div class="column lab-name">
|
|
<span id=label-for-connected-papers>Connected Papers</span> <em>(<a href=https://www.connectedpapers.com/about target=_blank rel=noopener>What is Connected Papers?</a>)</em>
|
|
</div>
|
|
</div><div class="columns is-mobile lab-row">
|
|
<div class="column lab-switch">
|
|
<label class=switch>
|
|
<input id=litmaps-toggle type=checkbox class="lab-toggle sf-hidden" data-script-url=/static/browse/0.3.4/js/litmaps.js?20210617 aria-labelledby=label-for-litmaps>
|
|
<span class=slider></span>
|
|
<span class=is-sr-only>Litmaps Toggle</span>
|
|
</label>
|
|
</div>
|
|
<div class="column lab-name">
|
|
<span id=label-for-litmaps>Litmaps</span> <em>(<a href=https://www.litmaps.co/ target=_blank rel=noopener>What is Litmaps?</a>)</em>
|
|
</div>
|
|
</div>
|
|
<div class="columns is-mobile lab-row">
|
|
<div class="column lab-switch">
|
|
<label class=switch>
|
|
<input id=scite-toggle type=checkbox class="lab-toggle sf-hidden" data-script-url=/static/browse/0.3.4/js/scite.js?20210617 aria-labelledby=label-for-scite>
|
|
<span class=slider></span>
|
|
<span class=is-sr-only>scite.ai Toggle</span>
|
|
</label>
|
|
</div>
|
|
<div class="column lab-name">
|
|
<span id=label-for-scite>scite Smart Citations</span> <em>(<a href=https://www.scite.ai/ target=_blank rel=noopener>What are Smart Citations?</a>)</em>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
<div class="labs-content-placeholder labs-display" style=display:none></div>
|
|
<div style=min-height:15px id=connectedpapers-output></div>
|
|
<div style=min-height:15px id=litmaps-open-in></div>
|
|
<div style=min-height:15px id=scite-open-in></div>
|
|
</div>
|
|
<input type=radio name=tabs id=tabtwo class=sf-hidden>
|
|
<label for=tabtwo>Code, Data, Media</label>
|
|
<div class="tab sf-hidden">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
</div>
|
|
<input type=radio name=tabs id=labstabs-demos-input class=sf-hidden>
|
|
<label for=labstabs-demos-input id=labstabs-demos-label>Demos</label>
|
|
<div class="tab sf-hidden">
|
|
|
|
|
|
|
|
|
|
|
|
</div>
|
|
<input type=radio name=tabs id=tabfour class=sf-hidden>
|
|
<label for=tabfour>Related Papers</label>
|
|
<div class="tab sf-hidden">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
</div>
|
|
<input type=radio name=tabs id=tabfive class=sf-hidden>
|
|
<label for=tabfive>
|
|
About arXivLabs
|
|
</label>
|
|
<div class="tab sf-hidden">
|
|
|
|
</div>
|
|
</div>
|
|
</div>
|
|
<div class=endorsers>
|
|
<a href=https://arxiv.org/auth/show-endorsers/2406.09155 class=endorser-who rel=nofollow>Which authors of this paper are endorsers?</a> |
|
|
<a id=mathjax_toggle>Disable MathJax</a> (<a href=https://info.arxiv.org/help/mathjax.html>What is MathJax?</a>)
|
|
<span class="help sf-hidden" style=font-style:normal;float:right;margin-top:0;margin-right:1em></span>
|
|
</div>
|
|
|
|
</div>
|
|
</div>
|
|
</main>
|
|
<footer style=clear:both>
|
|
<div class="columns is-desktop" role=navigation aria-label=Secondary style="margin:-0.75em -0.75em 0.75em -0.75em">
|
|
|
|
<div class=column style=padding:0>
|
|
<div class=columns>
|
|
<div class=column>
|
|
<ul style=list-style:none;line-height:2>
|
|
<li><a href=https://info.arxiv.org/about>About</a></li>
|
|
<li><a href=https://info.arxiv.org/help>Help</a></li>
|
|
</ul>
|
|
</div>
|
|
<div class=column>
|
|
<ul style=list-style:none;line-height:2>
|
|
<li>
|
|
<svg xmlns=http://www.w3.org/2000/svg viewBox="0 0 512 512" class="icon filter-black" role=presentation><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"></path></svg>
|
|
<a href=https://info.arxiv.org/help/contact.html> Contact</a>
|
|
</li>
|
|
<li>
|
|
<svg xmlns=http://www.w3.org/2000/svg viewBox="0 0 512 512" class="icon filter-black" role=presentation><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"></path></svg>
|
|
<a href=https://info.arxiv.org/help/subscribe> Subscribe</a>
|
|
</li>
|
|
</ul>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
|
|
<div class=column style=padding:0>
|
|
<div class=columns>
|
|
<div class=column>
|
|
<ul style=list-style:none;line-height:2>
|
|
<li><a href=https://info.arxiv.org/help/license/index.html>Copyright</a></li>
|
|
<li><a href=https://info.arxiv.org/help/policies/privacy_policy.html>Privacy Policy</a></li>
|
|
</ul>
|
|
</div>
|
|
<div class="column sorry-app-links">
|
|
<ul style=list-style:none;line-height:2>
|
|
<li><a href=https://info.arxiv.org/help/web_accessibility.html>Web Accessibility Assistance</a></li>
|
|
<li>
|
|
<p class=help>
|
|
<a class=a11y-main-link href=https://status.arxiv.org/ target=_blank>arXiv Operational Status <svg xmlns=http://www.w3.org/2000/svg viewBox="0 0 256 512" class="icon filter-dark_grey" role=presentation><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"></path></svg></a><br>
|
|
</p>
|
|
</li>
|
|
</ul>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
</div>
|
|
</footer>
|
|
</div>
|
|
|