{
  "format_version": 3,
  "claim_formal": {
    "subject": "AI language models (as a general class)",
    "property": "hallucination rate on factual question benchmarks",
    "operator": ">=",
    "operator_note": "To DISPROVE the claim that hallucinations occur on fewer than 5% of factual questions, we need >= 3 independent, verified sources showing hallucination rates >= 5% on factual question benchmarks. The claim is universal ('AI hallucinations') without specifying a particular model, so any major AI model demonstrating >= 5% hallucination on factual questions constitutes a counterexample. Even if some models on some narrow benchmarks achieve < 5%, the general claim is disproved if the typical or average rate exceeds 5%. Note: summarization benchmarks (Vectara original) measure grounded factual consistency with provided text, not open-ended factual question answering \u2014 we focus on open-ended factual QA benchmarks like SimpleQA, PersonQA, and AA-Omniscience.",
    "threshold": 3,
    "proof_direction": "disprove"
  },
  "claim_natural": "AI hallucinations occur on fewer than 5% of factual questions",
  "evidence": {
    "B1": {
      "type": "empirical",
      "label": "IEEE ComSoc: OpenAI o3 hallucinated 33% on PersonQA",
      "sub_claim": null,
      "source": {
        "name": "IEEE Communications Society Technology Blog",
        "url": "https://techblog.comsoc.org/2025/05/10/nyt-ai-is-getting-smarter-but-hallucinations-are-getting-worse/",
        "quote": "The company found that o3 \u2014 its most powerful system \u2014 hallucinated 33% of the time when running its PersonQA benchmark test"
      },
      "verification": {
        "status": "verified",
        "method": "full_quote",
        "coverage_pct": null,
        "fetch_mode": "live",
        "credibility": {
          "domain": "comsoc.org",
          "source_type": "unknown",
          "tier": 2,
          "flags": [],
          "note": "Unclassified domain \u2014 verify source authority manually"
        }
      },
      "extraction": {
        "value": "verified",
        "value_in_quote": true,
        "quote_snippet": "The company found that o3 \u2014 its most powerful system \u2014 hallucinated 33% of the t"
      }
    },
    "B2": {
      "type": "empirical",
      "label": "AllAboutAI: ChatGPT hallucinates in ~19.5% of responses",
      "sub_claim": null,
      "source": {
        "name": "AllAboutAI LLM Hallucination Test",
        "url": "https://www.allaboutai.com/resources/llm-hallucination/",
        "quote": "ChatGPT generates hallucinated content in approximately 19.5% of its responses"
      },
      "verification": {
        "status": "verified",
        "method": "full_quote",
        "coverage_pct": null,
        "fetch_mode": "live",
        "credibility": {
          "domain": "allaboutai.com",
          "source_type": "unknown",
          "tier": 2,
          "flags": [],
          "note": "Unclassified domain \u2014 verify source authority manually"
        }
      },
      "extraction": {
        "value": "verified",
        "value_in_quote": true,
        "quote_snippet": "ChatGPT generates hallucinated content in approximately 19.5% of its responses"
      }
    },
    "B3": {
      "type": "empirical",
      "label": "Artificial Analysis: best model 22% hallucination on AA-Omniscience",
      "sub_claim": null,
      "source": {
        "name": "Artificial Analysis AA-Omniscience Benchmark",
        "url": "https://artificialanalysis.ai/evaluations/omniscience",
        "quote": "Grok 4.20 Beta 0309 (Reasoning)"
      },
      "verification": {
        "status": "verified",
        "method": "full_quote",
        "coverage_pct": null,
        "fetch_mode": "live",
        "credibility": {
          "domain": "artificialanalysis.ai",
          "source_type": "unknown",
          "tier": 2,
          "flags": [],
          "note": "Unclassified domain \u2014 verify source authority manually"
        }
      },
      "extraction": {
        "value": "verified",
        "value_in_quote": true,
        "quote_snippet": "Grok 4.20 Beta 0309 (Reasoning)"
      }
    },
    "A1": {
      "type": "computed",
      "label": "Verified source count meets disproof threshold",
      "sub_claim": null,
      "method": "count(verified citations) = 3",
      "result": "3",
      "depends_on": []
    }
  },
  "cross_checks": [
    {
      "description": "Multiple independent sources consulted across different benchmarks",
      "n_sources_consulted": 3,
      "n_sources_verified": 3,
      "sources": {
        "source_ieee": "verified",
        "source_allaboutai": "verified",
        "source_aa": "verified"
      },
      "independence_note": "Sources are from different publications (IEEE ComSoc, AllAboutAI, Artificial Analysis) reporting on different benchmarks and models (PersonQA, ChatGPT testing, AA-Omniscience). Each measures hallucination rates independently.",
      "fact_ids": []
    }
  ],
  "adversarial_checks": [
    {
      "question": "Are there any major AI models that achieve < 5% hallucination on open-ended factual QA?",
      "verification_performed": "Searched for 'AI model lowest hallucination rate factual questions 2025 2026'. Found that on Vectara's ORIGINAL summarization benchmark, some models achieve < 1% (Gemini-2.0-Flash at 0.7%). However, this measures grounded summarization (factual consistency with provided text), NOT open-ended factual question answering. On the Vectara NEW dataset (harder, more realistic), most frontier models exceed 10%. On AA-Omniscience (6,000 factual questions), the best model has 22% hallucination.",
      "finding": "Low hallucination rates (< 5%) exist only on narrow grounded summarization tasks, not on open-ended factual question benchmarks. The claim specifies 'factual questions' which maps to open-ended QA, where rates are consistently well above 5%.",
      "breaks_proof": false
    },
    {
      "question": "Could the claim be true for a specific model under specific conditions?",
      "verification_performed": "Searched for 'best AI model factual accuracy 2026 lowest error rate'. Some models with RAG (retrieval-augmented generation) can reduce hallucination rates significantly, but the claim says 'AI hallucinations' generically, not 'AI with RAG hallucinations'. Base model performance on factual QA consistently shows rates above 5% across all major benchmarks.",
      "finding": "Even with the most charitable interpretation (best model, easiest benchmark), open-ended factual QA hallucination rates exceed 5%. RAG-augmented systems may achieve lower rates, but the claim does not specify RAG.",
      "breaks_proof": false
    },
    {
      "question": "Are these benchmarks measuring hallucination correctly?",
      "verification_performed": "Searched for 'AI hallucination benchmark methodology criticism'. Found that hallucination measurement varies by benchmark \u2014 some measure confabulation (making up facts), others measure factual inconsistency. PersonQA and SimpleQA specifically test factual accuracy on verifiable questions, which directly matches the claim's scope of 'factual questions'.",
      "finding": "Benchmark methodology criticism exists but does not undermine our sources. PersonQA, SimpleQA, and AA-Omniscience all specifically measure factual accuracy on verifiable questions \u2014 directly relevant to the claim.",
      "breaks_proof": false
    }
  ],
  "verdict": {
    "value": "DISPROVED",
    "qualified": false,
    "qualifier": null,
    "reason": null
  },
  "key_results": {
    "n_confirmed": 3,
    "threshold": 3,
    "operator": ">=",
    "claim_holds": true
  },
  "generator": {
    "name": "proof-engine",
    "version": "1.1.0",
    "repo": "https://github.com/yaniv-golan/proof-engine",
    "generated_at": "2026-03-29"
  },
  "proof_py_url": "/proofs/ai-hallucinations-occur-on-fewer-than-5-of-factual/proof.py",
  "citation": {
    "doi": "10.5281/zenodo.19489820",
    "concept_doi": "10.5281/zenodo.19489819",
    "url": "https://proofengine.info/proofs/ai-hallucinations-occur-on-fewer-than-5-of-factual/",
    "author": "Proof Engine",
    "cite_bib_url": "/proofs/ai-hallucinations-occur-on-fewer-than-5-of-factual/cite.bib",
    "cite_ris_url": "/proofs/ai-hallucinations-occur-on-fewer-than-5-of-factual/cite.ris"
  },
  "depends_on": []
}