[
 {
  "episode": "pl-001",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "medication",
  "question": "What medication am I taking?",
  "gold_answer": "Quelmithin",
  "n71_answer": "Quelmithin",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-001",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Crysthene Pool",
  "n71_answer": "Crysthene Pool",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-001",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "tutoring session (weekly)",
  "n71_answer": "tutoring session (weekly)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-001",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "partner",
  "question": "What's my partner's name?",
  "gold_answer": "James",
  "n71_answer": "James",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-001",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "n71_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-001",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "vehicle",
  "question": "List all vehicles I've had over time, from earliest to latest.",
  "gold_answer": "Zyvanta Sedan, Therwyn Compact, Xylorim Scooter",
  "n71_answer": "Zyvanta Sedan, Therwyn Compact, Xylorim Scooter",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-001",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "medication",
  "question": "What medication am I taking?",
  "gold_answer": "multivitamin",
  "n71_answer": "multivitamin",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-001",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Velthari Studio",
  "n71_answer": "Velthari Studio",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-001",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "Uncertain — previously 'tutoring session (weekly)', but health_condition changed",
  "n71_answer": "trivia night Wednesdays",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-001",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "partner",
  "question": "What's my partner's name?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-001",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is pottery, plays rock climbing, and is part of book club.",
  "n71_answer": "rock climbing, pottery",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-001",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "n71_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-002",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "1.5 hours",
  "n71_answer": "1.5 hours",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-002",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "favorite_restaurant",
  "question": "What's my favorite restaurant?",
  "gold_answer": "Zefiro's Kitchen",
  "n71_answer": "Zefiro's Kitchen",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-002",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "sleep_pattern",
  "question": "What's my sleep schedule?",
  "gold_answer": "varies by shift",
  "n71_answer": "shift schedule",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-002",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "n71_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-002",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "vehicle",
  "question": "List all vehicles I've had over time, from earliest to latest.",
  "gold_answer": "Therwyn Compact, Dranthis Pickup, Xylorim Scooter",
  "n71_answer": "Therwyn Compact, Dranthis Pickup, Xylorim Scooter",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-002",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "10 min",
  "n71_answer": "around 10 min",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-002",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "favorite_restaurant",
  "question": "What's my favorite restaurant?",
  "gold_answer": "Uncertain — previously 'Zefiro's Kitchen', but residence_location changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-002",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "sleep_pattern",
  "question": "What's my sleep schedule?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "varies by shift",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-002",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is knitting, plays basketball, and is part of board game group.",
  "n71_answer": "road trips, reading books, basketball, knitting",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-002",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "n71_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-003",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Zorathel Peak Gym",
  "n71_answer": "Zorathel Peak Gym",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-003",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "30 min",
  "n71_answer": "30 min",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-003",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "food_allergy",
  "question": "What food allergies do I have?",
  "gold_answer": "eggs",
  "n71_answer": "eggs",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-003",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-003",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "partner",
  "question": "List all partners I've had over time, from earliest to latest.",
  "gold_answer": "Sarah, Emily, Alex",
  "n71_answer": "Sarah, Alex",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-003",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Kaelstrom Fitness",
  "n71_answer": "Kaelstrom Fitness",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-003",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "Uncertain — previously '30 min', but residence_location changed",
  "n71_answer": "30 min",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-003",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "food_allergy",
  "question": "What food allergies do I have?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-003",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is guitar, plays rock climbing, and is part of hiking club.",
  "n71_answer": "guitar, rock climbing, camping, photography, hiking",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-003",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-004",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "no travel planned",
  "n71_answer": "European Road Trip 2023",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-004",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "living_arrangement",
  "question": "Who do I live with?",
  "gold_answer": "alone",
  "n71_answer": "Alone",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-004",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "social_gathering",
  "question": "What's my regular social meetup?",
  "gold_answer": "trivia night Wednesdays",
  "n71_answer": "trivia night on Wednesdays",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-004",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-004",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "vehicle",
  "question": "List all vehicles I've had over time, from earliest to latest.",
  "gold_answer": "Therwyn Compact, Quorvel EV, Xylorim Scooter",
  "n71_answer": "Therwyn Compact -> Quorvel EV -> Xylorim Scooter",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-004",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "island getaway",
  "n71_answer": "European Road Trip 2023",
  "judge_pass": false,
  "pass_type": "never_knew",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-004",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "living_arrangement",
  "question": "Who do I live with?",
  "gold_answer": "Uncertain — previously 'alone', but relationship_status changed",
  "n71_answer": "Alone",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-004",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "social_gathering",
  "question": "What's my regular social meetup?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "trivia night on Wednesdays",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-004",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is guitar, plays pickleball, and is part of hiking club.",
  "n71_answer": "Playing pickleball, playing guitar",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-004",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-005",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "financial_goal",
  "question": "What's my current financial goal?",
  "gold_answer": "paying off credit card debt",
  "n71_answer": "paying off credit card debt",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-005",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "skill_acquisition",
  "question": "What am I currently learning?",
  "gold_answer": "Python programming",
  "n71_answer": "Python programming",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-005",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "planned_purchase",
  "question": "What big purchase am I planning?",
  "gold_answer": "engagement ring",
  "n71_answer": "engagement ring",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-005",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "n71_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-005",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "media_consumption",
  "question": "List all media I've consumed over time, from earliest to latest.",
  "gold_answer": "The Veldrakon Spiral (TV), Whispers of Zarinthea (TV), The Quorath Codex (book)",
  "n71_answer": "The Veldrakon Spiral (TV) -> Whispers of Zarinthea (TV) -> The Quorath Codex (book)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-005",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "financial_goal",
  "question": "What's my current financial goal?",
  "gold_answer": "saving for kids' education fund",
  "n71_answer": "saving for kids' education fund",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-005",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "skill_acquisition",
  "question": "What am I currently learning?",
  "gold_answer": "Uncertain — previously 'Python programming', but school changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-005",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "planned_purchase",
  "question": "What big purchase am I planning?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "engagement ring",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-005",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is knitting, plays basketball, and is part of hiking club.",
  "n71_answer": "knitting, baking pastries, DIY coffee enthusiast, spending time outdoors, unwinding with hobbies",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-005",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "n71_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-006",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "dermatologist (monthly)",
  "n71_answer": "dermatologist (monthly)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-006",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Zorathel Peak Gym",
  "n71_answer": "Zorathel Peak Gym",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-006",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "medication",
  "question": "What medication am I taking?",
  "gold_answer": "Dyntheral",
  "n71_answer": "Dyntheral",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-006",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "skill_acquisition",
  "question": "What am I currently learning?",
  "gold_answer": "Spanish",
  "n71_answer": "Spanish",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-006",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "n71_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-006",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "media_consumption",
  "question": "List all media I've consumed over time, from earliest to latest.",
  "gold_answer": "Whispers of Zarinthea (TV), Thyrelion (anime), a history documentary series",
  "n71_answer": "Thyrelion (anime) -> history documentary series",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-006",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "physical therapy (2x/week)",
  "n71_answer": "physical therapy (2x/week)",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-006",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Kaelstrom Fitness",
  "n71_answer": "Kaelstrom Fitness",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-006",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "medication",
  "question": "What medication am I taking?",
  "gold_answer": "Uncertain — previously 'Dyntheral', but health_condition changed",
  "n71_answer": "Dyntheral",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-006",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "skill_acquisition",
  "question": "What am I currently learning?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "Spanish",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-006",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is woodworking, plays soccer, and is part of volunteer group.",
  "n71_answer": "Woodworking, soccer, pickup basketball on Saturdays, learning Spanish, volunteering, hiking.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-006",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "n71_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-007",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "dermatologist (monthly)",
  "n71_answer": "dermatologist (monthly)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-007",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "20 min",
  "n71_answer": "about 20 min",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-007",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "subscription_service",
  "question": "What am I subscribed to?",
  "gold_answer": "Dranthazon",
  "n71_answer": "Dranthazon",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-007",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "n71_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-007",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "media_consumption",
  "question": "List all media I've consumed over time, from earliest to latest.",
  "gold_answer": "Thyrelion (anime), a history documentary series, Xelvari Dawn (book)",
  "n71_answer": "The Crown, Stranger Things, Schitt's Creek, The Mandalorian, The Good Place, Grey's Anatomy, The Morning Show, The Office, Parks and Recreation, history documentary series",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-007",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "therapy (weekly)",
  "n71_answer": "therapy (weekly)",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-007",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "Uncertain — previously '20 min', but residence_location changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-007",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "subscription_service",
  "question": "What am I subscribed to?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-007",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is cooking, plays running club, and is part of book club.",
  "n71_answer": "Cooking, attending trivia night on Wednesdays, watching history documentary series, running three times a week, spending time with partner Sarah, exploring new activities or places with Sarah, and regular volunteering at the Animal Shelter.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-007",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "n71_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-008",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "work_project",
  "question": "What project am I working on?",
  "gold_answer": "backend migration",
  "n71_answer": "backend migration",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-008",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "commute_method",
  "question": "How do I get to work?",
  "gold_answer": "walking",
  "n71_answer": "Walking to work",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-008",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "insurance",
  "question": "What insurance do I have?",
  "gold_answer": "interim coverage plan",
  "n71_answer": "interim coverage plan",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-008",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "favorite_restaurant",
  "question": "What's my favorite restaurant?",
  "gold_answer": "Zefiro's Kitchen",
  "n71_answer": "Zefiro's Kitchen",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-008",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "vehicle",
  "question": "What car do I drive?",
  "gold_answer": "Xylorim Scooter",
  "n71_answer": "Xylorim Scooter",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-008",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-008",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "subscription_service",
  "question": "List all subscription services I've had over time, from earliest to latest.",
  "gold_answer": "Xelvoria, meal kit (Kelvimeal), Dranthazon",
  "n71_answer": "library donor -> Xelvoria subscriber -> pickup basketball Saturdays -> Dranthazon",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-008",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "work_project",
  "question": "What project am I working on?",
  "gold_answer": "marketing campaign",
  "n71_answer": "Interactive Dinosaur Encyclopedia Project",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-008",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "commute_method",
  "question": "How do I get to work?",
  "gold_answer": "bus",
  "n71_answer": "Unknown",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-008",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "insurance",
  "question": "What insurance do I have?",
  "gold_answer": "Uncertain — previously 'interim coverage plan', but employer changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-008",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "favorite_restaurant",
  "question": "What's my favorite restaurant?",
  "gold_answer": "Uncertain — previously 'Zefiro's Kitchen', but work_location changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-008",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "vehicle",
  "question": "What car do I drive?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-008",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is cooking, plays cycling, and is part of board game group.",
  "n71_answer": "- Cooking  \n- Baking desserts  \n- Cycling  \n- Hunting  \n- Fishing  \n- Reading novels  \n- Vintage and thrift shopping  \n- Collecting vinyl records  \n- Pickup basketball on Saturdays  \n- Home decor refresh  \n- Binge-watching TV shows",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-008",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-009",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "dietary_restriction",
  "question": "What dietary restrictions do I have?",
  "gold_answer": "low-sodium",
  "n71_answer": "low-sodium",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-009",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Zorathel Peak Gym",
  "n71_answer": "Zorathel Peak Gym",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-009",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "exercise_routine",
  "question": "What's my exercise routine?",
  "gold_answer": "pilates 3x/week",
  "n71_answer": "pilates 3x/week",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-009",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "tutoring session (weekly)",
  "n71_answer": "weekly tutoring session",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-009",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "n71_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-009",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "vehicle",
  "question": "List all vehicles I've had over time, from earliest to latest.",
  "gold_answer": "Zyvanta Sedan, Quorvel EV, Vorantel SUV",
  "n71_answer": "Zyvanta Sedan, Vorantel SUV",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-009",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "dietary_restriction",
  "question": "What dietary restrictions do I have?",
  "gold_answer": "no alcohol",
  "n71_answer": "no alcohol",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-009",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Crysthene Pool",
  "n71_answer": "Crysthene Pool",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-009",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "exercise_routine",
  "question": "What's my exercise routine?",
  "gold_answer": "Uncertain — previously 'pilates 3x/week', but health_condition changed",
  "n71_answer": "yoga 2x/week",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-009",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-009",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is cooking, plays running club, and is part of book club.",
  "n71_answer": "cooking, reading The Quorath Codex, attending monthly brunch with college friends, participating in book club, User's Team, and Running Club memberships",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-009",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "n71_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-010",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "mountain ski trip",
  "n71_answer": "Mountain ski trip",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-010",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "living_arrangement",
  "question": "Who do I live with?",
  "gold_answer": "with roommate",
  "n71_answer": "roommate",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-010",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "social_gathering",
  "question": "What's my regular social meetup?",
  "gold_answer": "brunch with college friends monthly",
  "n71_answer": "brunch with college friends monthly",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-010",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-010",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "partner",
  "question": "List all partners I've had over time, from earliest to latest.",
  "gold_answer": "Alex, Jordan, Taylor",
  "n71_answer": "I don't have that information.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-010",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "Zynthera Bay beach trip (1 week)",
  "n71_answer": "Zynthera Bay beach trip (1 week)",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-010",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "living_arrangement",
  "question": "Who do I live with?",
  "gold_answer": "Uncertain — previously 'with roommate', but relationship_status changed",
  "n71_answer": "Roommate",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-010",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "social_gathering",
  "question": "What's my regular social meetup?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-010",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is video games, plays pickleball, and is part of board game group.",
  "n71_answer": "Listen to true crime podcasts, play video games, play pickleball, play RPGs, attend monthly brunch with college friends.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-010",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-011",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "dermatologist (monthly)",
  "n71_answer": "dermatologist (monthly)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-011",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "none",
  "n71_answer": "Swimming 2x/week",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-011",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "medication",
  "question": "What medication am I taking?",
  "gold_answer": "Korathril",
  "n71_answer": "Korathril",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-011",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "favorite_restaurant",
  "question": "What's my favorite restaurant?",
  "gold_answer": "Zefiro's Kitchen",
  "n71_answer": "Zefiro's Kitchen",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-011",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "n71_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-011",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "subscription_service",
  "question": "List all subscription services I've had over time, from earliest to latest.",
  "gold_answer": "Xelvoria, Daily Journal subscription, Pyrethon Plus",
  "n71_answer": "Xelvoria, Pyrethon Plus",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-011",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "therapy (weekly)",
  "n71_answer": "therapy (weekly)",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-011",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Pyravar CrossFit",
  "n71_answer": "walking daily",
  "judge_pass": false,
  "pass_type": "never_knew",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-011",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "medication",
  "question": "What medication am I taking?",
  "gold_answer": "Uncertain — previously 'Korathril', but health_condition changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-011",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "favorite_restaurant",
  "question": "What's my favorite restaurant?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-011",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is gardening, plays running club, and is part of wine tasting club.",
  "n71_answer": "gardening, participating in wine tasting club, participating in Running Club, attending trivia night Wednesdays",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-011",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "n71_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-012",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "living_arrangement",
  "question": "Who do I live with?",
  "gold_answer": "alone",
  "n71_answer": "alone",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-012",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "housing_type",
  "question": "What type of home do I live in?",
  "gold_answer": "single-family house",
  "n71_answer": "single-family house",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-012",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "physical therapy (2x/week)",
  "n71_answer": "physical therapy 2 times a week",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-012",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "n71_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-012",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "vehicle",
  "question": "List all vehicles I've had over time, from earliest to latest.",
  "gold_answer": "Zyvanta Sedan, Dranthis Pickup, Vorantel SUV",
  "n71_answer": "Zyvanta Sedan, Dranthis Pickup, Vorantel SUV",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-012",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "living_arrangement",
  "question": "Who do I live with?",
  "gold_answer": "with partner",
  "n71_answer": "partner",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-012",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "housing_type",
  "question": "What type of home do I live in?",
  "gold_answer": "Uncertain — previously 'single-family house', but relationship_status changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-012",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-012",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is gardening, plays rock climbing, and is part of volunteer group.",
  "n71_answer": "rock climbing, gardening, pickup basketball on Saturdays",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-012",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "n71_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-013",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "work_schedule",
  "question": "What's my work schedule?",
  "gold_answer": "shift work (night)",
  "n71_answer": "shift work (night)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-013",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "40 min",
  "n71_answer": "40 min",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-013",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "work_project",
  "question": "What project am I working on?",
  "gold_answer": "infrastructure upgrade",
  "n71_answer": "Holistic Wellness Book Project",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-013",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Pyravar CrossFit",
  "n71_answer": "Pyravar CrossFit",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-013",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "sleep_pattern",
  "question": "What's my sleep schedule?",
  "gold_answer": "10pm-6am",
  "n71_answer": "10pm to 6am",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-013",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-013",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "media_consumption",
  "question": "List all media I've consumed over time, from earliest to latest.",
  "gold_answer": "The Quorath Codex (book), Thyrelion (anime), Xelvari Dawn (book)",
  "n71_answer": "Thyrelion (anime) -> Xelvari Dawn -> The Quorath Codex",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-013",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "work_schedule",
  "question": "What's my work schedule?",
  "gold_answer": "4-day work week",
  "n71_answer": "4-day work week",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-013",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "10 min",
  "n71_answer": "10 min",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-013",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "work_project",
  "question": "What project am I working on?",
  "gold_answer": "Uncertain — previously 'infrastructure upgrade', but employer changed",
  "n71_answer": "Press List Building Project",
  "judge_pass": false,
  "pass_type": "never_knew",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-013",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Uncertain — previously 'Pyravar CrossFit', but work_location changed",
  "n71_answer": "Pyravar CrossFit",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-013",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "sleep_pattern",
  "question": "What's my sleep schedule?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "10pm to 6am",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-013",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is photography, plays golf, and is part of book club.",
  "n71_answer": "photography, golf, trivia night on Wednesdays, book club, solo hike this weekend near the Bay Area",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-013",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-014",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "dentist (every 6 months)",
  "n71_answer": "dentist (every 6 months)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-014",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Pyravar CrossFit",
  "n71_answer": "Pyravar CrossFit",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-014",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "medication",
  "question": "What medication am I taking?",
  "gold_answer": "Velnorath",
  "n71_answer": "Velnorath",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-014",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "insurance",
  "question": "What insurance do I have?",
  "gold_answer": "dental + vision bundle",
  "n71_answer": "dental and vision bundle",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-014",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "n71_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-014",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "subscription_service",
  "question": "List all subscription services I've had over time, from earliest to latest.",
  "gold_answer": "Daily Journal subscription, Dranthazon, Pyrethon Plus",
  "n71_answer": "land ownership seminar attendee -> Daily Journal subscription -> running club -> Dranthazon",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-014",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "dermatologist (monthly)",
  "n71_answer": "dermatologist (monthly)",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-014",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Kaelstrom Fitness",
  "n71_answer": "Kaelstrom Fitness",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-014",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "medication",
  "question": "What medication am I taking?",
  "gold_answer": "Uncertain — previously 'Velnorath', but health_condition changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-014",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "insurance",
  "question": "What insurance do I have?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-014",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is guitar, plays basketball, and is part of running club.",
  "n71_answer": "- Play basketball  \n- Play guitar  \n- Attend game night with neighbors  \n- Watch history documentary series  \n- Use Dranthazon subscription content",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-014",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "n71_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-015",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "living_arrangement",
  "question": "Who do I live with?",
  "gold_answer": "with parents",
  "n71_answer": "parents",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-015",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "family_event",
  "question": "What family event do I have coming up?",
  "gold_answer": "sibling's wedding",
  "n71_answer": "sibling's wedding",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-015",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "dietary_restriction",
  "question": "What dietary restrictions do I have?",
  "gold_answer": "dairy-free",
  "n71_answer": "dairy-free",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-015",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-015",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "partner",
  "question": "List all partners I've had over time, from earliest to latest.",
  "gold_answer": "Jessica, Mike, James",
  "n71_answer": "I don't have that information.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-015",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "living_arrangement",
  "question": "Who do I live with?",
  "gold_answer": "with sibling",
  "n71_answer": "parents",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-015",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "family_event",
  "question": "What family event do I have coming up?",
  "gold_answer": "Uncertain — previously 'sibling's wedding', but relationship_status changed",
  "n71_answer": "sibling's wedding",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-015",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "dietary_restriction",
  "question": "What dietary restrictions do I have?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-015",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is photography, plays running club, and is part of hiking club.",
  "n71_answer": "antique collection",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-015",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-016",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "cross-country road trip (2 weeks)",
  "n71_answer": "cross-country road trip (2 weeks)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-016",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "financial_goal",
  "question": "What's my current financial goal?",
  "gold_answer": "paying off student loans",
  "n71_answer": "paying off student loans",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-016",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "health_condition",
  "question": "What health condition do I have?",
  "gold_answer": "lactose intolerance",
  "n71_answer": "lactose intolerance",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-016",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "n71_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-016",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "vehicle",
  "question": "List all vehicles I've had over time, from earliest to latest.",
  "gold_answer": "Therwyn Compact, Dranthis Pickup, Vorantel SUV",
  "n71_answer": "Therwyn Compact -> Dranthis Pickup -> Vorantel SUV -> Subaru Outback",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-016",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "Zynthera Bay beach trip (1 week)",
  "n71_answer": "Pacific Coast Highway Road Trip",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-016",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "financial_goal",
  "question": "What's my current financial goal?",
  "gold_answer": "Uncertain — previously 'paying off student loans', but relationship_status changed",
  "n71_answer": "paying off student loans",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-016",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "health_condition",
  "question": "What health condition do I have?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "lactose intolerance",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-016",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is pottery, plays golf, and is part of board game group.",
  "n71_answer": "Playing golf, experimenting with different seasonings and spices, cooking, participating in a board game group, pottery (previous hobby).",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-016",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "n71_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-017",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "physical therapy (2x/week)",
  "n71_answer": "physical therapy (2x/week)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-017",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Thandrel YMCA",
  "n71_answer": "Thandrel YMCA",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-017",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "health_condition",
  "question": "What health condition do I have?",
  "gold_answer": "high blood pressure",
  "n71_answer": "high blood pressure",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-017",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-017",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "vehicle",
  "question": "List all vehicles I've had over time, from earliest to latest.",
  "gold_answer": "Zyvanta Sedan, Therwyn Compact, Vorantel SUV",
  "n71_answer": "Therwyn Compact, Zyvanta Sedan, Vorantel SUV",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-017",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "chiropractor (biweekly)",
  "n71_answer": "physical therapy (2x/week)",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-017",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Uncertain — previously 'Thandrel YMCA', but residence_location changed",
  "n71_answer": "Thandrel YMCA",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-017",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "health_condition",
  "question": "What health condition do I have?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-017",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is photography, plays running club, and is part of wine tasting club.",
  "n71_answer": "Watercolor painting, photography, creative use of leftovers, experimenting with hot peppers.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-017",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-018",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "commute_method",
  "question": "How do I get to work?",
  "gold_answer": "walking",
  "n71_answer": "Driving a Zyvanta Sedan and walking to work (depending on school enrollment and MBA studies).",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-018",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "skill_acquisition",
  "question": "What am I currently learning?",
  "gold_answer": "Spanish",
  "n71_answer": "Spanish",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-018",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "planned_purchase",
  "question": "What big purchase am I planning?",
  "gold_answer": "stand-up desk",
  "n71_answer": "stand-up desk",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-018",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "n71_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-018",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "vehicle",
  "question": "List all vehicles I've had over time, from earliest to latest.",
  "gold_answer": "Zyvanta Sedan, Therwyn Compact, Dranthis Pickup",
  "n71_answer": "Zyvanta Sedan -> Therwyn Compact -> Dranthis Pickup",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-018",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "commute_method",
  "question": "How do I get to work?",
  "gold_answer": "commuter rail",
  "n71_answer": "commuter rail",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-018",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "skill_acquisition",
  "question": "What am I currently learning?",
  "gold_answer": "Uncertain — previously 'Spanish', but school changed",
  "n71_answer": "Spanish",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-018",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "planned_purchase",
  "question": "What big purchase am I planning?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-018",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is pottery, plays volleyball, and is part of running club.",
  "n71_answer": "volleyball, pottery",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-018",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "n71_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-019",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "financial_goal",
  "question": "What's my current financial goal?",
  "gold_answer": "maxing out retirement fund",
  "n71_answer": "maxing out retirement fund",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-019",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "housing_type",
  "question": "What type of home do I live in?",
  "gold_answer": "single-family house",
  "n71_answer": "single-family house",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-019",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "social_gathering",
  "question": "What's my regular social meetup?",
  "gold_answer": "pickup basketball Saturdays",
  "n71_answer": "pickup basketball on Saturdays",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-019",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-019",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "media_consumption",
  "question": "List all media I've consumed over time, from earliest to latest.",
  "gold_answer": "The Quorath Codex (book), a daily news podcast, Thyrelion (anime)",
  "n71_answer": "The Shawshank Redemption (watched 5 times) -> The King's Speech -> The Crown (finished June 2023) -> Pride and Prejudice (1995) -> Stranger Things (finished May 2023)",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-019",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "financial_goal",
  "question": "What's my current financial goal?",
  "gold_answer": "saving for wedding",
  "n71_answer": "saving for wedding",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-019",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "housing_type",
  "question": "What type of home do I live in?",
  "gold_answer": "Uncertain — previously 'single-family house', but relationship_status changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-019",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "social_gathering",
  "question": "What's my regular social meetup?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "pickup basketball on Saturdays",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-019",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "travel_plan,upcoming_event,family_event",
  "question": "What do I have coming up? List everything you know.",
  "gold_answer": "The user is planning hiking trip out west, has college reunion coming up, and has Thanksgiving gathering.",
  "n71_answer": "Thanksgiving gathering, college reunion, hiking trip out west, pickup basketball on Saturdays, planned purchase of new couch",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-019",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-020",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "20 min",
  "n71_answer": "20 min",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-020",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "dentist (every 6 months)",
  "n71_answer": "dentist (every 6 months)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-020",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "job_title",
  "question": "What is my job title?",
  "gold_answer": "team lead",
  "n71_answer": "team lead",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-020",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "n71_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-020",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "vehicle",
  "question": "List all vehicles I've had over time, from earliest to latest.",
  "gold_answer": "Zyvanta Sedan, Therwyn Compact, Xylorim Scooter",
  "n71_answer": "Zyvanta Sedan, Therwyn Compact",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-020",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "30 min",
  "n71_answer": "20 min",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-020",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "Uncertain — previously 'dentist (every 6 months)', but residence_location changed",
  "n71_answer": "dentist (every 6 months)",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-020",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "job_title",
  "question": "What is my job title?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-020",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is woodworking, plays golf, and is part of running club.",
  "n71_answer": "jewelry-making class, playing golf, woodworking",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-020",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "n71_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-021",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "work_schedule",
  "question": "What's my work schedule?",
  "gold_answer": "flexible hours",
  "n71_answer": "flexible hours",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-021",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Pyravar CrossFit",
  "n71_answer": "Pyravar CrossFit",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-021",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "insurance",
  "question": "What insurance do I have?",
  "gold_answer": "marketplace health plan",
  "n71_answer": "marketplace health plan",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-021",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "favorite_restaurant",
  "question": "What's my favorite restaurant?",
  "gold_answer": "Kinthar Grill",
  "n71_answer": "Kinthar Grill",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-021",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "subscription_service",
  "question": "What am I subscribed to?",
  "gold_answer": "meal kit (Kelvimeal)",
  "n71_answer": "Kelvimeal",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-021",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "n71_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-021",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "vehicle",
  "question": "List all vehicles I've had over time, from earliest to latest.",
  "gold_answer": "Therwyn Compact, Vorantel SUV, Xylorim Scooter",
  "n71_answer": "Therwyn Compact car, Vorantel SUV, Xylorim Scooter",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-021",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "work_schedule",
  "question": "What's my work schedule?",
  "gold_answer": "hybrid (3 office + 2 remote)",
  "n71_answer": "hybrid (3 office + 2 remote)",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-021",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Velthari Studio",
  "n71_answer": "Velthari Studio",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-021",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "insurance",
  "question": "What insurance do I have?",
  "gold_answer": "Uncertain — previously 'marketplace health plan', but employer changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-021",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "favorite_restaurant",
  "question": "What's my favorite restaurant?",
  "gold_answer": "Uncertain — previously 'Kinthar Grill', but work_location changed",
  "n71_answer": "I don't have that information.",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-021",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "subscription_service",
  "question": "What am I subscribed to?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-021",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "commute_method,commute_duration,work_location",
  "question": "Tell me everything you know about my commute.",
  "gold_answer": "The user commutes by subway for 50 min to Belvona Harbor.",
  "n71_answer": "- Commute duration: 50 minutes  \n- Commute method: subway  \n- Commute location: Belvona Harbor (office location)  \n- Commute experience: subway is generally convenient but can have delays  \n- Vehicle previously driven: Therwyn Compact car, then Vorantel SUV, currently Xylorim Scooter (though commute method is subway)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-021",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "n71_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-022",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "financial_goal",
  "question": "What's my current financial goal?",
  "gold_answer": "saving for kids' education fund",
  "n71_answer": "saving for kids' education fund",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-022",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "work_schedule",
  "question": "What's my work schedule?",
  "gold_answer": "9-to-5 fixed",
  "n71_answer": "9-to-5 fixed",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-022",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "residence_location",
  "question": "Where do I live?",
  "gold_answer": "Belvona Harbor",
  "n71_answer": "Belvona Harbor",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-022",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-022",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "partner",
  "question": "List all partners I've had over time, from earliest to latest.",
  "gold_answer": "James, Alex, Taylor",
  "n71_answer": "James, Alex, Taylor",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-022",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "financial_goal",
  "question": "What's my current financial goal?",
  "gold_answer": "saving for wedding",
  "n71_answer": "saving for wedding",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-022",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "work_schedule",
  "question": "What's my work schedule?",
  "gold_answer": "Uncertain — previously '9-to-5 fixed', but school changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-022",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "residence_location",
  "question": "Where do I live?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-022",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is woodworking, plays tennis, and is part of wine tasting club.",
  "n71_answer": "Stand-up comedy open mics, wine tasting club, potluck dinner group, tennis, listening to daily news podcast, woodworking (previous hobby).",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-022",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-023",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "sleep_pattern",
  "question": "What's my sleep schedule?",
  "gold_answer": "midnight-8am",
  "n71_answer": "midnight-8am",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-023",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Kaelstrom Fitness",
  "n71_answer": "Kaelstrom Fitness",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-023",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "dentist (every 6 months)",
  "n71_answer": "dentist (every 6 months)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-023",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "employer",
  "question": "Where do I work?",
  "gold_answer": "Caeltura Systems",
  "n71_answer": "Caeltura Systems",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-023",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-023",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "vehicle",
  "question": "List all vehicles I've had over time, from earliest to latest.",
  "gold_answer": "Zyvanta Sedan, Quorvel EV, Xylorim Scooter",
  "n71_answer": "Zyvanta Sedan, Quorvel EV, Xylorim Scooter",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-023",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "sleep_pattern",
  "question": "What's my sleep schedule?",
  "gold_answer": "irregular",
  "n71_answer": "unknown",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-023",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "home gym",
  "n71_answer": "home gym",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-023",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "Uncertain — previously 'dentist (every 6 months)', but health_condition changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-023",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "employer",
  "question": "Where do I work?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "You don't have that information.",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-023",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is video games, plays basketball, and is part of book club.",
  "n71_answer": "- Play video games  \n- Play basketball  \n- Participate in a book club  \n- Collect rare coins  \n- Collect rare records  \n- Collect rare figurines",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-023",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-024",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "housing_type",
  "question": "What type of home do I live in?",
  "gold_answer": "condo",
  "n71_answer": "condo",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-024",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "family_event",
  "question": "What family event do I have coming up?",
  "gold_answer": "family reunion",
  "n71_answer": "Family Reunion",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-024",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "Zynthera Bay beach trip (1 week)",
  "n71_answer": "Zynthera Bay beach trip",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-024",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-024",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "partner",
  "question": "List all partners I've had over time, from earliest to latest.",
  "gold_answer": "Sarah, Jessica, Taylor",
  "n71_answer": "Sarah, Jessica, Taylor",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-024",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "housing_type",
  "question": "What type of home do I live in?",
  "gold_answer": "townhouse",
  "n71_answer": "townhouse",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-024",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "family_event",
  "question": "What family event do I have coming up?",
  "gold_answer": "Uncertain — previously 'family reunion', but relationship_status changed",
  "n71_answer": "Family Reunion",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-024",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "Zynthera Bay beach trip (1 week)",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-024",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is video games, plays basketball, and is part of hiking club.",
  "n71_answer": "Playing video games, basketball, hiking, attending game night with neighbors, going to music festivals.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-024",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-025",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "job_title",
  "question": "What is my job title?",
  "gold_answer": "marketing lead",
  "n71_answer": "marketing lead",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-025",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "commute_method",
  "question": "How do I get to work?",
  "gold_answer": "walking",
  "n71_answer": "walking",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-025",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "work_schedule",
  "question": "What's my work schedule?",
  "gold_answer": "hybrid (3 office + 2 remote)",
  "n71_answer": "hybrid (3 office + 2 remote)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-025",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "favorite_restaurant",
  "question": "What's my favorite restaurant?",
  "gold_answer": "Velmara Taqueria",
  "n71_answer": "Velmara Taqueria",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-025",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "relationship_status",
  "question": "What's my relationship status?",
  "gold_answer": "in a relationship",
  "n71_answer": "in a relationship",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-025",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-025",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "subscription_service",
  "question": "List all subscription services I've had over time, from earliest to latest.",
  "gold_answer": "Xelvoria, meal kit (Kelvimeal), Pyrethon Plus",
  "n71_answer": "Xelvoria subscriber -> book club -> Pyrethon Plus -> Kelvimeal",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-025",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "job_title",
  "question": "What is my job title?",
  "gold_answer": "staff engineer",
  "n71_answer": "marketing lead",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-025",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "commute_method",
  "question": "How do I get to work?",
  "gold_answer": "bus",
  "n71_answer": "walking",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-025",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "work_schedule",
  "question": "What's my work schedule?",
  "gold_answer": "Uncertain — previously 'hybrid (3 office + 2 remote)', but employer changed",
  "n71_answer": "hybrid (3 office + 2 remote)",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-025",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "favorite_restaurant",
  "question": "What's my favorite restaurant?",
  "gold_answer": "Uncertain — previously 'Velmara Taqueria', but work_location changed",
  "n71_answer": "Velmara Taqueria",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-025",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "relationship_status",
  "question": "What's my relationship status?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "in a relationship",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-025",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is woodworking, plays soccer, and is part of book club.",
  "n71_answer": "woodworking, vinyl record collecting, soccer",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-025",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-026",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "commute_method",
  "question": "How do I get to work?",
  "gold_answer": "bike",
  "n71_answer": "By bike",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-026",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Thandrel YMCA",
  "n71_answer": "Thandrel YMCA",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-026",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "tutoring session (weekly)",
  "n71_answer": "weekly tutoring session",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-026",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "n71_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-026",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "subscription_service",
  "question": "List all subscription services I've had over time, from earliest to latest.",
  "gold_answer": "Thyranote, meal kit (Kelvimeal), Pyrethon Plus",
  "n71_answer": "Thyranote, Pyrethon Plus, Kelvimeal",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-026",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "commute_method",
  "question": "How do I get to work?",
  "gold_answer": "subway",
  "n71_answer": "Subway",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-026",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Uncertain — previously 'Thandrel YMCA', but residence_location changed",
  "n71_answer": "Thandrel YMCA",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-026",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-026",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is guitar, plays pickleball, and is part of cycling club.",
  "n71_answer": "pickleball, guitar, trivia night on Wednesdays",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-026",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "n71_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-027",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "10 min",
  "n71_answer": "10 min",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-027",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "physical therapy (2x/week)",
  "n71_answer": "physical therapy (2x/week)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-027",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "diet_preference",
  "question": "What's my diet?",
  "gold_answer": "keto",
  "n71_answer": "keto",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-027",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "n71_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-027",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "subscription_service",
  "question": "List all subscription services I've had over time, from earliest to latest.",
  "gold_answer": "Xelvoria, Thyranote, Zorvista Premium",
  "n71_answer": "Xelvoria, Zorvista Premium",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-027",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "15 min",
  "n71_answer": "15 min",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-027",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "Uncertain — previously 'physical therapy (2x/week)', but residence_location changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-027",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "diet_preference",
  "question": "What's my diet?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-027",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is photography, plays golf, and is part of hiking club.",
  "n71_answer": "golf, photography",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-027",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "n71_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-028",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "job_title",
  "question": "What is my job title?",
  "gold_answer": "staff engineer",
  "n71_answer": "staff engineer",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-028",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "10 min",
  "n71_answer": "10 min",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-028",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "insurance",
  "question": "What insurance do I have?",
  "gold_answer": "interim coverage plan",
  "n71_answer": "interim coverage plan",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-028",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "favorite_restaurant",
  "question": "What's my favorite restaurant?",
  "gold_answer": "Velmara Taqueria",
  "n71_answer": "Velmara Taqueria",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-028",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "exercise_routine",
  "question": "What's my exercise routine?",
  "gold_answer": "walking daily",
  "n71_answer": "walking daily",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-028",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-028",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "subscription_service",
  "question": "List all subscription services I've had over time, from earliest to latest.",
  "gold_answer": "Zorvista Premium, Dranthazon, Pyrethon Plus",
  "n71_answer": "Zorvista Premium, hiking club, Pyrethon Plus",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-028",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "job_title",
  "question": "What is my job title?",
  "gold_answer": "designer",
  "n71_answer": "designer",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-028",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "20 min",
  "n71_answer": "20 min",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-028",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "insurance",
  "question": "What insurance do I have?",
  "gold_answer": "Uncertain — previously 'interim coverage plan', but employer changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-028",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "favorite_restaurant",
  "question": "What's my favorite restaurant?",
  "gold_answer": "Uncertain — previously 'Velmara Taqueria', but work_location changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-028",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "exercise_routine",
  "question": "What's my exercise routine?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-028",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is gardening, plays basketball, and is part of hiking club.",
  "n71_answer": "Basketball",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-028",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-029",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "work_schedule",
  "question": "What's my work schedule?",
  "gold_answer": "10-to-7 fixed",
  "n71_answer": "10-to-7 fixed",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-029",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "financial_goal",
  "question": "What's my current financial goal?",
  "gold_answer": "maxing out retirement fund",
  "n71_answer": "maxing out retirement fund",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-029",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "sleep_pattern",
  "question": "What's my sleep schedule?",
  "gold_answer": "11pm-7am",
  "n71_answer": "11pm to 7am",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-029",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "n71_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-029",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "media_consumption",
  "question": "List all media I've consumed over time, from earliest to latest.",
  "gold_answer": "Whispers of Zarinthea (TV), Thyrelion (anime), Xelvari Dawn (book)",
  "n71_answer": "The New Yorker short stories, news online (The New York Times, The Guardian), Whispers of Zarinthea (TV), Xelvari Dawn (book)",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-029",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "work_schedule",
  "question": "What's my work schedule?",
  "gold_answer": "hybrid (3 office + 2 remote)",
  "n71_answer": "10-to-7 fixed",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-029",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "financial_goal",
  "question": "What's my current financial goal?",
  "gold_answer": "Uncertain — previously 'maxing out retirement fund', but school changed",
  "n71_answer": "maxing out retirement fund",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-029",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "sleep_pattern",
  "question": "What's my sleep schedule?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "11pm to 7am",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-029",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is video games, plays volleyball, and is part of wine tasting club.",
  "n71_answer": "volleyball, wine tasting, potluck dinners, video games, reading fiction, reading The New Yorker short stories, reading news online (The New York Times, The Guardian) during lunch break, reading opinion pieces and editorials",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-029",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "n71_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-030",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "work_schedule",
  "question": "What's my work schedule?",
  "gold_answer": "shift work (day)",
  "n71_answer": "shift work (day)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-030",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "10 min",
  "n71_answer": "10 min",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-030",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "insurance",
  "question": "What insurance do I have?",
  "gold_answer": "employer + supplemental",
  "n71_answer": "employer + supplemental",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-030",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "commute_method",
  "question": "How do I get to work?",
  "gold_answer": "remote (no commute)",
  "n71_answer": "Remote",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-030",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "financial_goal",
  "question": "What's my current financial goal?",
  "gold_answer": "paying off student loans",
  "n71_answer": "paying off student loans",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-030",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "n71_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-030",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "media_consumption",
  "question": "List all media I've consumed over time, from earliest to latest.",
  "gold_answer": "The Quorath Codex (book), Thyrelion (anime), Xelvari Dawn (book)",
  "n71_answer": "The Quorath Codex (2023-03-05) -> Thyrelion (anime) (2023-03-08) -> Xelvari Dawn (book) (2023-03-15)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-030",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "work_schedule",
  "question": "What's my work schedule?",
  "gold_answer": "9-to-5 fixed",
  "n71_answer": "9-to-5 fixed",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-030",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "10 min",
  "n71_answer": "around 10 min",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-030",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "insurance",
  "question": "What insurance do I have?",
  "gold_answer": "Uncertain — previously 'employer + supplemental', but employer changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-030",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "commute_method",
  "question": "How do I get to work?",
  "gold_answer": "Uncertain — previously 'remote (no commute)', but work_location changed",
  "n71_answer": "Around 10 min commute",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-030",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "financial_goal",
  "question": "What's my current financial goal?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none — explicitly removed",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-030",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is video games, plays pickleball, and is part of cycling club.",
  "n71_answer": "Playing video games, participating in game night with neighbors, cycling with the cycling club, vintage postcard collecting, vinyl record collecting, pickleball.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-030",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "n71_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-031",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "tutoring session (weekly)",
  "n71_answer": "tutoring session (weekly)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-031",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Pyravar CrossFit",
  "n71_answer": "Pyravar CrossFit",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-031",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "sleep_pattern",
  "question": "What's my sleep schedule?",
  "gold_answer": "midnight-8am",
  "n71_answer": "midnight-8am",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-031",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "1.5 hours",
  "n71_answer": "1.5 hours",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-031",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-031",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "media_consumption",
  "question": "List all media I've consumed over time, from earliest to latest.",
  "gold_answer": "a true crime podcast, The Quorath Codex (book), Xelvari Dawn (book)",
  "n71_answer": "cooking -> cycling -> traditional Irish music and dance -> musette and bourrée -> traditional German music and dance -> polka and waltz -> history of immigration in the US (late 1800s and early 1900s) -> true crime podcast -> Xelvari Dawn (book)",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-031",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "dentist (every 6 months)",
  "n71_answer": "dentist (every 6 months)",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-031",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "home gym",
  "n71_answer": "home gym",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-031",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "sleep_pattern",
  "question": "What's my sleep schedule?",
  "gold_answer": "Uncertain — previously 'midnight-8am', but health_condition changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-031",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-031",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is cooking, plays cycling, and is part of volunteer group.",
  "n71_answer": "- Game night with neighbors  \n- Cooking  \n- Cycling  \n- Listening to true crime podcasts  \n- Interests in traditional Irish music and dance, musette and bourrée, traditional German music and dance, polka and waltz, and history of immigration in the US (late 1800s and early 1900s)",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-031",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-032",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "family_event",
  "question": "What family event do I have coming up?",
  "gold_answer": "baby shower",
  "n71_answer": "baby shower",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-032",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "living_arrangement",
  "question": "Who do I live with?",
  "gold_answer": "with roommate",
  "n71_answer": "roommate",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-032",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "planned_purchase",
  "question": "What big purchase am I planning?",
  "gold_answer": "new laptop",
  "n71_answer": "a new laptop",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-032",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-032",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "subscription_service",
  "question": "List all subscription services I've had over time, from earliest to latest.",
  "gold_answer": "Thyranote, Zorvista Premium, meal kit (Kelvimeal)",
  "n71_answer": "Thyranote, Zorvista Premium, Kelvimeal",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-032",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "family_event",
  "question": "What family event do I have coming up?",
  "gold_answer": "parent's birthday",
  "n71_answer": "parents' birthday",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-032",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "living_arrangement",
  "question": "Who do I live with?",
  "gold_answer": "Uncertain — previously 'with roommate', but relationship_status changed",
  "n71_answer": "Mike",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-032",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "planned_purchase",
  "question": "What big purchase am I planning?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "new laptop",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-032",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is reading, plays basketball, and is part of running club.",
  "n71_answer": "reading, playing basketball, running club, pickup basketball Saturdays",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-032",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-033",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "insurance",
  "question": "What insurance do I have?",
  "gold_answer": "dental + vision bundle",
  "n71_answer": "dental + vision bundle",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-033",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "favorite_restaurant",
  "question": "What's my favorite restaurant?",
  "gold_answer": "Lorwen & Sage",
  "n71_answer": "Lorwen & Sage",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-033",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "work_project",
  "question": "What project am I working on?",
  "gold_answer": "infrastructure upgrade",
  "n71_answer": "infrastructure upgrade",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-033",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "10 min",
  "n71_answer": "about 10 min",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-033",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "diet_preference",
  "question": "What's my diet?",
  "gold_answer": "vegetarian",
  "n71_answer": "vegetarian",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-033",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "n71_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-033",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "media_consumption",
  "question": "List all media I've consumed over time, from earliest to latest.",
  "gold_answer": "The Veldrakon Spiral (TV), a daily news podcast, Thyrelion (anime)",
  "n71_answer": "The Veldrakon Spiral (TV) -> Thyrelion (anime)",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-033",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "insurance",
  "question": "What insurance do I have?",
  "gold_answer": "employer-provided health plan",
  "n71_answer": "dental + vision bundle",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-033",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "favorite_restaurant",
  "question": "What's my favorite restaurant?",
  "gold_answer": "Velmara Taqueria",
  "n71_answer": "Lorwen & Sage",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-033",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "work_project",
  "question": "What project am I working on?",
  "gold_answer": "Uncertain — previously 'infrastructure upgrade', but employer changed",
  "n71_answer": "infrastructure upgrade",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-033",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "Uncertain — previously '10 min', but work_location changed",
  "n71_answer": "about 10 min",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-033",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "diet_preference",
  "question": "What's my diet?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "vegetarian",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-033",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is pottery, plays pickleball, and is part of wine tasting club.",
  "n71_answer": "pottery, grilling, experimenting with BBQ sauce flavors, pickleball, daily news podcast, science and technology stories, wine tasting club",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-033",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "n71_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-034",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "exercise_routine",
  "question": "What's my exercise routine?",
  "gold_answer": "running 3x/week",
  "n71_answer": "running 3x/week",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-034",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Kaelstrom Fitness",
  "n71_answer": "Kaelstrom Fitness",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-034",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "medication",
  "question": "What medication am I taking?",
  "gold_answer": "Korathril",
  "n71_answer": "Korathril",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-034",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "work_location",
  "question": "Where is my office?",
  "gold_answer": "remote",
  "n71_answer": "remote",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-034",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-034",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "subscription_service",
  "question": "List all subscription services I've had over time, from earliest to latest.",
  "gold_answer": "Xelvoria, Thyranote, Zorvista Premium",
  "n71_answer": "Thyranote subscriber -> wine tasting club -> Zorvista Premium",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-034",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "exercise_routine",
  "question": "What's my exercise routine?",
  "gold_answer": "weight training 4x/week",
  "n71_answer": "weight training 4x/week",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-034",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Pyravar CrossFit",
  "n71_answer": "Pyravar CrossFit",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-034",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "medication",
  "question": "What medication am I taking?",
  "gold_answer": "Uncertain — previously 'Korathril', but health_condition changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-034",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "work_location",
  "question": "Where is my office?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-034",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is woodworking, plays cycling, and is part of wine tasting club.",
  "n71_answer": "cycling, pickup basketball on Saturdays",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-034",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-035",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "housing_type",
  "question": "What type of home do I live in?",
  "gold_answer": "dormitory",
  "n71_answer": "dormitory",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-035",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "family_event",
  "question": "What family event do I have coming up?",
  "gold_answer": "family reunion",
  "n71_answer": "family reunion",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-035",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "employer",
  "question": "Where do I work?",
  "gold_answer": "Zypherix Labs",
  "n71_answer": "Zypherix Labs",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-035",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "n71_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-035",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "subscription_service",
  "question": "List all subscription services I've had over time, from earliest to latest.",
  "gold_answer": "Zorvista Premium, meal kit (Kelvimeal), Pyrethon Plus",
  "n71_answer": "Zorvista Premium, Pyrethon Plus",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-035",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "housing_type",
  "question": "What type of home do I live in?",
  "gold_answer": "two-bedroom apartment",
  "n71_answer": "two-bedroom apartment",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-035",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "family_event",
  "question": "What family event do I have coming up?",
  "gold_answer": "Uncertain — previously 'family reunion', but relationship_status changed",
  "n71_answer": "family reunion",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-035",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "employer",
  "question": "Where do I work?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-035",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is reading, plays running club, and is part of volunteer group.",
  "n71_answer": "tailgating, reading",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-035",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "n71_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-036",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "30 min",
  "n71_answer": "30 min",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-036",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Zorathel Peak Gym",
  "n71_answer": "Zorathel Peak Gym",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-036",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "planned_purchase",
  "question": "What big purchase am I planning?",
  "gold_answer": "electric car",
  "n71_answer": "electric car",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-036",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-036",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "partner",
  "question": "List all partners I've had over time, from earliest to latest.",
  "gold_answer": "James, Jordan, Taylor",
  "n71_answer": "James, Jordan, Taylor",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-036",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "10 min",
  "n71_answer": "10 min",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-036",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Uncertain — previously 'Zorathel Peak Gym', but residence_location changed",
  "n71_answer": "Unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-036",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "planned_purchase",
  "question": "What big purchase am I planning?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-036",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is drawing/painting, plays running club, and is part of wine tasting club.",
  "n71_answer": "Drawing/painting, biking, reading The Quorath Codex (book)",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-036",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-037",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "family_event",
  "question": "What family event do I have coming up?",
  "gold_answer": "child's school event",
  "n71_answer": "Child's school event",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-037",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "housing_type",
  "question": "What type of home do I live in?",
  "gold_answer": "shared house",
  "n71_answer": "shared house",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-037",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "employer",
  "question": "Where do I work?",
  "gold_answer": "Korvath & Co.",
  "n71_answer": "Korvath & Co.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-037",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "n71_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-037",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "media_consumption",
  "question": "List all media I've consumed over time, from earliest to latest.",
  "gold_answer": "a daily news podcast, Thyrelion (anime), a history documentary series",
  "n71_answer": "daily news podcast -> Thyrelion (anime) -> history documentary series",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-037",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "family_event",
  "question": "What family event do I have coming up?",
  "gold_answer": "none currently",
  "n71_answer": "Child's school event",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-037",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "housing_type",
  "question": "What type of home do I live in?",
  "gold_answer": "Uncertain — previously 'shared house', but relationship_status changed",
  "n71_answer": "shared house",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-037",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "employer",
  "question": "Where do I work?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-037",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is gardening, plays volleyball, and is part of wine tasting club.",
  "n71_answer": "Volleyball, gardening, wine tasting club, potluck dinner group, watching history documentary series, watching anime (Thyrelion)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-037",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "n71_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-038",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "cross-country road trip (2 weeks)",
  "n71_answer": "Cross-country road trip (2 weeks)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-038",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "family_event",
  "question": "What family event do I have coming up?",
  "gold_answer": "sibling's wedding",
  "n71_answer": "Sibling's wedding",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-038",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "work_location",
  "question": "Where is my office?",
  "gold_answer": "Thornvale Crossing",
  "n71_answer": "Thornvale Crossing",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-038",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "n71_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-038",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "vehicle",
  "question": "List all vehicles I've had over time, from earliest to latest.",
  "gold_answer": "Therwyn Compact, Quorvel EV, Vorantel SUV",
  "n71_answer": "Quorvel EV, Vorantel SUV, Therwyn Compact",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-038",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "island getaway",
  "n71_answer": "cross-country road trip (2 weeks)",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-038",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "family_event",
  "question": "What family event do I have coming up?",
  "gold_answer": "Uncertain — previously 'sibling's wedding', but relationship_status changed",
  "n71_answer": "sibling's wedding",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-038",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "work_location",
  "question": "Where is my office?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "Thornvale Crossing",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-038",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is photography, plays running club, and is part of cycling club.",
  "n71_answer": "Photography, landscape photography, cycling, running club membership, outdoor activities with partner, discussing hobbies with sibling.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-038",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "n71_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-039",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "skill_acquisition",
  "question": "What am I currently learning?",
  "gold_answer": "Spanish",
  "n71_answer": "Spanish",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-039",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "commute_method",
  "question": "How do I get to work?",
  "gold_answer": "bike",
  "n71_answer": "Bike",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-039",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "dietary_restriction",
  "question": "What dietary restrictions do I have?",
  "gold_answer": "no caffeine",
  "n71_answer": "no caffeine",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-039",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-039",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "media_consumption",
  "question": "List all media I've consumed over time, from earliest to latest.",
  "gold_answer": "Whispers of Zarinthea (TV), a true crime podcast, Xelvari Dawn (book)",
  "n71_answer": "Whispers of Zarinthea (TV), true crime podcast, Xelvari Dawn (book)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-039",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "skill_acquisition",
  "question": "What am I currently learning?",
  "gold_answer": "Japanese",
  "n71_answer": "Japanese",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-039",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "commute_method",
  "question": "How do I get to work?",
  "gold_answer": "Uncertain — previously 'bike', but school changed",
  "n71_answer": "bike",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-039",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "dietary_restriction",
  "question": "What dietary restrictions do I have?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "no caffeine",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-039",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is knitting, plays soccer, and is part of running club.",
  "n71_answer": "knitting, soccer, studying Japanese, listening to a true crime podcast, reading the book Xelvari Dawn, watching Whispers of Zarinthea (TV)",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-039",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-040",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "dietary_restriction",
  "question": "What dietary restrictions do I have?",
  "gold_answer": "dairy-free",
  "n71_answer": "dairy-free",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-040",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "none",
  "n71_answer": "Pyravar CrossFit",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-040",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "medication",
  "question": "What medication am I taking?",
  "gold_answer": "Velnorath",
  "n71_answer": "Velnorath",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-040",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "commute_method",
  "question": "How do I get to work?",
  "gold_answer": "bus",
  "n71_answer": "bus",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-040",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "n71_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-040",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "vehicle",
  "question": "List all vehicles I've had over time, from earliest to latest.",
  "gold_answer": "Zyvanta Sedan, Quorvel EV, Vorantel SUV",
  "n71_answer": "Quorvel EV, Vorantel SUV",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-040",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "dietary_restriction",
  "question": "What dietary restrictions do I have?",
  "gold_answer": "nut-free",
  "n71_answer": "nut-free",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-040",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Pyravar CrossFit",
  "n71_answer": "Pyravar CrossFit",
  "judge_pass": true,
  "pass_type": "trivial",
  "failure_class": ""
 },
 {
  "episode": "pl-040",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "medication",
  "question": "What medication am I taking?",
  "gold_answer": "Uncertain — previously 'Velnorath', but health_condition changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-040",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "commute_method",
  "question": "How do I get to work?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-040",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is woodworking, plays volleyball, and is part of cycling club.",
  "n71_answer": "volleyball, woodworking, cycling club, game night with neighbors",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-040",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "n71_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-041",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "medication",
  "question": "What medication am I taking?",
  "gold_answer": "Korathril",
  "n71_answer": "Korathril",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-041",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Zorathel Peak Gym",
  "n71_answer": "Zorathel Peak Gym",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-041",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "exercise_routine",
  "question": "What's my exercise routine?",
  "gold_answer": "yoga 2x/week",
  "n71_answer": "yoga 2x/week",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-041",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "hiking trip out west",
  "n71_answer": "California Coast Road Trip",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-041",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "n71_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-041",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "vehicle",
  "question": "List all vehicles I've had over time, from earliest to latest.",
  "gold_answer": "Dranthis Pickup, Vorantel SUV, Xylorim Scooter",
  "n71_answer": "Dranthis Pickup, Vorantel SUV",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-041",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "medication",
  "question": "What medication am I taking?",
  "gold_answer": "Dyntheral",
  "n71_answer": "unknown",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-041",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Velthari Studio",
  "n71_answer": "unknown",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-041",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "exercise_routine",
  "question": "What's my exercise routine?",
  "gold_answer": "Uncertain — previously 'yoga 2x/week', but health_condition changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-041",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "California Coast Road Trip",
  "judge_pass": false,
  "pass_type": "never_knew",
  "failure_class": ""
 },
 {
  "episode": "pl-041",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is photography, plays pickleball, and is part of book club.",
  "n71_answer": "Photography, pickleball, sugar-free baking, trivia night on Wednesdays, book club, famiglia dei savoia",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-041",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "n71_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-042",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "exercise_routine",
  "question": "What's my exercise routine?",
  "gold_answer": "walking daily",
  "n71_answer": "walking daily",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-042",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Velthari Studio",
  "n71_answer": "Velthari Studio",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-042",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "medication",
  "question": "What medication am I taking?",
  "gold_answer": "Velnorath",
  "n71_answer": "Velnorath",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-042",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "Zynthera Bay beach trip (1 week)",
  "n71_answer": "Zynthera Bay beach trip",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-042",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "n71_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-042",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "vehicle",
  "question": "List all vehicles I've had over time, from earliest to latest.",
  "gold_answer": "Zyvanta Sedan, Therwyn Compact, Xylorim Scooter",
  "n71_answer": "Zyvanta Sedan, Therwyn Compact, Xylorim Scooter",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-042",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "exercise_routine",
  "question": "What's my exercise routine?",
  "gold_answer": "swimming 2x/week",
  "n71_answer": "swimming 2x/week",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-042",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "none",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-042",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "medication",
  "question": "What medication am I taking?",
  "gold_answer": "Uncertain — previously 'Velnorath', but health_condition changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-042",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "Zynthera Bay beach trip (1 week)",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-042",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is guitar, plays volleyball, and is part of running club.",
  "n71_answer": "Playing guitar, volleyball, watching history documentary series, reading Daily Journal, running (three times a week), swimming (2x/week).",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-042",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "n71_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-043",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "no travel planned",
  "n71_answer": "Bohol Trip",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-043",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "family_event",
  "question": "What family event do I have coming up?",
  "gold_answer": "baby shower",
  "n71_answer": "baby shower",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-043",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "chiropractor (biweekly)",
  "n71_answer": "Chiropractor every two weeks",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-043",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "n71_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-043",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "subscription_service",
  "question": "List all subscription services I've had over time, from earliest to latest.",
  "gold_answer": "Xelvoria, Thyranote, Zorvista Premium",
  "n71_answer": "Thyranote subscriber, book club, Zorvista Premium",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-043",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "island getaway",
  "n71_answer": "Bohol Trip",
  "judge_pass": false,
  "pass_type": "never_knew",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-043",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "family_event",
  "question": "What family event do I have coming up?",
  "gold_answer": "Uncertain — previously 'baby shower', but relationship_status changed",
  "n71_answer": "baby shower",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-043",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-043",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is knitting, plays rock climbing, and is part of book club.",
  "n71_answer": "Rock climbing, knitting, reading (The Quorath Codex), pickup basketball on Saturdays, guided tour of landmarks",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-043",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "n71_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-044",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "sleep_pattern",
  "question": "What's my sleep schedule?",
  "gold_answer": "varies by shift",
  "n71_answer": "varies by shift",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-044",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Velthari Studio",
  "n71_answer": "Velthari Studio",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-044",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "exercise_routine",
  "question": "What's my exercise routine?",
  "gold_answer": "walking daily",
  "n71_answer": "walking daily",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-044",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "social_gathering",
  "question": "What's my regular social meetup?",
  "gold_answer": "game night with neighbors",
  "n71_answer": "game night with neighbors",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-044",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "n71_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-044",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "subscription_service",
  "question": "List all subscription services I've had over time, from earliest to latest.",
  "gold_answer": "Xelvoria, Zorvista Premium, meal kit (Kelvimeal)",
  "n71_answer": "Xelvoria, Zorvista Premium",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-044",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "sleep_pattern",
  "question": "What's my sleep schedule?",
  "gold_answer": "1am-9am",
  "n71_answer": "1am-9am",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-044",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Thandrel YMCA",
  "n71_answer": "Velthari Studio",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-044",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "exercise_routine",
  "question": "What's my exercise routine?",
  "gold_answer": "Uncertain — previously 'walking daily', but health_condition changed",
  "n71_answer": "walking daily",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-044",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "social_gathering",
  "question": "What's my regular social meetup?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "game night with neighbors",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-044",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is knitting, plays basketball, and is part of running club.",
  "n71_answer": "knitting, sourdough bread baking, basketball, experimenting with hot sauces at home, experimenting with BBQ rub recipes, thinking about hosting a BBQ party soon, experimenting with wood for smoking",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-044",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "n71_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-045",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "mountain ski trip",
  "n71_answer": "Mountain ski trip",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-045",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "living_arrangement",
  "question": "Who do I live with?",
  "gold_answer": "with partner and child",
  "n71_answer": "partner and child",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-045",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "dietary_restriction",
  "question": "What dietary restrictions do I have?",
  "gold_answer": "no alcohol",
  "n71_answer": "no alcohol",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-045",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-045",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "subscription_service",
  "question": "List all subscription services I've had over time, from earliest to latest.",
  "gold_answer": "Xelvoria, Zorvista Premium, Daily Journal subscription",
  "n71_answer": "Daily Journal subscription, wine tasting club, UBC applicant, Zorvista Premium",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-045",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "no travel planned",
  "n71_answer": "Mountain ski trip",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-045",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "living_arrangement",
  "question": "Who do I live with?",
  "gold_answer": "Uncertain — previously 'with partner and child', but relationship_status changed",
  "n71_answer": "James",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-045",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "dietary_restriction",
  "question": "What dietary restrictions do I have?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "no alcohol",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-045",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is cooking, plays volleyball, and is part of wine tasting club.",
  "n71_answer": "Playing games, cooking, volleyball, reading, books, game night with neighbors",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-045",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-046",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "dietary_restriction",
  "question": "What dietary restrictions do I have?",
  "gold_answer": "dairy-free",
  "n71_answer": "dairy-free",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-046",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Pyravar CrossFit",
  "n71_answer": "Pyravar CrossFit",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-046",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "sleep_pattern",
  "question": "What's my sleep schedule?",
  "gold_answer": "11pm-5:30am (early riser)",
  "n71_answer": "11pm-5:30am (early riser)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-046",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "school",
  "question": "Where am I studying?",
  "gold_answer": "Quivira Springs Community College",
  "n71_answer": "I don't have that information.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-046",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "n71_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-046",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "subscription_service",
  "question": "List all subscription services I've had over time, from earliest to latest.",
  "gold_answer": "Xelvoria, meal kit (Kelvimeal), Dranthazon",
  "n71_answer": "Dranthazon",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-046",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "dietary_restriction",
  "question": "What dietary restrictions do I have?",
  "gold_answer": "nut-free",
  "n71_answer": "nut-free",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-046",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Crysthene Pool",
  "n71_answer": "Crysthene Pool",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-046",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "sleep_pattern",
  "question": "What's my sleep schedule?",
  "gold_answer": "Uncertain — previously '11pm-5:30am (early riser)', but health_condition changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-046",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "school",
  "question": "Where am I studying?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "trivial",
  "failure_class": ""
 },
 {
  "episode": "pl-046",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is pottery, plays pickleball, and is part of book club.",
  "n71_answer": "hiking, pottery, pickleball",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-046",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "n71_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-047",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "job_title",
  "question": "What is my job title?",
  "gold_answer": "designer",
  "n71_answer": "designer",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-047",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "1 hour",
  "n71_answer": "1 hour",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-047",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "work_location",
  "question": "Where is my office?",
  "gold_answer": "Tesswick Bay",
  "n71_answer": "Tesswick Bay",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-047",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "commute_method",
  "question": "How do I get to work?",
  "gold_answer": "bike",
  "n71_answer": "By bike",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-047",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "social_gathering",
  "question": "What's my regular social meetup?",
  "gold_answer": "pickup basketball Saturdays",
  "n71_answer": "book club",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-047",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "n71_answer": "\"We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.\"",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-047",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "vehicle",
  "question": "List all vehicles I've had over time, from earliest to latest.",
  "gold_answer": "Therwyn Compact, Vorantel SUV, Xylorim Scooter",
  "n71_answer": "Vorantel SUV, Xylorim Scooter, Therwyn Compact",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-047",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "job_title",
  "question": "What is my job title?",
  "gold_answer": "product manager",
  "n71_answer": "product manager",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-047",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "1.5 hours",
  "n71_answer": "1.5 hours",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-047",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "work_location",
  "question": "Where is my office?",
  "gold_answer": "Uncertain — previously 'Tesswick Bay', but employer changed",
  "n71_answer": "Quivira Springs",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-047",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "commute_method",
  "question": "How do I get to work?",
  "gold_answer": "Uncertain — previously 'bike', but work_location changed",
  "n71_answer": "By Xylorim Scooter",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-047",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "social_gathering",
  "question": "What's my regular social meetup?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "pickup basketball Saturdays",
  "judge_pass": false,
  "pass_type": "never_knew",
  "failure_class": ""
 },
 {
  "episode": "pl-047",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is gardening, plays basketball, and is part of book club.",
  "n71_answer": "basketball, gardening, listening to true crime podcasts",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-047",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "n71_answer": "\"We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.\"",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-048",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "dietary_restriction",
  "question": "What dietary restrictions do I have?",
  "gold_answer": "low-sugar",
  "n71_answer": "low-sugar",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-048",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Velthari Studio",
  "n71_answer": "Velthari Studio",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-048",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "sleep_pattern",
  "question": "What's my sleep schedule?",
  "gold_answer": "midnight-8am",
  "n71_answer": "midnight-8am",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-048",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "physical therapy (2x/week)",
  "n71_answer": "physical therapy twice a week",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-048",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-048",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "partner",
  "question": "List all partners I've had over time, from earliest to latest.",
  "gold_answer": "Jessica, Alex, Chris",
  "n71_answer": "Alex, Chris",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-048",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "dietary_restriction",
  "question": "What dietary restrictions do I have?",
  "gold_answer": "no processed food",
  "n71_answer": "no processed food",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-048",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Kaelstrom Fitness",
  "n71_answer": "Kaelstrom Fitness",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-048",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "sleep_pattern",
  "question": "What's my sleep schedule?",
  "gold_answer": "Uncertain — previously 'midnight-8am', but health_condition changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-048",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-048",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is woodworking, plays cycling, and is part of cycling club.",
  "n71_answer": "cycling",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-048",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-049",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "insurance",
  "question": "What insurance do I have?",
  "gold_answer": "employer + supplemental",
  "n71_answer": "employer + supplemental",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-049",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Velthari Studio",
  "n71_answer": "Velthari Studio",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-049",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "work_project",
  "question": "What project am I working on?",
  "gold_answer": "mobile app development",
  "n71_answer": "mobile app development",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-049",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "favorite_restaurant",
  "question": "What's my favorite restaurant?",
  "gold_answer": "Zefiro's Kitchen",
  "n71_answer": "Zefiro's Kitchen",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-049",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "family_event",
  "question": "What family event do I have coming up?",
  "gold_answer": "anniversary dinner",
  "n71_answer": "anniversary dinner",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-049",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "n71_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-049",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "subscription_service",
  "question": "List all subscription services I've had over time, from earliest to latest.",
  "gold_answer": "Xelvoria, Dranthazon, Pyrethon Plus",
  "n71_answer": "Xelvoria, Pyrethon Plus",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-049",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "insurance",
  "question": "What insurance do I have?",
  "gold_answer": "dental + vision bundle",
  "n71_answer": "dental + vision bundle",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-049",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Kaelstrom Fitness",
  "n71_answer": "Kaelstrom Fitness",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-049",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "work_project",
  "question": "What project am I working on?",
  "gold_answer": "Uncertain — previously 'mobile app development', but employer changed",
  "n71_answer": "London Summer Vacation Project",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-049",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "favorite_restaurant",
  "question": "What's my favorite restaurant?",
  "gold_answer": "Uncertain — previously 'Zefiro's Kitchen', but work_location changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-049",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "family_event",
  "question": "What family event do I have coming up?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-049",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is guitar, plays golf, and is part of book club.",
  "n71_answer": "golf, guitar",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-049",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "n71_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-050",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Pyravar CrossFit",
  "n71_answer": "Zorathel Peak Gym",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-050",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "50 min",
  "n71_answer": "unknown",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-050",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "housing_type",
  "question": "What type of home do I live in?",
  "gold_answer": "single-family house",
  "n71_answer": "single-family house",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-050",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-050",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "subscription_service",
  "question": "List all subscription services I've had over time, from earliest to latest.",
  "gold_answer": "Thyranote, meal kit (Kelvimeal), Dranthazon",
  "n71_answer": "Kelvimeal meal kit service -> volunteer group -> Dranthazon",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-050",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Zorathel Peak Gym",
  "n71_answer": "Pyravar CrossFit",
  "judge_pass": false,
  "pass_type": "never_knew",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-050",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "Uncertain — previously '50 min', but residence_location changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "trivial",
  "failure_class": ""
 },
 {
  "episode": "pl-050",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "housing_type",
  "question": "What type of home do I live in?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-050",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is woodworking, plays basketball, and is part of volunteer group.",
  "n71_answer": "woodworking, baking, cooking, basketball, photography",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-050",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-001",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Zelthra Pack",
  "n71_answer": "Zelthra Pack",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-001",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "build_command",
  "question": "What's the build command?",
  "gold_answer": "zelthra pack --sourcemap",
  "n71_answer": "zelthra pack --sourcemap",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-001",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "project_structure",
  "question": "What's our project structure?",
  "gold_answer": "single-app with /src and /lib",
  "n71_answer": "single-app with /src and /lib",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-001",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "test_command",
  "question": "What's the test command?",
  "gold_answer": "tarvex verify --integration",
  "n71_answer": "tarvex verify --integration",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-001",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "documentation_url",
  "question": "Where are our docs?",
  "gold_answer": "github.com/aurora-team/docs",
  "n71_answer": "github.com/aurora-team/docs",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-001",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "OutOfMemoryError: Java heap space.\nGC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'.\nHeap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z.\nRecommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "n71_answer": "OutOfMemoryError: Java heap space. GC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'. Heap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z. Recommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-001",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "sprint_deadline",
  "question": "List all sprint deadlines we've had, from earliest to latest.",
  "gold_answer": "2025 April 11, 2025 May 23, 2025 June 20",
  "n71_answer": "2025 April 11, 2025 May 23, 2025 June 20",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-001",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Quorix Forge",
  "n71_answer": "Quorix Forge",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-001",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "build_command",
  "question": "What's the build command?",
  "gold_answer": "quorix forge --target prod",
  "n71_answer": "quorix forge --target prod",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-001",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "project_structure",
  "question": "What's our project structure?",
  "gold_answer": "Uncertain — previously 'single-app with /src and /lib', but framework changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-001",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "test_command",
  "question": "What's the test command?",
  "gold_answer": "Uncertain — previously 'tarvex verify --integration', but test_framework changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-001",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "documentation_url",
  "question": "Where are our docs?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "github.com/aurora-team/docs",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-001",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "meeting_day,slack_channel,standup_format",
  "question": "How does our team coordinate? Tell me everything you know.",
  "gold_answer": "Team meets every Monday 11am, uses #aurora-dev for chat, and standups are async written updates in Slack.",
  "n71_answer": "The team coordinates through async written updates in Slack for standups, uses a volunteer-based weekly signup for on-call rotation, holds team meetings every Monday at 11am, and discusses development and project updates in the #aurora-dev channel.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-001",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "OutOfMemoryError: Java heap space.\nGC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'.\nHeap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z.\nRecommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "n71_answer": "OutOfMemoryError: Java heap space. GC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'. Heap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z. Recommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-002",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "ci_config",
  "question": "What's our CI configuration?",
  "gold_answer": "orinthal-build.yml",
  "n71_answer": "orinthal-build.yml",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-002",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "deploy/Dockerfile.korinth",
  "n71_answer": "deploy/Dockerfile.korinth",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-002",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "stream.velturis.io/aurora",
  "n71_answer": "stream.velturis.io/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-002",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "orinthal publish --tier production",
  "n71_answer": "orinthal publish --tier production",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-002",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "default_branch",
  "question": "What's our default branch?",
  "gold_answer": "develop",
  "n71_answer": "develop",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-002",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-002",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "sprint_deadline",
  "question": "List all sprint deadlines we've had, from earliest to latest.",
  "gold_answer": "2025 April 25, 2025 June 6, 2025 June 20",
  "n71_answer": "2025 April 25, 2025 June 6, 2025 June 20",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-002",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "ci_config",
  "question": "What's our CI configuration?",
  "gold_answer": "korinth-deploy.yaml",
  "n71_answer": "orinthal-build.yml",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-002",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "deploy/Dockerfile.zephyra",
  "n71_answer": "deploy/Dockerfile.korinth",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-002",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "Uncertain — previously 'stream.velturis.io/aurora', but deploy_target changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-002",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "Uncertain — previously 'orinthal publish --tier production', but ci_config changed",
  "n71_answer": "orinthal publish --tier production",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-002",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "default_branch",
  "question": "What's our default branch?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "develop",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-002",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "design_system,test_coverage_target,code_review_policy",
  "question": "What are our code quality standards? List everything you know.",
  "gold_answer": "We use Synthari Elements v1.2, target 90% for core modules test coverage, and require lead approval for core/ for reviews.",
  "n71_answer": "- Code review policy: lead approval for the core/ directory  \n- Test coverage target for core modules: 90%",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-002",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-003",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "drain.zephyra.io/proj-42",
  "n71_answer": "drain.zephyra.io/proj-42",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-003",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "thandrel ship --channel beta",
  "n71_answer": "thandrel ship --channel beta",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-003",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "staging_url",
  "question": "What's our staging URL?",
  "gold_answer": "sandbox.thandrel.io/aurora",
  "n71_answer": "sandbox.thandrel.io/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-003",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "infra/zephyra/Dockerfile",
  "n71_answer": "infra/zephyra/Dockerfile",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-003",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "team_lead",
  "question": "Who's the team lead?",
  "gold_answer": "Sarah Chen",
  "n71_answer": "Sarah Chen",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-003",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-003",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Karnex Pack, Quorix Hub, Synthari Modules",
  "n71_answer": "Karnex Pack, Quorix Hub, Synthari Modules",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-003",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "sink.orinthal.dev/aurora",
  "n71_answer": "unknown",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-003",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "narvex deploy --env prod",
  "n71_answer": "thandrel ship --channel beta",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-003",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "staging_url",
  "question": "What's our staging URL?",
  "gold_answer": "Uncertain — previously 'sandbox.thandrel.io/aurora', but deploy_target changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-003",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "Uncertain — previously 'infra/zephyra/Dockerfile', but docker_image changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-003",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "team_lead",
  "question": "Who's the team lead?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "I don't have that information.",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-003",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "branch_strategy,default_branch,release_cadence",
  "question": "What do you know about our release process? Tell me everything.",
  "gold_answer": "We use trunk-based with feature flags branching on production, releasing monthly first Monday.",
  "n71_answer": "- The team releases monthly on the first Monday.  \n- The release schedule is considered important for maintaining consistency.  \n- Automating testing and deployment is recommended to streamline the release process.  \n- Creating a checklist for each release that includes tasks like code reviews, merging, and documentation updates is advised to ensure nothing is overlooked.  \n- Holding a brief meeting or sync-up a few days before the release helps the team align on priorities and address last-minute issues.  \n- The default branch for releases is production.  \n- Best practices for managing releases from the production branch include thorough testing before merging, maintaining a clear versioning system, using tags to mark release points, and communicating releases clearly with the team.  \n- The test coverage target is 90% for core modules, achieved through a mix of unit tests, integration tests, and end-to-end tests.  \n- The branching strategy is trunk-based development with feature flags, which helps keep the main branch always deployable.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-003",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-004",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "code_reviewer",
  "question": "Who reviews our code?",
  "gold_answer": "David Lee",
  "n71_answer": "David Lee",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-004",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "weekly_report_recipient",
  "question": "Who receives the weekly report?",
  "gold_answer": "Taehyun Kim",
  "n71_answer": "Taehyun Kim",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-004",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "test_coverage_target",
  "question": "What's our test coverage target?",
  "gold_answer": "90% for core modules",
  "n71_answer": "90%",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-004",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-004",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "sprint_deadline",
  "question": "List all sprint deadlines we've had, from earliest to latest.",
  "gold_answer": "2025 March 28, 2025 April 11, 2025 June 6",
  "n71_answer": "2025 June 6",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-004",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "code_reviewer",
  "question": "Who reviews our code?",
  "gold_answer": "Seoyun Choi",
  "n71_answer": "Seoyun Choi",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-004",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "weekly_report_recipient",
  "question": "Who receives the weekly report?",
  "gold_answer": "Uncertain — previously 'Taehyun Kim', but team_lead changed",
  "n71_answer": "Minjun Lee",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-004",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "test_coverage_target",
  "question": "What's our test coverage target?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "trivial",
  "failure_class": ""
 },
 {
  "episode": "sw-004",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "oncall_rotation,alert_channel,incident_runbook_url",
  "question": "What do you know about our on-call setup? Tell me everything.",
  "gold_answer": "On-call is paired oncall (primary + secondary), alerts go to #aurora-alerts, and runbook is at github.com/aurora-team/runbooks.",
  "n71_answer": "primary and a secondary oncall, paired oncall rotation",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-004",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-005",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "monitoring_url",
  "question": "What's our monitoring URL?",
  "gold_answer": "monitor.narvex.io/proj-aurora",
  "n71_answer": "monitor.narvex.io/proj-aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-005",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "narvex deploy --env prod",
  "n71_answer": "narvex deploy --env prod",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-005",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "staging_url",
  "question": "What's our staging URL?",
  "gold_answer": "dev.orinthal.dev/aurora",
  "n71_answer": "dev.orinthal.dev/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-005",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "infra/zephyra/Dockerfile",
  "n71_answer": "infra/zephyra/Dockerfile",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-005",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "meeting_day",
  "question": "When's our team meeting?",
  "gold_answer": "every Thursday 2pm",
  "n71_answer": "Every Thursday at 2pm",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-005",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-005",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "sprint_deadline",
  "question": "List all sprint deadlines we've had, from earliest to latest.",
  "gold_answer": "2025 April 25, 2025 May 23, 2025 June 6",
  "n71_answer": "2025 May 23, 2025 June 6",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-005",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "monitoring_url",
  "question": "What's our monitoring URL?",
  "gold_answer": "observe.velturis.io/aurora",
  "n71_answer": "monitor.narvex.io/proj-aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-005",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "orinthal publish --tier sandbox",
  "n71_answer": "narvex deploy --env prod",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-005",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "staging_url",
  "question": "What's our staging URL?",
  "gold_answer": "Uncertain — previously 'dev.orinthal.dev/aurora', but deploy_target changed",
  "n71_answer": "dev.orinthal.dev/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-005",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "Uncertain — previously 'infra/zephyra/Dockerfile', but docker_image changed",
  "n71_answer": "infra/zephyra/Dockerfile",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-005",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "meeting_day",
  "question": "When's our team meeting?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "Every Thursday at 2pm",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-005",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "documentation_url,onboarding_guide,changelog_location",
  "question": "Where can I find project documentation? List everything you know.",
  "gold_answer": "Docs are at confluence.aurora.dev/aurora, onboarding guide at wiki.aurora.dev/getting-started, and changelog at GitHub Releases page.",
  "n71_answer": "confluence.aurora.dev/aurora  \nwiki.aurora.dev/getting-started  \nGitHub Releases page",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-005",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-006",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "weekly_report_recipient",
  "question": "Who receives the weekly report?",
  "gold_answer": "James Lee",
  "n71_answer": "James Lee",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-006",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "code_reviewer",
  "question": "Who reviews our code?",
  "gold_answer": "Seoyun Choi",
  "n71_answer": "Seoyun Choi",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-006",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "alert_channel",
  "question": "What's our alert channel?",
  "gold_answer": "#aurora-alerts",
  "n71_answer": "#aurora-alerts",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-006",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-006",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Cryptara Vault, Selvex Secrets, Pyronos Crypt",
  "n71_answer": "Cryptara Vault -> Selvex Secrets -> Pyronos Crypt",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-006",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "weekly_report_recipient",
  "question": "Who receives the weekly report?",
  "gold_answer": "Helen Chen",
  "n71_answer": "James Lee",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-006",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "code_reviewer",
  "question": "Who reviews our code?",
  "gold_answer": "Uncertain — previously 'Seoyun Choi', but team_lead changed",
  "n71_answer": "Seoyun Choi",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-006",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "alert_channel",
  "question": "What's our alert channel?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "#aurora-dev",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-006",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "meeting_day,slack_channel,standup_format",
  "question": "How does our team coordinate? Tell me everything you know.",
  "gold_answer": "Team meets every Friday 9:30am, uses #aurora-dev for chat, and standups are async written updates in Slack.",
  "n71_answer": "Our team coordinates using async written updates in Slack as the standup format, with development discussions in the #aurora-dev channel. Production alerts are sent to #aurora-alerts. Team meetings are scheduled every Friday at 9:30am. The team lead is currently Yuna Park.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-006",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-007",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "orm_library",
  "question": "What ORM do we use?",
  "gold_answer": "Crysthene ORM",
  "n71_answer": "Crysthene ORM",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-007",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Crysthene.define('Model', {fields})",
  "n71_answer": "Crysthene.define('Model', {fields})",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-007",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "connection_string",
  "question": "What's the database connection string?",
  "gold_answer": "zyndra://base-main:9042/aurora",
  "n71_answer": "zyndra://base-main:9042/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-007",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "stream.velturis.io/aurora",
  "n71_answer": "stream.velturis.io/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-007",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-007",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Cryptara Vault, Verisigil Lock, Sigilix Manager",
  "n71_answer": "Cryptara Vault, Verisigil Lock",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-007",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "orm_library",
  "question": "What ORM do we use?",
  "gold_answer": "Thalnex Query",
  "n71_answer": "Crysthene ORM",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-007",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Thalnex.document('Model', {fields})",
  "n71_answer": "Thalnex.document('Model', {fields})",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-007",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "connection_string",
  "question": "What's the database connection string?",
  "gold_answer": "Uncertain — previously 'zyndra://base-main:9042/aurora', but database changed",
  "n71_answer": "zyndra://base-main:9042/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-007",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "stream.velturis.io/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-007",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "oncall_rotation,alert_channel,incident_runbook_url",
  "question": "What do you know about our on-call setup? Tell me everything.",
  "gold_answer": "On-call is weekly Mon-Sun rotation, alerts go to #aurora-oncall-alerts, and runbook is at runbooks.aurora.internal/playbooks.",
  "n71_answer": "weekly Mon-Sun rotation",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-007",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-008",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "token_format",
  "question": "What token format do we use?",
  "gold_answer": "Verithos-JWT-v2",
  "n71_answer": "Verithos-JWT-v2",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-008",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "verithos-oauth-middleware v2.1",
  "n71_answer": "verithos-oauth-middleware v2.1",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-008",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "login_endpoint",
  "question": "What's the login endpoint?",
  "gold_answer": "/auth/kryptal/signin",
  "n71_answer": "/\tauth/kryptal/signin",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-008",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "monitoring_url",
  "question": "What's our monitoring URL?",
  "gold_answer": "watch.korinth.dev/aurora",
  "n71_answer": "watch.korinth.dev/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-008",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "ConnectionRefusedError: [Errno 111] Connection refused.\nService crysthene-db-primary on port 5432 is not accepting connections.\nLast successful heartbeat was at 2024-03-15T08:23:41Z.\nCheck firewall rules and verify that max_connections has not been exceeded.",
  "n71_answer": "ConnectionRefusedError: [Errno 111] Connection refused. Service crysthene-db-primary on port 5432 is not accepting connections. Last successful heartbeat was at 2024-03-15T08:23:41Z. Check firewall rules and verify that max_connections has not been exceeded.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-008",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "sprint_deadline",
  "question": "List all sprint deadlines we've had, from earliest to latest.",
  "gold_answer": "2025 April 25, 2025 May 9, 2025 June 20",
  "n71_answer": "2025 April 25, 2025 May 9, 2025 June 20",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-008",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "token_format",
  "question": "What token format do we use?",
  "gold_answer": "Oathkeeper-mTLS-cert",
  "n71_answer": "Oathkeeper-mTLS-cert",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-008",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "phalanx-key-guard v1.3",
  "n71_answer": "phalanx-key-guard v1.3",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-008",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "login_endpoint",
  "question": "What's the login endpoint?",
  "gold_answer": "Uncertain — previously '/auth/kryptal/signin', but auth_provider changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-008",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "monitoring_url",
  "question": "What's our monitoring URL?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "watch.korinth.dev/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-008",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "branch_strategy,default_branch,release_cadence",
  "question": "What do you know about our release process? Tell me everything.",
  "gold_answer": "We use GitFlow with release branches branching on release, releasing sprint-end release every 2 weeks.",
  "n71_answer": "- The default branch is release.  \n- The team uses GitFlow with release branches branching.  \n- The team releases a sprint-end release every 2 weeks.  \n- The sprint release cadence is every 2 weeks.  \n- The sprint deadline has been updated over time, with the latest deadline set for 2025 June 20.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-008",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "ConnectionRefusedError: [Errno 111] Connection refused.\nService crysthene-db-primary on port 5432 is not accepting connections.\nLast successful heartbeat was at 2024-03-15T08:23:41Z.\nCheck firewall rules and verify that max_connections has not been exceeded.",
  "n71_answer": "ConnectionRefusedError: [Errno 111] Connection refused. Service crysthene-db-primary on port 5432 is not accepting connections. Last successful heartbeat was at 2024-03-15T08:23:41Z. Check firewall rules and verify that max_connections has not been exceeded.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-009",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "ci_config",
  "question": "What's our CI configuration?",
  "gold_answer": "korinth-deploy.yaml",
  "n71_answer": "korinth-deploy.yaml",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-009",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "korinth release --canary",
  "n71_answer": "korinth release --canary",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-009",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "docker_image",
  "question": "What Docker image do we use?",
  "gold_answer": "velturis-cr/aurora:main",
  "n71_answer": "velturis-cr/aurora:main",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-009",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "deploy/Dockerfile.velturis",
  "n71_answer": "deploy/Dockerfile.velturis",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-009",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "changelog_location",
  "question": "Where's the changelog?",
  "gold_answer": "CHANGELOG.md in repo root",
  "n71_answer": "CHANGELOG.md in the repo root",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-009",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "ConnectionRefusedError: [Errno 111] Connection refused.\nService crysthene-db-primary on port 5432 is not accepting connections.\nLast successful heartbeat was at 2024-03-15T08:23:41Z.\nCheck firewall rules and verify that max_connections has not been exceeded.",
  "n71_answer": "ConnectionRefusedError: [Errno 111] Connection refused. Service crysthene-db-primary on port 5432 is not accepting connections. Last successful heartbeat was at 2024-03-15T08:23:41Z. Check firewall rules and verify that max_connections has not been exceeded.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-009",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Karnex Pack, Pyronis Bundle, Quorix Hub",
  "n71_answer": "Karnex Pack, Pyronis Bundle, Quorix Hub",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-009",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "ci_config",
  "question": "What's our CI configuration?",
  "gold_answer": "zephyra-pipeline.json",
  "n71_answer": "korinth-deploy.yaml",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-009",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "zephyra push --target preview",
  "n71_answer": "korinth release --canary",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-009",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "docker_image",
  "question": "What Docker image do we use?",
  "gold_answer": "Uncertain — previously 'velturis-cr/aurora:main', but deploy_target changed",
  "n71_answer": "velturis-cr/aurora:main",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-009",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "Uncertain — previously 'deploy/Dockerfile.velturis', but docker_image changed",
  "n71_answer": "deploy/Dockerfile.velturis",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-009",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "changelog_location",
  "question": "Where's the changelog?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "CHANGELOG.md in the repo root",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-009",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "design_system,test_coverage_target,code_review_policy",
  "question": "What are our code quality standards? List everything you know.",
  "gold_answer": "We use Quenthar Design v1, target no strict target, PR-reviewed test coverage, and require 2 approvals + security review for auth/ for reviews.",
  "n71_answer": "- Code review policy: 2 approvals + security review for auth/  \n- Test coverage target: no strict target; it's PR-reviewed",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-009",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "ConnectionRefusedError: [Errno 111] Connection refused.\nService crysthene-db-primary on port 5432 is not accepting connections.\nLast successful heartbeat was at 2024-03-15T08:23:41Z.\nCheck firewall rules and verify that max_connections has not been exceeded.",
  "n71_answer": "ConnectionRefusedError: [Errno 111] Connection refused. Service crysthene-db-primary on port 5432 is not accepting connections. Last successful heartbeat was at 2024-03-15T08:23:41Z. Check firewall rules and verify that max_connections has not been exceeded.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-010",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "ci_config",
  "question": "What's our CI configuration?",
  "gold_answer": "zephyra-pipeline.json",
  "n71_answer": "zephyra-pipeline.json",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-010",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "zephyra push --target preview",
  "n71_answer": "zephyra push --target preview",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-010",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "docker_image",
  "question": "What Docker image do we use?",
  "gold_answer": "zephyra-hub/aurora:prod",
  "n71_answer": "zephyra-hub/aurora:prod",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-010",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "deploy/Dockerfile.zephyra",
  "n71_answer": "deploy/Dockerfile.zephyra",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-010",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "logs.narvex.io/aurora/drain",
  "n71_answer": "logs.narvex.io/aurora/drain",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-010",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-010",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Verisigil Lock, Kerberos Trove, Orinthal Sealed",
  "n71_answer": "Verisigil Lock, Kerberos Trove, Orinthal Sealed",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-010",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "ci_config",
  "question": "What's our CI configuration?",
  "gold_answer": "thandrel-flow.toml",
  "n71_answer": "zephyra-pipeline.json",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-010",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "thandrel ship --channel beta",
  "n71_answer": "zephyra push --target preview",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-010",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "docker_image",
  "question": "What Docker image do we use?",
  "gold_answer": "Uncertain — previously 'zephyra-hub/aurora:prod', but deploy_target changed",
  "n71_answer": "zephyra-hub/aurora:prod",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-010",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "Uncertain — previously 'deploy/Dockerfile.zephyra', but docker_image changed",
  "n71_answer": "deploy/Dockerfile.zephyra",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-010",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "logs.narvex.io/aurora/drain",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-010",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "branch_strategy,default_branch,release_cadence",
  "question": "What do you know about our release process? Tell me everything.",
  "gold_answer": "We use GitHub Flow with PR reviews branching on main, releasing monthly first Monday.",
  "n71_answer": "monthly on the first Monday",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-010",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-011",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "dev_server_command",
  "question": "How do I run the dev server?",
  "gold_answer": "korvel dev --open",
  "n71_answer": "korvel dev --open",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-011",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "test_command",
  "question": "What's the test command?",
  "gold_answer": "karnex test --watch",
  "n71_answer": "karnex test --watch",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-011",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Karnex Bundler",
  "n71_answer": "Karnex Bundler",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-011",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "build_command",
  "question": "What's the build command?",
  "gold_answer": "karnex build --analyze",
  "n71_answer": "karnex build --analyze",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-011",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "Session Cookie + CSRF",
  "n71_answer": "Session Cookie + CSRF",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-011",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-011",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Karnex Pack, Dranith Stack, Synthari Modules",
  "n71_answer": "Karnex Pack -> Dranith Stack -> Synthari Modules",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-011",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "dev_server_command",
  "question": "How do I run the dev server?",
  "gold_answer": "moxira serve --hot",
  "n71_answer": "korvel dev --open",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-011",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "test_command",
  "question": "What's the test command?",
  "gold_answer": "quorix spec --run smoke",
  "n71_answer": "karnex test --watch",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-011",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Uncertain — previously 'Karnex Bundler', but framework changed",
  "n71_answer": "Karnex Bundler",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-011",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "build_command",
  "question": "What's the build command?",
  "gold_answer": "Uncertain — previously 'karnex build --analyze', but build_tool changed",
  "n71_answer": "karnex build --analyze",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-011",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "Session Cookie + CSRF",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-011",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "design_system,test_coverage_target,code_review_policy",
  "question": "What are our code quality standards? List everything you know.",
  "gold_answer": "We use Quenthar Design v1, target no strict target, PR-reviewed test coverage, and require lead approval for core/ for reviews.",
  "n71_answer": "- code_review_policy: lead approval for the core/ directory  \n- test_coverage_target: no strict target; PR-reviewed",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-011",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-012",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "orm_library",
  "question": "What ORM do we use?",
  "gold_answer": "Pyravar Mapper",
  "n71_answer": "Pyravar Mapper",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-012",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Pyravar.entity('Model', attrs)",
  "n71_answer": "Pyravar.entity('Model', attrs)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-012",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "migration_tool",
  "question": "What migration tool do we use?",
  "gold_answer": "Zyndra Alter",
  "n71_answer": "Zyndra Alter",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-012",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "token_format",
  "question": "What token format do we use?",
  "gold_answer": "Cerberix-SAML-assertion",
  "n71_answer": "Cerberix-SAML-assertion",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-012",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-012",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Cryptara Vault, Selvex Secrets, Pyronos Crypt",
  "n71_answer": "Cryptara Vault, Selvex Secrets, Pyronos Crypt",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-012",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "orm_library",
  "question": "What ORM do we use?",
  "gold_answer": "Vorathis ODM",
  "n71_answer": "Vorathis ODM",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-012",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Vorathis.collection('Model', spec)",
  "n71_answer": "Vorathis.collection('Model', spec)",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-012",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "migration_tool",
  "question": "What migration tool do we use?",
  "gold_answer": "Uncertain — previously 'Zyndra Alter', but database changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-012",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "token_format",
  "question": "What token format do we use?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-012",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "documentation_url,onboarding_guide,changelog_location",
  "question": "Where can I find project documentation? List everything you know.",
  "gold_answer": "Docs are at aurora-docs.readthedocs.io, onboarding guide at notion.so/aurora-team/new-hire, and changelog at GitHub Releases page.",
  "n71_answer": "aurora-docs.readthedocs.io",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-012",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-013",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "user_session_ttl",
  "question": "What's the session TTL?",
  "gold_answer": "15 minutes",
  "n71_answer": "15 minutes",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-013",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "verithos-oauth-middleware v2.1",
  "n71_answer": "verithos-oauth-middleware v2.1",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-013",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "SAML 2.0 SSO",
  "n71_answer": "SAML 2.0 SSO",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-013",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "package_manager",
  "question": "What package manager do we use?",
  "gold_answer": "Dranith Stack",
  "n71_answer": "Dranith Stack",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-013",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-013",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "sprint_deadline",
  "question": "List all sprint deadlines we've had, from earliest to latest.",
  "gold_answer": "2025 May 23, 2025 June 20, 2025 July 4",
  "n71_answer": "2025 May 23, 2025 June 20, 2025 July 4",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-013",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "user_session_ttl",
  "question": "What's the session TTL?",
  "gold_answer": "4 hours",
  "n71_answer": "4 hours",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-013",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "oathkeeper-cert-check v3.2",
  "n71_answer": "oathkeeper-cert-check v3.2",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-013",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "Uncertain — previously 'SAML 2.0 SSO', but auth_provider changed",
  "n71_answer": "mTLS Client Cert",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-013",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "package_manager",
  "question": "What package manager do we use?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-013",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "oncall_rotation,alert_channel,incident_runbook_url",
  "question": "What do you know about our on-call setup? Tell me everything.",
  "gold_answer": "On-call is paired oncall (primary + secondary), alerts go to #aurora-oncall-alerts, and runbook is at wiki.aurora.dev/incident-response.",
  "n71_answer": "paired oncall (primary + secondary)",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-013",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-014",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "escalation_contact",
  "question": "Who's the escalation contact?",
  "gold_answer": "Kevin Park",
  "n71_answer": "Kevin Park",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-014",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "approval_authority",
  "question": "Who has approval authority?",
  "gold_answer": "CTO Soojung Lim",
  "n71_answer": "Soojung Lim",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-014",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "thandrel ship --channel beta",
  "n71_answer": "thandrel ship --channel beta",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-014",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-014",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Cryptara Vault, Selvex Secrets, Kerberos Trove",
  "n71_answer": "Cryptara Vault, Selvex Secrets",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-014",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "escalation_contact",
  "question": "Who's the escalation contact?",
  "gold_answer": "Rachel Yoo",
  "n71_answer": "Rachel Yoo",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-014",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "approval_authority",
  "question": "Who has approval authority?",
  "gold_answer": "Uncertain — previously 'CTO Soojung Lim', but team_lead changed",
  "n71_answer": "Yuna Park",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-014",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none — explicitly removed",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-014",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "design_system,test_coverage_target,code_review_policy",
  "question": "What are our code quality standards? List everything you know.",
  "gold_answer": "We use Dravex Blocks v2.4, target no strict target, PR-reviewed test coverage, and require 2 approvals required for reviews.",
  "n71_answer": "- no strict test coverage target  \n- everything is PR-reviewed",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-014",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-015",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "monitoring_url",
  "question": "What's our monitoring URL?",
  "gold_answer": "status.orinthal.dev/aurora",
  "n71_answer": "status.orinthal.dev/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-015",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "narvex deploy --env prod",
  "n71_answer": "I don't have that information.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-015",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "staging_url",
  "question": "What's our staging URL?",
  "gold_answer": "preview.zephyra.io/aurora",
  "n71_answer": "I don't have that information.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-015",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "deploy/Dockerfile.orinthal",
  "n71_answer": "I don't have that information.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-015",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Pyronis Build",
  "n71_answer": "Pyronis Build",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-015",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-015",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Tarvex Mod, Zelthra Lib, Dranith Stack",
  "n71_answer": "Tarvex Mod -> Zelthra Lib -> Dranith Stack",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-015",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "monitoring_url",
  "question": "What's our monitoring URL?",
  "gold_answer": "metrics.thandrel.io/aurora-prod",
  "n71_answer": "status.orinthal.dev/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-015",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "thandrel ship --channel beta",
  "n71_answer": "I don't have that information.",
  "judge_pass": false,
  "pass_type": "never_knew",
  "failure_class": ""
 },
 {
  "episode": "sw-015",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "staging_url",
  "question": "What's our staging URL?",
  "gold_answer": "Uncertain — previously 'preview.zephyra.io/aurora', but deploy_target changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "trivial",
  "failure_class": ""
 },
 {
  "episode": "sw-015",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "Uncertain — previously 'deploy/Dockerfile.orinthal', but docker_image changed",
  "n71_answer": "deploy/Dockerfile.orinthal",
  "judge_pass": false,
  "pass_type": "never_knew",
  "failure_class": ""
 },
 {
  "episode": "sw-015",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "Pyronis Build",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-015",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "oncall_rotation,alert_channel,incident_runbook_url",
  "question": "What do you know about our on-call setup? Tell me everything.",
  "gold_answer": "On-call is follow-the-sun across timezones, alerts go to #aurora-pager, and runbook is at wiki.aurora.dev/incident-response.",
  "n71_answer": "follow-the-sun across timezones",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-015",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-016",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "dev_server_command",
  "question": "How do I run the dev server?",
  "gold_answer": "korvel dev --open",
  "n71_answer": "korvel dev --open",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-016",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "build_command",
  "question": "What's the build command?",
  "gold_answer": "pyronis pack --optimize",
  "n71_answer": "pyronis pack --optimize",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-016",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Karnex Bundler",
  "n71_answer": "Karnex Bundler",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-016",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "test_command",
  "question": "What's the test command?",
  "gold_answer": "pyronis check --snapshot",
  "n71_answer": "pyronis check --snapshot",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-016",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "logs.narvex.io/aurora/drain",
  "n71_answer": "logs.narvex.io/aurora/drain",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-016",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "OutOfMemoryError: Java heap space.\nGC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'.\nHeap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z.\nRecommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "n71_answer": "OutOfMemoryError: Java heap space. GC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'. Heap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z. Recommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-016",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Zelthra Lib, Dranith Stack, Veltrion Pkg",
  "n71_answer": "Dranith Stack -> Veltrion Pkg -> Zelthra Lib",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-016",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "dev_server_command",
  "question": "How do I run the dev server?",
  "gold_answer": "veltrion dev --port 3000",
  "n71_answer": "veltrion dev --port 3000",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-016",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "build_command",
  "question": "What's the build command?",
  "gold_answer": "tarvex compile --debug",
  "n71_answer": "tarvex compile --debug",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-016",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Uncertain — previously 'Karnex Bundler', but framework changed",
  "n71_answer": "Tarvex CLI",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-016",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "test_command",
  "question": "What's the test command?",
  "gold_answer": "Uncertain — previously 'pyronis check --snapshot', but test_framework changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-016",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "logs.narvex.io/aurora/drain",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-016",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "branch_strategy,default_branch,release_cadence",
  "question": "What do you know about our release process? Tell me everything.",
  "gold_answer": "We use GitFlow with release branches branching on main, releasing sprint-end release every 2 weeks.",
  "n71_answer": "bi-weekly sprint-end releases",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-016",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "OutOfMemoryError: Java heap space.\nGC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'.\nHeap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z.\nRecommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "n71_answer": "OutOfMemoryError: Java heap space. GC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'. Heap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z. Recommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-017",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "API Key + HMAC",
  "n71_answer": "API Key + HMAC",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-017",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "phalanx-key-guard v1.3",
  "n71_answer": "phalanx-key-guard v1.3",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-017",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "login_endpoint",
  "question": "What's the login endpoint?",
  "gold_answer": "/zenthos/id/authenticate",
  "n71_answer": "/zenthos/id/authenticate",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-017",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "framework",
  "question": "What framework are we using?",
  "gold_answer": "Moxira",
  "n71_answer": "ReactNative",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-017",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "ConnectionRefusedError: [Errno 111] Connection refused.\nService crysthene-db-primary on port 5432 is not accepting connections.\nLast successful heartbeat was at 2024-03-15T08:23:41Z.\nCheck firewall rules and verify that max_connections has not been exceeded.",
  "n71_answer": "ConnectionRefusedError: [Errno 111] Connection refused. Service crysthene-db-primary on port 5432 is not accepting connections. Last successful heartbeat was at 2024-03-15T08:23:41Z. Check firewall rules and verify that max_connections has not been exceeded.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-017",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "sprint_deadline",
  "question": "List all sprint deadlines we've had, from earliest to latest.",
  "gold_answer": "2025 April 11, 2025 May 9, 2025 May 23",
  "n71_answer": "2025 April 11, 2025 May 9, 2025 May 23",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-017",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "SAML 2.0 SSO",
  "n71_answer": "API Key + HMAC",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-017",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "cerberix-sso-bridge v1.9",
  "n71_answer": "phalanx-key-guard v1.3",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-017",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "login_endpoint",
  "question": "What's the login endpoint?",
  "gold_answer": "Uncertain — previously '/zenthos/id/authenticate', but auth_provider changed",
  "n71_answer": "/zenthos/id/authenticate",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-017",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "framework",
  "question": "What framework are we using?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "trivial",
  "failure_class": ""
 },
 {
  "episode": "sw-017",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "oncall_rotation,alert_channel,incident_runbook_url",
  "question": "What do you know about our on-call setup? Tell me everything.",
  "gold_answer": "On-call is paired oncall (primary + secondary), alerts go to #vortex-alerts, and runbook is at github.com/aurora-team/runbooks.",
  "n71_answer": "paired oncall with a primary and a secondary oncall for each shift",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-017",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "ConnectionRefusedError: [Errno 111] Connection refused.\nService crysthene-db-primary on port 5432 is not accepting connections.\nLast successful heartbeat was at 2024-03-15T08:23:41Z.\nCheck firewall rules and verify that max_connections has not been exceeded.",
  "n71_answer": "ConnectionRefusedError: [Errno 111] Connection refused. Service crysthene-db-primary on port 5432 is not accepting connections. Last successful heartbeat was at 2024-03-15T08:23:41Z. Check firewall rules and verify that max_connections has not been exceeded.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-018",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "connection_string",
  "question": "What's the database connection string?",
  "gold_answer": "crysthene://replica-db:5432/aurora-ro",
  "n71_answer": "crysthene://replica-db:5432/aurora-ro",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-018",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Keldaris.object('Model', props)",
  "n71_answer": "Keldaris.object('Model', props)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-018",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "backup_schedule",
  "question": "What's our backup schedule?",
  "gold_answer": "twice daily at 2am and 2pm UTC",
  "n71_answer": "twice daily at 2am and 2pm UTC",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-018",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "release_cadence",
  "question": "How often do we release?",
  "gold_answer": "biweekly Friday afternoon",
  "n71_answer": "biweekly on Friday afternoons",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-018",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "OutOfMemoryError: Java heap space.\nGC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'.\nHeap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z.\nRecommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "n71_answer": "OutOfMemoryError: Java heap space. GC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'. Heap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z. Recommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-018",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Zelthra Lib, Quorix Hub, Dranith Stack",
  "n71_answer": "Zelthra Lib, Quorix Hub, Dranith Stack",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-018",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "connection_string",
  "question": "What's the database connection string?",
  "gold_answer": "vorathis://cluster-main:27017/aurora",
  "n71_answer": "vorathis://cluster-main:27017/aurora",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-018",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Pyravar.record('Model', schema)",
  "n71_answer": "Pyravar.record('Model', schema)",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-018",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "backup_schedule",
  "question": "What's our backup schedule?",
  "gold_answer": "Uncertain — previously 'twice daily at 2am and 2pm UTC', but database changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-018",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "release_cadence",
  "question": "How often do we release?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none — explicitly removed",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-018",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "oncall_rotation,alert_channel,incident_runbook_url",
  "question": "What do you know about our on-call setup? Tell me everything.",
  "gold_answer": "On-call is volunteer-based with weekly signup, alerts go to #aurora-pager, and runbook is at wiki.aurora.dev/incident-response.",
  "n71_answer": "volunteer-based with weekly signup",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-018",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "OutOfMemoryError: Java heap space.\nGC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'.\nHeap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z.\nRecommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "n71_answer": "OutOfMemoryError: Java heap space. GC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'. Heap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z. Recommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-019",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "drain.zephyra.io/proj-42",
  "n71_answer": "drain.zephyra.io/proj-42",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-019",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "korinth release --prod",
  "n71_answer": "korinth release --prod",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-019",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "monitoring_url",
  "question": "What's our monitoring URL?",
  "gold_answer": "observe.velturis.io/aurora",
  "n71_answer": "observe.velturis.io/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-019",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "infra/korinth/Dockerfile",
  "n71_answer": "infra/korinth/Dockerfile",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-019",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "ci_config",
  "question": "What's our CI configuration?",
  "gold_answer": "orinthal-build.yml",
  "n71_answer": "orinthal-build.yml",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-019",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23.\nStack trace shows recursive call in middleware_chain to request_validator to token_parser.\nRestart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "n71_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23. Stack trace shows recursive call in middleware_chain to request_validator to token_parser. Restart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-019",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Karnex Pack, Veltrion Pkg, Synthari Modules",
  "n71_answer": "Karnex Pack",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-019",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "logs.korinth.dev/aurora",
  "n71_answer": "drain.zephyra.io/proj-42",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-019",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "zephyra push --target prod",
  "n71_answer": "korinth release --prod",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-019",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "monitoring_url",
  "question": "What's our monitoring URL?",
  "gold_answer": "Uncertain — previously 'observe.velturis.io/aurora', but deploy_target changed",
  "n71_answer": "observe.velturis.io/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-019",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "Uncertain — previously 'infra/korinth/Dockerfile', but docker_image changed",
  "n71_answer": "infra/korinth/Dockerfile",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-019",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "ci_config",
  "question": "What's our CI configuration?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "orinthal-build.yml",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-019",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "oncall_rotation,alert_channel,incident_runbook_url",
  "question": "What do you know about our on-call setup? Tell me everything.",
  "gold_answer": "On-call is volunteer-based with weekly signup, alerts go to #aurora-pager, and runbook is at wiki.aurora.dev/incident-response.",
  "n71_answer": "volunteer-based with weekly signup",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-019",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23.\nStack trace shows recursive call in middleware_chain to request_validator to token_parser.\nRestart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "n71_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23. Stack trace shows recursive call in middleware_chain to request_validator to token_parser. Restart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-020",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "monitoring_url",
  "question": "What's our monitoring URL?",
  "gold_answer": "watch.korinth.dev/aurora",
  "n71_answer": "watch.korinth.dev/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-020",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "deploy/Dockerfile.thandrel",
  "n71_answer": "deploy/Dockerfile.thandrel",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-020",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "docker_image",
  "question": "What Docker image do we use?",
  "gold_answer": "zephyra-hub/aurora:prod",
  "n71_answer": "zephyra-hub/aurora:prod",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-020",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "thandrel ship --channel stable",
  "n71_answer": "thandrel ship --channel stable",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-020",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Thalnex.document('Model', {fields})",
  "n71_answer": "Thalnex.document('Model', {fields})",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-020",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-020",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "sprint_deadline",
  "question": "List all sprint deadlines we've had, from earliest to latest.",
  "gold_answer": "2025 April 11, 2025 June 20, 2025 July 4",
  "n71_answer": "2025 April 11, 2025 June 20, 2025 July 4",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-020",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "monitoring_url",
  "question": "What's our monitoring URL?",
  "gold_answer": "observe.velturis.io/aurora",
  "n71_answer": "watch.korinth.dev/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-020",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "deploy/Dockerfile.korinth",
  "n71_answer": "deploy/Dockerfile.thandrel",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-020",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "docker_image",
  "question": "What Docker image do we use?",
  "gold_answer": "Uncertain — previously 'zephyra-hub/aurora:prod', but deploy_target changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-020",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "Uncertain — previously 'thandrel ship --channel stable', but ci_config changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-020",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "Thalnex",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-020",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "documentation_url,onboarding_guide,changelog_location",
  "question": "Where can I find project documentation? List everything you know.",
  "gold_answer": "Docs are at notion.so/aurora-team/docs, onboarding guide at ONBOARDING.md in repo root, and changelog at CHANGELOG.md in repo root.",
  "n71_answer": "notion.so/aurora-team/docs",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-020",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-021",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "escalation_contact",
  "question": "Who's the escalation contact?",
  "gold_answer": "Rachel Yoo",
  "n71_answer": "Rachel Yoo",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-021",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "weekly_report_recipient",
  "question": "Who receives the weekly report?",
  "gold_answer": "Taehyun Kim",
  "n71_answer": "Taehyun Kim",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-021",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "incident_runbook_url",
  "question": "Where's the incident runbook?",
  "gold_answer": "runbooks.aurora.internal/playbooks",
  "n71_answer": "runbooks.aurora.internal/playbooks",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-021",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23.\nStack trace shows recursive call in middleware_chain to request_validator to token_parser.\nRestart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "n71_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23. Stack trace shows recursive call in middleware_chain to request_validator to token_parser. Restart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-021",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Quorix Hub, Dranith Stack, Synthari Modules",
  "n71_answer": "Dranith Stack",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-021",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "escalation_contact",
  "question": "Who's the escalation contact?",
  "gold_answer": "Seungmin Oh",
  "n71_answer": "Seungmin Oh",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-021",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "weekly_report_recipient",
  "question": "Who receives the weekly report?",
  "gold_answer": "Uncertain — previously 'Taehyun Kim', but team_lead changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-021",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "incident_runbook_url",
  "question": "Where's the incident runbook?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none — explicitly removed",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-021",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "documentation_url,onboarding_guide,changelog_location",
  "question": "Where can I find project documentation? List everything you know.",
  "gold_answer": "Docs are at confluence.aurora.dev/aurora, onboarding guide at github.com/aurora-team/onboarding, and changelog at docs.aurora.internal/changelog.",
  "n71_answer": "confluence.aurora.dev/aurora  \ndocs.aurora.internal/changelog",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-021",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23.\nStack trace shows recursive call in middleware_chain to request_validator to token_parser.\nRestart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "n71_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23. Stack trace shows recursive call in middleware_chain to request_validator to token_parser. Restart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-022",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "user_session_ttl",
  "question": "What's the session TTL?",
  "gold_answer": "4 hours",
  "n71_answer": "4 hours",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-022",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "verithos-oauth-middleware v2.1",
  "n71_answer": "verithos-oauth-middleware v2.1",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-022",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "token_format",
  "question": "What token format do we use?",
  "gold_answer": "Cerberix-SAML-assertion",
  "n71_answer": "Cerberix-SAML-assertion",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-022",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "release_cadence",
  "question": "How often do we release?",
  "gold_answer": "continuous deploy on merge",
  "n71_answer": "Continuously on merge",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-022",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "ConnectionRefusedError: [Errno 111] Connection refused.\nService crysthene-db-primary on port 5432 is not accepting connections.\nLast successful heartbeat was at 2024-03-15T08:23:41Z.\nCheck firewall rules and verify that max_connections has not been exceeded.",
  "n71_answer": "ConnectionRefusedError: [Errno 111] Connection refused. Service crysthene-db-primary on port 5432 is not accepting connections. Last successful heartbeat was at 2024-03-15T08:23:41Z. Check firewall rules and verify that max_connections has not been exceeded.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-022",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "sprint_deadline",
  "question": "List all sprint deadlines we've had, from earliest to latest.",
  "gold_answer": "2025 April 25, 2025 May 9, 2025 July 4",
  "n71_answer": "2025 April 25, 2025 May 9, 2025 July 4",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-022",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "user_session_ttl",
  "question": "What's the session TTL?",
  "gold_answer": "1 hour",
  "n71_answer": "1 hour",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-022",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "kryptal-bearer-check v2.2",
  "n71_answer": "kryptal-bearer-check v2.2",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-022",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "token_format",
  "question": "What token format do we use?",
  "gold_answer": "Uncertain — previously 'Cerberix-SAML-assertion', but auth_provider changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-022",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "release_cadence",
  "question": "How often do we release?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-022",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "oncall_rotation,alert_channel,incident_runbook_url",
  "question": "What do you know about our on-call setup? Tell me everything.",
  "gold_answer": "On-call is follow-the-sun across timezones, alerts go to #aurora-pager, and runbook is at runbooks.aurora.internal/playbooks.",
  "n71_answer": "follow-the-sun across timezones",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-022",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "ConnectionRefusedError: [Errno 111] Connection refused.\nService crysthene-db-primary on port 5432 is not accepting connections.\nLast successful heartbeat was at 2024-03-15T08:23:41Z.\nCheck firewall rules and verify that max_connections has not been exceeded.",
  "n71_answer": "ConnectionRefusedError: [Errno 111] Connection refused. Service crysthene-db-primary on port 5432 is not accepting connections. Last successful heartbeat was at 2024-03-15T08:23:41Z. Check firewall rules and verify that max_connections has not been exceeded. (current)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-023",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "approval_authority",
  "question": "Who has approval authority?",
  "gold_answer": "VP Helen Chen",
  "n71_answer": "VP Helen Chen",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-023",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "code_reviewer",
  "question": "Who reviews our code?",
  "gold_answer": "Seoyun Choi",
  "n71_answer": "Seoyun Choi",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-023",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "database",
  "question": "What database are we using?",
  "gold_answer": "Thalnex Store",
  "n71_answer": "Thalnex Store",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-023",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-023",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Pyronis Bundle, Zelthra Lib, Synthari Modules",
  "n71_answer": "Pyronis Bundle -> Zelthra Lib -> Synthari Modules",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-023",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "approval_authority",
  "question": "Who has approval authority?",
  "gold_answer": "VP Taehyun Kim",
  "n71_answer": "VP Helen Chen",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-023",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "code_reviewer",
  "question": "Who reviews our code?",
  "gold_answer": "Uncertain — previously 'Seoyun Choi', but team_lead changed",
  "n71_answer": "Seoyun Choi",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-023",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "database",
  "question": "What database are we using?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-023",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "oncall_rotation,alert_channel,incident_runbook_url",
  "question": "What do you know about our on-call setup? Tell me everything.",
  "gold_answer": "On-call is paired oncall (primary + secondary), alerts go to #aurora-alerts, and runbook is at runbooks.aurora.internal/playbooks.",
  "n71_answer": "The team's on-call rotation is paired oncall (primary + secondary), providing backup for the primary in case things get hectic.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-023",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "I don't have that information.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-024",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "mTLS Client Cert",
  "n71_answer": "mTLS Client Cert",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-024",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "oathkeeper-mtls-proxy v4.1",
  "n71_answer": "oathkeeper-mtls-proxy v4.1",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-024",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "user_session_ttl",
  "question": "What's the session TTL?",
  "gold_answer": "4 hours",
  "n71_answer": "4 hours",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-024",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "oncall_rotation",
  "question": "What's our on-call rotation?",
  "gold_answer": "follow-the-sun across timezones",
  "n71_answer": "follow-the-sun across timezones",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-024",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-024",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Karnex Pack, Quorix Hub, Veltrion Pkg",
  "n71_answer": "Karnex Pack, Quorix Hub, Veltrion Pkg",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-024",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "JWT Bearer",
  "n71_answer": "mTLS Client Cert",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-024",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "kryptal-bearer-check v2.2",
  "n71_answer": "oathkeeper-mtls-proxy v4.1",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-024",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "user_session_ttl",
  "question": "What's the session TTL?",
  "gold_answer": "Uncertain — previously '4 hours', but auth_provider changed",
  "n71_answer": "4 hours",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-024",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "oncall_rotation",
  "question": "What's our on-call rotation?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "follow-the-sun across timezones",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-024",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "branch_strategy,default_branch,release_cadence",
  "question": "What do you know about our release process? Tell me everything.",
  "gold_answer": "We use GitHub Flow with PR reviews branching on trunk, releasing sprint-end release every 2 weeks.",
  "n71_answer": "Sprint-end releases every 2 weeks",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-024",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-025",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "escalation_contact",
  "question": "Who's the escalation contact?",
  "gold_answer": "Kevin Park",
  "n71_answer": "Kevin Park",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-025",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "code_reviewer",
  "question": "Who reviews our code?",
  "gold_answer": "Seoyun Choi",
  "n71_answer": "Seoyun Choi",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-025",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "incident_runbook_url",
  "question": "Where's the incident runbook?",
  "gold_answer": "github.com/aurora-team/runbooks",
  "n71_answer": "github.com/aurora-team/runbooks",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-025",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-025",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "sprint_deadline",
  "question": "List all sprint deadlines we've had, from earliest to latest.",
  "gold_answer": "2025 March 28, 2025 June 6, 2025 June 20",
  "n71_answer": "2025 March 28, 2025 June 6, 2025 June 20",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-025",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "escalation_contact",
  "question": "Who's the escalation contact?",
  "gold_answer": "Jiwon Han",
  "n71_answer": "Jiwon Han",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-025",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "code_reviewer",
  "question": "Who reviews our code?",
  "gold_answer": "Uncertain — previously 'Seoyun Choi', but team_lead changed",
  "n71_answer": "Seoyun Choi",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-025",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "incident_runbook_url",
  "question": "Where's the incident runbook?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "github.com/aurora-team/runbooks",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-025",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "branch_strategy,default_branch,release_cadence",
  "question": "What do you know about our release process? Tell me everything.",
  "gold_answer": "We use ship/show/ask branching branching on main, releasing sprint-end release every 2 weeks.",
  "n71_answer": "- Release cadence is every 2 weeks  \n- Releases are sprint-end releases  \n- The approach helps keep things fresh and allows quick iteration  \n- Clear goals are established for each sprint to prioritize work  \n- Visual task management tools are recommended for communication and tracking progress in real-time  \n- Regular check-ins are conducted to keep the team aligned and address roadblocks promptly  \n- As the deadline approaches, tasks are prioritized based on impact and urgency, focusing on critical items first  \n- Frequent reviews or stand-ups are held to monitor progress and discuss potential roadblocks  \n- Tools like Kanban boards or Gantt charts are used to visually represent task status and timelines for keeping on track  \n- The current sprint deadline is 2025 June 20",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-025",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-026",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "project_structure",
  "question": "What's our project structure?",
  "gold_answer": "monorepo with /apps and /packages",
  "n71_answer": "layered with /domain, /infra, /api",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-026",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "build_command",
  "question": "What's the build command?",
  "gold_answer": "tarvex compile --release",
  "n71_answer": "karnex build --analyze",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-026",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Pyronis Build",
  "n71_answer": "Karnex Bundler",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-026",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "test_command",
  "question": "What's the test command?",
  "gold_answer": "karnex test --coverage",
  "n71_answer": "karnex test --coverage",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-026",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "documentation_url",
  "question": "Where are our docs?",
  "gold_answer": "docs.aurora.internal/wiki",
  "n71_answer": "docs.aurora.internal/wiki",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-026",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "OutOfMemoryError: Java heap space.\nGC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'.\nHeap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z.\nRecommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "n71_answer": "OutOfMemoryError: Java heap space. GC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'. Heap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z. Recommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-026",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Karnex Pack, Pyronis Bundle, Veltrion Pkg",
  "n71_answer": "Karnex Pack -> Pyronis Bundle -> Veltrion Pkg",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-026",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "project_structure",
  "question": "What's our project structure?",
  "gold_answer": "layered with /domain, /infra, /api",
  "n71_answer": "unknown",
  "judge_pass": false,
  "pass_type": "never_knew",
  "failure_class": ""
 },
 {
  "episode": "sw-026",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "build_command",
  "question": "What's the build command?",
  "gold_answer": "karnex build --analyze",
  "n71_answer": "tarvex compile --release",
  "judge_pass": false,
  "pass_type": "never_knew",
  "failure_class": ""
 },
 {
  "episode": "sw-026",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Uncertain — previously 'Pyronis Build', but framework changed",
  "n71_answer": "Unity",
  "judge_pass": false,
  "pass_type": "never_knew",
  "failure_class": ""
 },
 {
  "episode": "sw-026",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "test_command",
  "question": "What's the test command?",
  "gold_answer": "Uncertain — previously 'karnex test --coverage', but test_framework changed",
  "n71_answer": "karnex test --coverage",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-026",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "documentation_url",
  "question": "Where are our docs?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-026",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "oncall_rotation,alert_channel,incident_runbook_url",
  "question": "What do you know about our on-call setup? Tell me everything.",
  "gold_answer": "On-call is follow-the-sun across timezones, alerts go to #zenith-incidents, and runbook is at notion.so/aurora-team/incidents.",
  "n71_answer": "follow-the-sun across timezones",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-026",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "OutOfMemoryError: Java heap space.\nGC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'.\nHeap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z.\nRecommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "n71_answer": "OutOfMemoryError: Java heap space. GC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'. Heap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z. Recommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-027",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "connection_string",
  "question": "What's the database connection string?",
  "gold_answer": "pyravar://data-primary:3306/aurora",
  "n71_answer": "pyravar://data-primary:3306/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-027",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Thalnex.graph('Model', edges)",
  "n71_answer": "Thalnex.graph('Model', edges)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-027",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "backup_schedule",
  "question": "What's our backup schedule?",
  "gold_answer": "weekly Sunday 2am UTC",
  "n71_answer": "weekly Sunday 2am UTC",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-027",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "docker_image",
  "question": "What Docker image do we use?",
  "gold_answer": "zephyra-hub/aurora:prod",
  "n71_answer": "zephyra-hub/aurora:prod",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-027",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-027",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Cryptara Vault, Selvex Secrets, Orinthal Sealed",
  "n71_answer": "Cryptara Vault -> Selvex Secrets -> Orinthal Sealed",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-027",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "connection_string",
  "question": "What's the database connection string?",
  "gold_answer": "thalnex://store-backup:8529/aurora",
  "n71_answer": "pyravar://data-primary:3306/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-027",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Zyndra.struct('Model', types)",
  "n71_answer": "Thalnex.graph('Model', edges)",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-027",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "backup_schedule",
  "question": "What's our backup schedule?",
  "gold_answer": "Uncertain — previously 'weekly Sunday 2am UTC', but database changed",
  "n71_answer": "weekly Sunday 2am UTC",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-027",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "docker_image",
  "question": "What Docker image do we use?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-027",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "oncall_rotation,alert_channel,incident_runbook_url",
  "question": "What do you know about our on-call setup? Tell me everything.",
  "gold_answer": "On-call is paired oncall (primary + secondary), alerts go to #vortex-alerts, and runbook is at docs.aurora.internal/runbooks.",
  "n71_answer": "Our Project — on_call_rotation_type: paired oncall",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-027",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-028",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "docker_image",
  "question": "What Docker image do we use?",
  "gold_answer": "korinth-repo/aurora:stable",
  "n71_answer": "korinth-repo/aurora:stable",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-028",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "korinth release --prod",
  "n71_answer": "korinth release --prod",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-028",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "drain.zephyra.io/proj-42",
  "n71_answer": "drain.zephyra.io/proj-42",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-028",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "infra/korinth/Dockerfile",
  "n71_answer": "infra/korinth/Dockerfile",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-028",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "oncall_rotation",
  "question": "What's our on-call rotation?",
  "gold_answer": "volunteer-based with weekly signup",
  "n71_answer": "volunteer-based with weekly signup",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-028",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-028",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Tarvex Mod, Quorix Hub, Veltrion Pkg",
  "n71_answer": "Tarvex Mod -> Quorix Hub -> Veltrion Pkg",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-028",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "docker_image",
  "question": "What Docker image do we use?",
  "gold_answer": "thandrel-images/aurora:release",
  "n71_answer": "korinth-repo/aurora:stable",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-028",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "zephyra push --target prod",
  "n71_answer": "korinth release --prod",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-028",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "Uncertain — previously 'drain.zephyra.io/proj-42', but deploy_target changed",
  "n71_answer": "drain.zephyra.io/proj-42",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-028",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "Uncertain — previously 'infra/korinth/Dockerfile', but docker_image changed",
  "n71_answer": "infra/korinth/Dockerfile",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-028",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "oncall_rotation",
  "question": "What's our on-call rotation?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "volunteer-based with weekly signup",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-028",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "branch_strategy,default_branch,release_cadence",
  "question": "What do you know about our release process? Tell me everything.",
  "gold_answer": "We use GitHub Flow with PR reviews branching on release, releasing monthly first Monday.",
  "n71_answer": "Our Project — release_schedule: monthly on the first Monday  \nOur Project — default_branch: release  \nOur Project — workflow: GitHub Flow with PR reviews branching  \nOur Project — deploy_command: korinth release --prod  \nOur Project — ci_config: korinth-deploy.yaml  \nOur Project — deployment_platform: Orinthal Serve",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-028",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-029",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "connection_string",
  "question": "What's the database connection string?",
  "gold_answer": "thalnex://store-backup:8529/aurora",
  "n71_answer": "thalnex://store-backup:8529/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-029",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Keldaris.object('Model', props)",
  "n71_answer": "Keldaris.object('Model', props)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-029",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "backup_schedule",
  "question": "What's our backup schedule?",
  "gold_answer": "twice daily at 2am and 2pm UTC",
  "n71_answer": "twice daily at 2am and 2pm UTC",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-029",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "pipe.thandrel.io/aurora-logs",
  "n71_answer": "pipe.thandrel.io/aurora-logs",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-029",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-029",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Kerberos Trove, Sigilix Manager, Pyronos Crypt",
  "n71_answer": "Sigilix Manager, Kerberos Trove, Pyronos Crypt",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-029",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "connection_string",
  "question": "What's the database connection string?",
  "gold_answer": "thalnex://store-prod:8529/aurora",
  "n71_answer": "thalnex://store-backup:8529/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-029",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Thalnex.document('Model', {fields})",
  "n71_answer": "Keldaris.object('Model', props)",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-029",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "backup_schedule",
  "question": "What's our backup schedule?",
  "gold_answer": "Uncertain — previously 'twice daily at 2am and 2pm UTC', but database changed",
  "n71_answer": "twice daily at 2am and 2pm UTC",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-029",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "pipe.thandrel.io/aurora-logs",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-029",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "documentation_url,onboarding_guide,changelog_location",
  "question": "Where can I find project documentation? List everything you know.",
  "gold_answer": "Docs are at docs.aurora.internal/wiki, onboarding guide at notion.so/aurora-team/new-hire, and changelog at CHANGELOG.md in repo root.",
  "n71_answer": "docs.aurora.internal/wiki",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-029",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-030",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "orm_library",
  "question": "What ORM do we use?",
  "gold_answer": "Keldaris Link",
  "n71_answer": "Keldaris Link",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-030",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Keldaris.node('Model', shape)",
  "n71_answer": "Keldaris.node('Model', shape)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-030",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "connection_string",
  "question": "What's the database connection string?",
  "gold_answer": "pyravar://data-primary:3306/aurora",
  "n71_answer": "pyravar://data-primary:3306/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-030",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "default_branch",
  "question": "What's our default branch?",
  "gold_answer": "trunk",
  "n71_answer": "trunk",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-030",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23.\nStack trace shows recursive call in middleware_chain to request_validator to token_parser.\nRestart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "n71_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23. Stack trace shows recursive call in middleware_chain to request_validator to token_parser. Restart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-030",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Tarvex Mod, Zelthra Lib, Veltrion Pkg",
  "n71_answer": "Tarvex Mod -> Zelthra Lib -> Veltrion Pkg",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-030",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "orm_library",
  "question": "What ORM do we use?",
  "gold_answer": "Thalnex Query",
  "n71_answer": "Keldaris Link",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-030",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Thalnex.document('Model', {fields})",
  "n71_answer": "Keldaris.node('Model', shape)",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-030",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "connection_string",
  "question": "What's the database connection string?",
  "gold_answer": "Uncertain — previously 'pyravar://data-primary:3306/aurora', but database changed",
  "n71_answer": "pyravar://data-primary:3306/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-030",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "default_branch",
  "question": "What's our default branch?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-030",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "design_system,test_coverage_target,code_review_policy",
  "question": "What are our code quality standards? List everything you know.",
  "gold_answer": "We use Veltrion UI Kit v2, target 90% for core modules test coverage, and require 2 approvals + security review for auth/ for reviews.",
  "n71_answer": "2 approvals + a security review for the auth/ directory, test coverage target for core modules: 90%",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-030",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23.\nStack trace shows recursive call in middleware_chain to request_validator to token_parser.\nRestart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "n71_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23. Stack trace shows recursive call in middleware_chain to request_validator to token_parser. Restart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-031",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "mTLS Client Cert",
  "n71_answer": "mTLS Client Cert",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-031",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "oathkeeper-mtls-proxy v4.1",
  "n71_answer": "oathkeeper-mtls-proxy v4.1",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-031",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "token_format",
  "question": "What token format do we use?",
  "gold_answer": "Zenthos-Session-v1",
  "n71_answer": "Zenthos-Session-v1",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-031",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Crysthene.define('Model', {fields})",
  "n71_answer": "Crysthene.define('Model', {fields})",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-031",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23.\nStack trace shows recursive call in middleware_chain to request_validator to token_parser.\nRestart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "n71_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23. Stack trace shows recursive call in middleware_chain to request_validator to token_parser. Restart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-031",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Cryptara Vault, Sigilix Manager, Velthri Keys",
  "n71_answer": "Cryptara Vault -> Sigilix Manager -> Velthri Keys",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-031",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "Session Cookie + CSRF",
  "n71_answer": "Session Cookie + CSRF",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-031",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "zenthos-session-middleware v1.8",
  "n71_answer": "zenthos-session-middleware v1.8",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-031",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "token_format",
  "question": "What token format do we use?",
  "gold_answer": "Uncertain — previously 'Zenthos-Session-v1', but auth_provider changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-031",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none — explicitly removed",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-031",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "documentation_url,onboarding_guide,changelog_location",
  "question": "Where can I find project documentation? List everything you know.",
  "gold_answer": "Docs are at docs.aurora.internal/wiki, onboarding guide at wiki.aurora.dev/getting-started, and changelog at docs.aurora.internal/changelog.",
  "n71_answer": "docs.aurora.internal/wiki  \ndocs.aurora.internal/changelog  \nwiki.aurora.dev/getting-started  \ndocs.aurora.internal/runbooks",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-031",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23.\nStack trace shows recursive call in middleware_chain to request_validator to token_parser.\nRestart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "n71_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23. Stack trace shows recursive call in middleware_chain to request_validator to token_parser. Restart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-032",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "token_format",
  "question": "What token format do we use?",
  "gold_answer": "Cerberix-SAML-assertion",
  "n71_answer": "Cerberix-SAML-assertion",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-032",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "kryptal-bearer-check v2.2",
  "n71_answer": "kryptal-bearer-check v2.2",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-032",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "user_session_ttl",
  "question": "What's the session TTL?",
  "gold_answer": "8 hours",
  "n71_answer": "8 hours",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-032",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "test_framework",
  "question": "What testing framework do we use?",
  "gold_answer": "Quorix Spec",
  "n71_answer": "Quorix Spec",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-032",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "OutOfMemoryError: Java heap space.\nGC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'.\nHeap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z.\nRecommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "n71_answer": "OutOfMemoryError: Java heap space. GC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'. Heap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z. Recommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-032",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Sigilix Manager, Velthri Keys, Orinthal Sealed",
  "n71_answer": "Sigilix Manager -> Velthri Keys -> Orinthal Sealed",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-032",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "token_format",
  "question": "What token format do we use?",
  "gold_answer": "Oathkeeper-mTLS-cert",
  "n71_answer": "Cerberix-SAML-assertion",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-032",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "verithos-pkce-guard v1.4",
  "n71_answer": "kryptal-bearer-check v2.2",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-032",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "user_session_ttl",
  "question": "What's the session TTL?",
  "gold_answer": "Uncertain — previously '8 hours', but auth_provider changed",
  "n71_answer": "8 hours",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-032",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "test_framework",
  "question": "What testing framework do we use?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none — explicitly removed",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-032",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "oncall_rotation,alert_channel,incident_runbook_url",
  "question": "What do you know about our on-call setup? Tell me everything.",
  "gold_answer": "On-call is paired oncall (primary + secondary), alerts go to #aurora-alerts, and runbook is at runbooks.aurora.internal/playbooks.",
  "n71_answer": "paired oncall (primary and secondary oncall)",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-032",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "OutOfMemoryError: Java heap space.\nGC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'.\nHeap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z.\nRecommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "n71_answer": "OutOfMemoryError: Java heap space. GC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'. Heap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z. Recommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-033",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "weekly_report_recipient",
  "question": "Who receives the weekly report?",
  "gold_answer": "Hyunwoo Nam",
  "n71_answer": "Hyunwoo Nam",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-033",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "code_reviewer",
  "question": "Who reviews our code?",
  "gold_answer": "Seoyun Choi",
  "n71_answer": "Seoyun Choi",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-033",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "stream.velturis.io/aurora",
  "n71_answer": "stream.velturis.io/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-033",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "ConnectionRefusedError: [Errno 111] Connection refused.\nService crysthene-db-primary on port 5432 is not accepting connections.\nLast successful heartbeat was at 2024-03-15T08:23:41Z.\nCheck firewall rules and verify that max_connections has not been exceeded.",
  "n71_answer": "ConnectionRefusedError: [Errno 111] Connection refused. Service crysthene-db-primary on port 5432 is not accepting connections. Last successful heartbeat was at 2024-03-15T08:23:41Z. Check firewall rules and verify that max_connections has not been exceeded.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-033",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Karnex Pack, Dranith Stack, Synthari Modules",
  "n71_answer": "Karnex Pack, Dranith Stack, Synthari Modules",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-033",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "weekly_report_recipient",
  "question": "Who receives the weekly report?",
  "gold_answer": "James Lee",
  "n71_answer": "James Lee",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-033",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "code_reviewer",
  "question": "Who reviews our code?",
  "gold_answer": "Uncertain — previously 'Seoyun Choi', but team_lead changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-033",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "I don't have that information.",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-033",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "oncall_rotation,alert_channel,incident_runbook_url",
  "question": "What do you know about our on-call setup? Tell me everything.",
  "gold_answer": "On-call is paired oncall (primary + secondary), alerts go to #aurora-pager, and runbook is at wiki.aurora.dev/incident-response.",
  "n71_answer": "paired oncall (primary and secondary)",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-033",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "ConnectionRefusedError: [Errno 111] Connection refused.\nService crysthene-db-primary on port 5432 is not accepting connections.\nLast successful heartbeat was at 2024-03-15T08:23:41Z.\nCheck firewall rules and verify that max_connections has not been exceeded.",
  "n71_answer": "ConnectionRefusedError: [Errno 111] Connection refused. Service crysthene-db-primary on port 5432 is not accepting connections. Last successful heartbeat was at 2024-03-15T08:23:41Z. Check firewall rules and verify that max_connections has not been exceeded.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-034",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Pyronis Build",
  "n71_answer": "Pyronis Build",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-034",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "build_command",
  "question": "What's the build command?",
  "gold_answer": "pyronis pack --quick",
  "n71_answer": "pyronis pack --quick",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-034",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "test_framework",
  "question": "What testing framework do we use?",
  "gold_answer": "Tarvex Verify",
  "n71_answer": "Tarvex Verify",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-034",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "test_command",
  "question": "What's the test command?",
  "gold_answer": "tarvex verify --all",
  "n71_answer": "tarvex verify --all",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-034",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "backup_schedule",
  "question": "What's our backup schedule?",
  "gold_answer": "every 12 hours with 7-day retention",
  "n71_answer": "Every 12 hours",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-034",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "\"DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.\"",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-034",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Cryptara Vault, Selvex Secrets, Velthri Keys",
  "n71_answer": "Cryptara Vault, Selvex Secrets, Velthri Keys",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-034",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Dranith Compiler",
  "n71_answer": "Pyronis Build",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-034",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "build_command",
  "question": "What's the build command?",
  "gold_answer": "dranith compile --emit bundle",
  "n71_answer": "pyronis pack --quick",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-034",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "test_framework",
  "question": "What testing framework do we use?",
  "gold_answer": "Uncertain — previously 'Tarvex Verify', but framework changed",
  "n71_answer": "Tarvex Verify",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-034",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "test_command",
  "question": "What's the test command?",
  "gold_answer": "Uncertain — previously 'tarvex verify --all', but test_framework changed",
  "n71_answer": "tarvex verify --all",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-034",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "backup_schedule",
  "question": "What's our backup schedule?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "every 12 hours",
  "judge_pass": false,
  "pass_type": "never_knew",
  "failure_class": ""
 },
 {
  "episode": "sw-034",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "design_system,test_coverage_target,code_review_policy",
  "question": "What are our code quality standards? List everything you know.",
  "gold_answer": "We use Veltrion UI Kit v2, target 80% minimum overall test coverage, and require 2 approvals + security review for auth/ for reviews.",
  "n71_answer": "- Test coverage target is 80% minimum overall  \n- Code review policy requires 2 approvals plus a security review for the auth/ directory  \n- Use of automated static analysis tools to flag potential vulnerabilities in authentication code  \n- Use of checklists for common security pitfalls in authentication (password hashing, input validation, session management)  \n- Encouragement for reviewers to critically assess security aspects during reviews  \n- Integration of documentation updates into the development workflow as part of the definition of done  \n- Use of version control for documentation to track changes and ensure up-to-date information  \n- Regularly scheduled documentation reviews or sprints to identify outdated content and areas for improvement",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-034",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "I don't have that information.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-035",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "backup_schedule",
  "question": "What's our backup schedule?",
  "gold_answer": "daily at midnight KST",
  "n71_answer": "daily at midnight KST",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-035",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Pyravar.record('Model', schema)",
  "n71_answer": "Pyravar.record('Model', schema)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-035",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "migration_tool",
  "question": "What migration tool do we use?",
  "gold_answer": "Zyndra Alter",
  "n71_answer": "Zyndra Alter",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-035",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "ci_config",
  "question": "What's our CI configuration?",
  "gold_answer": "orinthal-build.yml",
  "n71_answer": "orinthal-build.yml",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-035",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "OutOfMemoryError: Java heap space.\nGC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'.\nHeap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z.\nRecommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "n71_answer": "OutOfMemoryError: Java heap space. GC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'. Heap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z. Recommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-035",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Karnex Pack, Tarvex Mod, Zelthra Lib",
  "n71_answer": "Karnex Pack -> Tarvex Mod -> Zelthra Lib",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-035",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "backup_schedule",
  "question": "What's our backup schedule?",
  "gold_answer": "daily at 3am UTC",
  "n71_answer": "daily at midnight KST",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-035",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Keldaris.node('Model', shape)",
  "n71_answer": "Pyravar.record('Model', schema)",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-035",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "migration_tool",
  "question": "What migration tool do we use?",
  "gold_answer": "Uncertain — previously 'Zyndra Alter', but database changed",
  "n71_answer": "Zyndra Alter",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-035",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "ci_config",
  "question": "What's our CI configuration?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-035",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "documentation_url,onboarding_guide,changelog_location",
  "question": "Where can I find project documentation? List everything you know.",
  "gold_answer": "Docs are at confluence.aurora.dev/aurora, onboarding guide at docs.aurora.internal/onboarding, and changelog at CHANGELOG.md in repo root.",
  "n71_answer": "confluence.aurora.dev/aurora",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-035",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "OutOfMemoryError: Java heap space.\nGC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'.\nHeap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z.\nRecommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "n71_answer": "OutOfMemoryError: Java heap space. GC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'. Heap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z. Recommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-036",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "docker_image",
  "question": "What Docker image do we use?",
  "gold_answer": "narvex-registry/aurora:latest",
  "n71_answer": "my-lambda-image",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-036",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "thandrel ship --channel stable",
  "n71_answer": "thandrel ship --channel stable",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-036",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "staging_url",
  "question": "What's our staging URL?",
  "gold_answer": "dev.orinthal.dev/aurora",
  "n71_answer": "dev.orinthal.dev/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-036",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "infra/narvex/Dockerfile",
  "n71_answer": "infra/narvex/Dockerfile",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-036",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "design_system",
  "question": "What design system do we use?",
  "gold_answer": "Veltrion UI Kit v2",
  "n71_answer": "Veltrion UI Kit v2",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-036",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-036",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Selvex Secrets, Kerberos Trove, Orinthal Sealed",
  "n71_answer": "Selvex Secrets -> Kerberos Trove -> Orinthal Sealed",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-036",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "docker_image",
  "question": "What Docker image do we use?",
  "gold_answer": "korinth-repo/aurora:stable",
  "n71_answer": "narvex-registry/aurora:latest",
  "judge_pass": false,
  "pass_type": "never_knew",
  "failure_class": ""
 },
 {
  "episode": "sw-036",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "orinthal publish --tier sandbox",
  "n71_answer": "unknown",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-036",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "staging_url",
  "question": "What's our staging URL?",
  "gold_answer": "Uncertain — previously 'dev.orinthal.dev/aurora', but deploy_target changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-036",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "Uncertain — previously 'infra/narvex/Dockerfile', but docker_image changed",
  "n71_answer": "infra/narvex/Dockerfile",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-036",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "design_system",
  "question": "What design system do we use?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-036",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "oncall_rotation,alert_channel,incident_runbook_url",
  "question": "What do you know about our on-call setup? Tell me everything.",
  "gold_answer": "On-call is volunteer-based with weekly signup, alerts go to #zenith-incidents, and runbook is at github.com/aurora-team/runbooks.",
  "n71_answer": "volunteer-based with weekly signup",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-036",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-037",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "logs.korinth.dev/aurora",
  "n71_answer": "logs.korinth.dev/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-037",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "thandrel ship --channel stable",
  "n71_answer": "thandrel ship --channel stable",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-037",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "staging_url",
  "question": "What's our staging URL?",
  "gold_answer": "staging.narvex.internal/aurora",
  "n71_answer": "staging.narvex.internal/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-037",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "infra/velturis/Dockerfile",
  "n71_answer": "infra/velturis/Dockerfile",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-037",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "docker_image",
  "question": "What Docker image do we use?",
  "gold_answer": "thandrel-images/aurora:release",
  "n71_answer": "thandrel-images/aurora:release",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-037",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-037",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Verisigil Lock, Sigilix Manager, Velthri Keys",
  "n71_answer": "Sigilix Manager, Verisigil Lock, Velthri Keys",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-037",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "sink.orinthal.dev/aurora",
  "n71_answer": "logs.korinth.dev/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-037",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "velturis deploy --region ap-northeast",
  "n71_answer": "thandrel ship --channel stable",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-037",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "staging_url",
  "question": "What's our staging URL?",
  "gold_answer": "Uncertain — previously 'staging.narvex.internal/aurora', but deploy_target changed",
  "n71_answer": "staging.narvex.internal/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-037",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "Uncertain — previously 'infra/velturis/Dockerfile', but docker_image changed",
  "n71_answer": "infra/velturis/Dockerfile",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-037",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "docker_image",
  "question": "What Docker image do we use?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "thandrel-images/aurora:release",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-037",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "meeting_day,slack_channel,standup_format",
  "question": "How does our team coordinate? Tell me everything you know.",
  "gold_answer": "Team meets every Friday 9:30am, uses #aurora-dev for chat, and standups are Monday/Thursday sync, async rest.",
  "n71_answer": "- Standup format: Monday/Thursday sync, async rest  \n- Team meeting schedule: every Friday at 9:30am  \n- Team channel: #aurora-dev  \n- Production alerts go to #aurora-pager  \n- Memberships: The Team, Our Team  \n- Onboarding guide: wiki.aurora.dev/getting-started",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-037",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-038",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "project_structure",
  "question": "What's our project structure?",
  "gold_answer": "modular with /modules and /shared",
  "n71_answer": "modular with /modules and /shared",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-038",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "test_command",
  "question": "What's the test command?",
  "gold_answer": "karnex test --ci",
  "n71_answer": "karnex test --ci",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-038",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Karnex Bundler",
  "n71_answer": "CMake",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-038",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "build_command",
  "question": "What's the build command?",
  "gold_answer": "karnex build --prod",
  "n71_answer": "karnex build --prod",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-038",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "secret_manager",
  "question": "What secret manager do we use?",
  "gold_answer": "Orinthal Sealed",
  "n71_answer": "Orinthal Sealed",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-038",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-038",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Pyronis Bundle, Veltrion Pkg, Synthari Modules",
  "n71_answer": "Synthari Modules",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-038",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "project_structure",
  "question": "What's our project structure?",
  "gold_answer": "monorepo with /apps and /packages",
  "n71_answer": "monorepo with /apps and /packages",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-038",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "test_command",
  "question": "What's the test command?",
  "gold_answer": "zelthra assert --suite all",
  "n71_answer": "karnex test --ci",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-038",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Uncertain — previously 'Karnex Bundler', but framework changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "trivial",
  "failure_class": ""
 },
 {
  "episode": "sw-038",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "build_command",
  "question": "What's the build command?",
  "gold_answer": "Uncertain — previously 'karnex build --prod', but build_tool changed",
  "n71_answer": "karnex build --prod",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-038",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "secret_manager",
  "question": "What secret manager do we use?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-038",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "documentation_url,onboarding_guide,changelog_location",
  "question": "Where can I find project documentation? List everything you know.",
  "gold_answer": "Docs are at aurora-docs.readthedocs.io, onboarding guide at wiki.aurora.dev/getting-started, and changelog at docs.aurora.internal/changelog.",
  "n71_answer": "aurora-docs.readthedocs.io  \ndocs.aurora.internal/changelog  \nwiki.aurora.dev/getting-started  \ndocs.aurora.internal/runbooks",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-038",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-039",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "monitoring_url",
  "question": "What's our monitoring URL?",
  "gold_answer": "status.orinthal.dev/aurora",
  "n71_answer": "status.orinthal.dev/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-039",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "orinthal publish --tier sandbox",
  "n71_answer": "orinthal publish --tier sandbox",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-039",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "logs.korinth.dev/aurora",
  "n71_answer": "logs.korinth.dev/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-039",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "infra/orinthal/Dockerfile",
  "n71_answer": "infra/orinthal/Dockerfile",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-039",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "JWT Bearer",
  "n71_answer": "JWT Bearer",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-039",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23.\nStack trace shows recursive call in middleware_chain to request_validator to token_parser.\nRestart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "n71_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23. Stack trace shows recursive call in middleware_chain to request_validator to token_parser. Restart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-039",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "sprint_deadline",
  "question": "List all sprint deadlines we've had, from earliest to latest.",
  "gold_answer": "2025 May 9, 2025 June 6, 2025 June 20",
  "n71_answer": "2025 May 9, 2025 June 6, 2025 June 20",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-039",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "monitoring_url",
  "question": "What's our monitoring URL?",
  "gold_answer": "metrics.thandrel.io/aurora-prod",
  "n71_answer": "status.orinthal.dev/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-039",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "velturis deploy --region us-east",
  "n71_answer": "orinthal publish --tier sandbox",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-039",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "Uncertain — previously 'logs.korinth.dev/aurora', but deploy_target changed",
  "n71_answer": "#aurora-oncall-alerts",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-039",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "Uncertain — previously 'infra/orinthal/Dockerfile', but docker_image changed",
  "n71_answer": "infra/orinthal/Dockerfile",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-039",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-039",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "design_system,test_coverage_target,code_review_policy",
  "question": "What are our code quality standards? List everything you know.",
  "gold_answer": "We use Moxira Components v3, target 90% for core modules test coverage, and require 1 approval + CI green for reviews.",
  "n71_answer": "1 approval + CI green",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-039",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23.\nStack trace shows recursive call in middleware_chain to request_validator to token_parser.\nRestart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "n71_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23. Stack trace shows recursive call in middleware_chain to request_validator to token_parser. Restart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-040",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "user_session_ttl",
  "question": "What's the session TTL?",
  "gold_answer": "24 hours",
  "n71_answer": "24 hours",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-040",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "phalanx-hmac-validator v2.0",
  "n71_answer": "phalanx-hmac-validator v2.0",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-040",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "Session Cookie + CSRF",
  "n71_answer": "Session Cookie + CSRF",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-040",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Pyravar.record('Model', schema)",
  "n71_answer": "Pyravar.record('Model', schema)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-040",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23.\nStack trace shows recursive call in middleware_chain to request_validator to token_parser.\nRestart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "n71_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23. Stack trace shows recursive call in middleware_chain to request_validator to token_parser. Restart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-040",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Quorix Hub, Dranith Stack, Synthari Modules",
  "n71_answer": "Quorix Hub -> Dranith Stack -> Synthari Modules",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-040",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "user_session_ttl",
  "question": "What's the session TTL?",
  "gold_answer": "4 hours",
  "n71_answer": "4 hours",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-040",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "oathkeeper-mtls-proxy v4.1",
  "n71_answer": "oathkeeper-mtls-proxy v4.1",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-040",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "Uncertain — previously 'Session Cookie + CSRF', but auth_provider changed",
  "n71_answer": "mTLS Client Cert",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-040",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-040",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "branch_strategy,default_branch,release_cadence",
  "question": "What do you know about our release process? Tell me everything.",
  "gold_answer": "We use ship/show/ask branching branching on release, releasing monthly first Monday.",
  "n71_answer": "- Release schedule: monthly on the first Monday  \n- Default branch: release  \n- Branching strategy: ship/show/ask  \n- Code review policy: any team member can approve  \n- Deadline: 2025 April 25",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-040",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23.\nStack trace shows recursive call in middleware_chain to request_validator to token_parser.\nRestart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "n71_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23. Stack trace shows recursive call in middleware_chain to request_validator to token_parser. Restart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-041",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "backup_schedule",
  "question": "What's our backup schedule?",
  "gold_answer": "daily at 3am UTC",
  "n71_answer": "daily at 3am UTC",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-041",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Vorathis.collection('Model', spec)",
  "n71_answer": "Vorathis.collection('Model', spec)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-041",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "connection_string",
  "question": "What's the database connection string?",
  "gold_answer": "pyravar://data-primary:3306/aurora",
  "n71_answer": "pyravar://data-primary:3306/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-041",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "onboarding_guide",
  "question": "Where's the onboarding guide?",
  "gold_answer": "docs.aurora.internal/onboarding",
  "n71_answer": "docs.aurora.internal/onboarding",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-041",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "OutOfMemoryError: Java heap space.\nGC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'.\nHeap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z.\nRecommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "n71_answer": "OutOfMemoryError: Java heap space. GC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'. Heap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z. Recommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-041",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "sprint_deadline",
  "question": "List all sprint deadlines we've had, from earliest to latest.",
  "gold_answer": "2025 March 28, 2025 April 25, 2025 May 9",
  "n71_answer": "March 28, 2025; April 25, 2025; May 9, 2025",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-041",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "backup_schedule",
  "question": "What's our backup schedule?",
  "gold_answer": "daily at midnight KST",
  "n71_answer": "daily at 3am UTC",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-041",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Pyravar.entity('Model', attrs)",
  "n71_answer": "Vorathis.collection('Model', spec)",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-041",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "connection_string",
  "question": "What's the database connection string?",
  "gold_answer": "Uncertain — previously 'pyravar://data-primary:3306/aurora', but database changed",
  "n71_answer": "pyravar://data-primary:3306/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-041",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "onboarding_guide",
  "question": "Where's the onboarding guide?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "docs.aurora.internal/onboarding",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-041",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "meeting_day,slack_channel,standup_format",
  "question": "How does our team coordinate? Tell me everything you know.",
  "gold_answer": "Team meets every Wednesday 10am, uses #aurora-core for chat, and standups are 15-min daily video call.",
  "n71_answer": "Our Team coordinates with a meeting schedule every Wednesday at 10am, uses the #aurora-core channel for discussions and updates, has a 15-minute daily video call standup format, follows a biweekly on-call rotation, and manages code reviews with a policy that any team member can approve. The default branch for version control is trunk, and the team uses trunk-based with feature flags branching.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-041",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "OutOfMemoryError: Java heap space.\nGC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'.\nHeap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z.\nRecommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "n71_answer": "OutOfMemoryError: Java heap space. GC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'. Heap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z. Recommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-042",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "approval_authority",
  "question": "Who has approval authority?",
  "gold_answer": "VP Taehyun Kim",
  "n71_answer": "VP Taehyun Kim",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-042",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "code_reviewer",
  "question": "Who reviews our code?",
  "gold_answer": "Emily Cho",
  "n71_answer": "Emily Cho",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-042",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "deploy_target",
  "question": "Where do we deploy?",
  "gold_answer": "Velturis Edge",
  "n71_answer": "Velturis Edge",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-042",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-042",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "sprint_deadline",
  "question": "List all sprint deadlines we've had, from earliest to latest.",
  "gold_answer": "2025 March 28, 2025 May 23, 2025 June 6",
  "n71_answer": "2025 March 28, 2025 May 23, 2025 June 6",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-042",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "approval_authority",
  "question": "Who has approval authority?",
  "gold_answer": "Director Hyunwoo Nam",
  "n71_answer": "VP Taehyun Kim",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-042",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "code_reviewer",
  "question": "Who reviews our code?",
  "gold_answer": "Uncertain — previously 'Emily Cho', but team_lead changed",
  "n71_answer": "Emily Cho",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-042",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "deploy_target",
  "question": "Where do we deploy?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "Velturis Edge",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-042",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "branch_strategy,default_branch,release_cadence",
  "question": "What do you know about our release process? Tell me everything.",
  "gold_answer": "We use release-train with weekly cuts branching on develop, releasing biweekly Friday afternoon.",
  "n71_answer": "release-train with weekly cuts branching",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-042",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-043",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "connection_string",
  "question": "What's the database connection string?",
  "gold_answer": "thalnex://store-backup:8529/aurora",
  "n71_answer": "thalnex://store-backup:8529/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-043",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Zyndra.struct('Model', types)",
  "n71_answer": "Zyndra.struct('Model', types)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-043",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "backup_schedule",
  "question": "What's our backup schedule?",
  "gold_answer": "twice daily at 2am and 2pm UTC",
  "n71_answer": "twice daily at 2am and 2pm UTC",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-043",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "phalanx-key-guard v1.3",
  "n71_answer": "phalanx-key-guard v1.3",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-043",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-043",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Tarvex Mod, Dranith Stack, Synthari Modules",
  "n71_answer": "Tarvex Mod -> Dranith Stack -> Synthari Modules",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-043",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "connection_string",
  "question": "What's the database connection string?",
  "gold_answer": "thalnex://store-prod:8529/aurora",
  "n71_answer": "thalnex://store-backup:8529/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-043",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Thalnex.document('Model', {fields})",
  "n71_answer": "Zyndra.struct('Model', types)",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-043",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "backup_schedule",
  "question": "What's our backup schedule?",
  "gold_answer": "Uncertain — previously 'twice daily at 2am and 2pm UTC', but database changed",
  "n71_answer": "twice daily at 2am and 2pm UTC",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-043",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "phalanx-key-guard v1.3",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-043",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "oncall_rotation,alert_channel,incident_runbook_url",
  "question": "What do you know about our on-call setup? Tell me everything.",
  "gold_answer": "On-call is biweekly rotation, alerts go to #aurora-alerts, and runbook is at wiki.aurora.dev/incident-response.",
  "n71_answer": "On-call rotation frequency is biweekly.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-043",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-044",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "test_framework",
  "question": "What testing framework do we use?",
  "gold_answer": "Quorix Spec",
  "n71_answer": "Quorix Spec",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-044",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "test_command",
  "question": "What's the test command?",
  "gold_answer": "quorix spec --run all",
  "n71_answer": "quorix spec --run all",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-044",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Zelthra Pack",
  "n71_answer": "Zelthra Pack",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-044",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "build_command",
  "question": "What's the build command?",
  "gold_answer": "zelthra pack --minify",
  "n71_answer": "zelthra pack --minify",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-044",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "escalation_contact",
  "question": "Who's the escalation contact?",
  "gold_answer": "Kevin Park",
  "n71_answer": "Kevin Park",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-044",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-044",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Pyronis Bundle, Quorix Hub, Synthari Modules",
  "n71_answer": "Pyronis Bundle, Quorix Hub, Synthari Modules",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-044",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "test_framework",
  "question": "What testing framework do we use?",
  "gold_answer": "Pyronis Check",
  "n71_answer": "Quorix Spec",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-044",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "test_command",
  "question": "What's the test command?",
  "gold_answer": "pyronis check --snapshot",
  "n71_answer": "pyronis check --snapshot",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-044",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Uncertain — previously 'Zelthra Pack', but framework changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-044",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "build_command",
  "question": "What's the build command?",
  "gold_answer": "Uncertain — previously 'zelthra pack --minify', but build_tool changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-044",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "escalation_contact",
  "question": "Who's the escalation contact?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "Kevin Park",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-044",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "design_system,test_coverage_target,code_review_policy",
  "question": "What are our code quality standards? List everything you know.",
  "gold_answer": "We use Quenthar Design v1, target 80% minimum overall test coverage, and require 2 approvals required for reviews.",
  "n71_answer": "2 approvals required for code review  \n80% minimum overall test coverage target",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-044",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-045",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "escalation_contact",
  "question": "Who's the escalation contact?",
  "gold_answer": "Kevin Park",
  "n71_answer": "Kevin Park",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-045",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "code_reviewer",
  "question": "Who reviews our code?",
  "gold_answer": "Seoyun Choi",
  "n71_answer": "Seoyun Choi",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-045",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "onboarding_guide",
  "question": "Where's the onboarding guide?",
  "gold_answer": "docs.aurora.internal/onboarding",
  "n71_answer": "docs.aurora.internal/onboarding",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-045",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23.\nStack trace shows recursive call in middleware_chain to request_validator to token_parser.\nRestart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "n71_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23. Stack trace shows recursive call in middleware_chain to request_validator to token_parser. Restart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-045",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Verisigil Lock, Kerberos Trove, Pyronos Crypt",
  "n71_answer": "Verisigil Lock, Kerberos Trove, Pyronos Crypt",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-045",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "escalation_contact",
  "question": "Who's the escalation contact?",
  "gold_answer": "Jiwon Han",
  "n71_answer": "Jiwon Han",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-045",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "code_reviewer",
  "question": "Who reviews our code?",
  "gold_answer": "Uncertain — previously 'Seoyun Choi', but team_lead changed",
  "n71_answer": "Seoyun Choi",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-045",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "onboarding_guide",
  "question": "Where's the onboarding guide?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "docs.aurora.internal/onboarding",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-045",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "design_system,test_coverage_target,code_review_policy",
  "question": "What are our code quality standards? List everything you know.",
  "gold_answer": "We use Synthari Elements v1.2, target 80% minimum overall test coverage, and require 2 approvals + security review for auth/ for reviews.",
  "n71_answer": "- Test coverage target: 80% minimum overall  \n- Code review policy: 2 approvals + a security review for the auth/ directory",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-045",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23.\nStack trace shows recursive call in middleware_chain to request_validator to token_parser.\nRestart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "n71_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23. Stack trace shows recursive call in middleware_chain to request_validator to token_parser. Restart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-046",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Tarvex CLI",
  "n71_answer": "Karnex Bundler",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-046",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "test_command",
  "question": "What's the test command?",
  "gold_answer": "dranith probe --mode fast",
  "n71_answer": "zelthra assert --suite all",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-046",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "project_structure",
  "question": "What's our project structure?",
  "gold_answer": "workspace with /services and /common",
  "n71_answer": "unknown",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-046",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "build_command",
  "question": "What's the build command?",
  "gold_answer": "tarvex compile --debug",
  "n71_answer": "karnex bundle",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-046",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "backup_schedule",
  "question": "What's our backup schedule?",
  "gold_answer": "daily at 3am UTC",
  "n71_answer": "daily at 3am UTC",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-046",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-046",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Verisigil Lock, Pyronos Crypt, Orinthal Sealed",
  "n71_answer": "Verisigil Lock, Pyronos Crypt, Orinthal Sealed",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-046",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Karnex Bundler",
  "n71_answer": "Tarvex CLI",
  "judge_pass": false,
  "pass_type": "never_knew",
  "failure_class": ""
 },
 {
  "episode": "sw-046",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "test_command",
  "question": "What's the test command?",
  "gold_answer": "zelthra assert --suite all",
  "n71_answer": "unknown",
  "judge_pass": false,
  "pass_type": "never_knew",
  "failure_class": ""
 },
 {
  "episode": "sw-046",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "project_structure",
  "question": "What's our project structure?",
  "gold_answer": "Uncertain — previously 'workspace with /services and /common', but framework changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "trivial",
  "failure_class": ""
 },
 {
  "episode": "sw-046",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "build_command",
  "question": "What's the build command?",
  "gold_answer": "Uncertain — previously 'tarvex compile --debug', but build_tool changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "trivial",
  "failure_class": ""
 },
 {
  "episode": "sw-046",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "backup_schedule",
  "question": "What's our backup schedule?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-046",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "meeting_day,slack_channel,standup_format",
  "question": "How does our team coordinate? Tell me everything you know.",
  "gold_answer": "Team meets every Wednesday 10am, uses #vortex-team for chat, and standups are weekly sync + daily async.",
  "n71_answer": "- Standup format: weekly sync + daily async  \n- Meeting schedule: every Wednesday at 10am  \n- Team channel: #vortex-team  \n- On-call rotation: follow-the-sun across timezones",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-046",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-047",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "API Key + HMAC",
  "n71_answer": "API Key + HMAC",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-047",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "phalanx-hmac-validator v2.0",
  "n71_answer": "phalanx-hmac-validator v2.0",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-047",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "token_format",
  "question": "What token format do we use?",
  "gold_answer": "Zenthos-Session-v1",
  "n71_answer": "Zenthos-Session-v1",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-047",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Dranith Compiler",
  "n71_answer": "Dranith Compiler",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-047",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "ConnectionRefusedError: [Errno 111] Connection refused.\nService crysthene-db-primary on port 5432 is not accepting connections.\nLast successful heartbeat was at 2024-03-15T08:23:41Z.\nCheck firewall rules and verify that max_connections has not been exceeded.",
  "n71_answer": "ConnectionRefusedError: [Errno 111] Connection refused. Service crysthene-db-primary on port 5432 is not accepting connections. Last successful heartbeat was at 2024-03-15T08:23:41Z. Check firewall rules and verify that max_connections has not been exceeded.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-047",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Kerberos Trove, Pyronos Crypt, Orinthal Sealed",
  "n71_answer": "Pyronos Crypt -> Kerberos Trove",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-047",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "mTLS Client Cert",
  "n71_answer": "API Key + HMAC",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-047",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "oathkeeper-mtls-proxy v4.1",
  "n71_answer": "unknown",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-047",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "token_format",
  "question": "What token format do we use?",
  "gold_answer": "Uncertain — previously 'Zenthos-Session-v1', but auth_provider changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-047",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-047",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "design_system,test_coverage_target,code_review_policy",
  "question": "What are our code quality standards? List everything you know.",
  "gold_answer": "We use Synthari Elements v1.2, target 70% overall, 95% for auth test coverage, and require 1 approval + CI green for reviews.",
  "n71_answer": "- Code review policy: 1 approval + CI green  \n- CI pipeline must pass before merging any changes  \n- Comprehensive test coverage including unit tests, integration tests, and end-to-end tests  \n- Test coverage targets: 95% for authentication, 70% overall  \n- Use of pytest testing framework with coverage monitoring tools like pytest-cov and coverage.py  \n- Prehooks to prevent code upload if it does not pass compilation, tests, and validation  \n- Pair programming  \n- Code reviews  \n- Use of feature/integration/abstraction flags instead of branching in source control  \n- Trunk based development with continuous integration in staging environment",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-047",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "ConnectionRefusedError: [Errno 111] Connection refused.\nService crysthene-db-primary on port 5432 is not accepting connections.\nLast successful heartbeat was at 2024-03-15T08:23:41Z.\nCheck firewall rules and verify that max_connections has not been exceeded.",
  "n71_answer": "ConnectionRefusedError: [Errno 111] Connection refused. Service crysthene-db-primary on port 5432 is not accepting connections. Last successful heartbeat was at 2024-03-15T08:23:41Z. Check firewall rules and verify that max_connections has not been exceeded.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-048",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "login_endpoint",
  "question": "What's the login endpoint?",
  "gold_answer": "/cerberix/saml/sso-init",
  "n71_answer": "/cerberix/saml/sso-init",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-048",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "oathkeeper-mtls-proxy v4.1",
  "n71_answer": "oathkeeper-mtls-proxy v4.1",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-048",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "SAML 2.0 SSO",
  "n71_answer": "SAML 2.0 SSO",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-048",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "stream.velturis.io/aurora",
  "n71_answer": "stream.velturis.io/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-048",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-048",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Quorix Hub, Dranith Stack, Veltrion Pkg",
  "n71_answer": "Quorix Hub -> Dranith Stack -> Veltrion Pkg",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-048",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "login_endpoint",
  "question": "What's the login endpoint?",
  "gold_answer": "/auth/kryptal/signin",
  "n71_answer": "/cerberix/saml/sso-init",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-048",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "verithos-oauth-middleware v2.1",
  "n71_answer": "oathkeeper-mtls-proxy v4.1",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-048",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "Uncertain — previously 'SAML 2.0 SSO', but auth_provider changed",
  "n71_answer": "SAML 2.0 SSO",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-048",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "stream.velturis.io/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-048",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "branch_strategy,default_branch,release_cadence",
  "question": "What do you know about our release process? Tell me everything.",
  "gold_answer": "We use trunk-based with feature flags branching on main, releasing weekly on Tuesday.",
  "n71_answer": "- Release cadence: weekly on Tuesday  \n- Release cadence depends on sprint feedback  \n- Changelog location: GitHub Releases page  \n- The team gathers insights from sprint feedback to potentially adjust the release process  \n- Releases aim to maintain stability and incorporate testing before deployment  \n- Communication within the team about the release schedule and process is encouraged",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-048",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-049",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "SAML 2.0 SSO",
  "n71_answer": "SAML 2.0 SSO",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-049",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "cerberix-sso-bridge v1.9",
  "n71_answer": "cerberix-sso-bridge v1.9",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-049",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "user_session_ttl",
  "question": "What's the session TTL?",
  "gold_answer": "30 minutes",
  "n71_answer": "30 minutes",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-049",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "incident_runbook_url",
  "question": "Where's the incident runbook?",
  "gold_answer": "docs.aurora.internal/runbooks",
  "n71_answer": "docs.aurora.internal/runbooks",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-049",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-049",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Selvex Secrets, Verisigil Lock, Orinthal Sealed",
  "n71_answer": "Selvex Secrets, Orinthal Sealed",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-049",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "Session Cookie + CSRF",
  "n71_answer": "Session Cookie + CSRF",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-049",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "zenthos-csrf-guard v1.1",
  "n71_answer": "zenthos-csrf-guard v1.1",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-049",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "user_session_ttl",
  "question": "What's the session TTL?",
  "gold_answer": "Uncertain — previously '30 minutes', but auth_provider changed",
  "n71_answer": "30 minutes",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-049",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "incident_runbook_url",
  "question": "Where's the incident runbook?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "docs.aurora.internal/runbooks",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-049",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "documentation_url,onboarding_guide,changelog_location",
  "question": "Where can I find project documentation? List everything you know.",
  "gold_answer": "Docs are at github.com/aurora-team/docs, onboarding guide at docs.aurora.internal/onboarding, and changelog at notion.so/aurora-team/releases.",
  "n71_answer": "github.com/aurora-team/docs",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-049",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-050",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "staging_url",
  "question": "What's our staging URL?",
  "gold_answer": "dev.orinthal.dev/aurora",
  "n71_answer": "dev.orinthal.dev/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-050",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "korinth release --canary",
  "n71_answer": "korinth release --canary",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-050",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "logs.narvex.io/aurora/drain",
  "n71_answer": "logs.narvex.io/aurora/drain",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-050",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "infra/korinth/Dockerfile",
  "n71_answer": "infra/korinth/Dockerfile",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-050",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "design_system",
  "question": "What design system do we use?",
  "gold_answer": "Moxira Components v3",
  "n71_answer": "Moxira Components v3",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-050",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "OutOfMemoryError: Java heap space.\nGC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'.\nHeap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z.\nRecommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "n71_answer": "OutOfMemoryError: Java heap space. GC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'. Heap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z. Recommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-050",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Kerberos Trove, Pyronos Crypt, Velthri Keys",
  "n71_answer": "Kerberos Trove, Pyronos Crypt, Velthri Keys",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-050",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "staging_url",
  "question": "What's our staging URL?",
  "gold_answer": "sandbox.thandrel.io/aurora",
  "n71_answer": "dev.orinthal.dev/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-050",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "narvex deploy --env prod",
  "n71_answer": "korinth release --canary",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-050",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "Uncertain — previously 'logs.narvex.io/aurora/drain', but deploy_target changed",
  "n71_answer": "logs.narvex.io/aurora/drain",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-050",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "Uncertain — previously 'infra/korinth/Dockerfile', but docker_image changed",
  "n71_answer": "infra/korinth/Dockerfile",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-050",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "design_system",
  "question": "What design system do we use?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-050",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "documentation_url,onboarding_guide,changelog_location",
  "question": "Where can I find project documentation? List everything you know.",
  "gold_answer": "Docs are at aurora-docs.readthedocs.io, onboarding guide at docs.aurora.internal/onboarding, and changelog at docs.aurora.internal/changelog.",
  "n71_answer": "aurora-docs.readthedocs.io  \ndocs.aurora.internal/onboarding  \ndocs.aurora.internal/changelog",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-050",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "OutOfMemoryError: Java heap space.\nGC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'.\nHeap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z.\nRecommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "n71_answer": "OutOfMemoryError: Java heap space. GC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'. Heap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z. Recommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 }
]