[
 {
  "episode": "pl-001",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "medication",
  "question": "What medication am I taking?",
  "gold_answer": "Quelmithin",
  "n71_answer": "Quelmithin",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-001",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Crysthene Pool",
  "n71_answer": "Crysthene Pool",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-001",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "tutoring session (weekly)",
  "n71_answer": "tutoring session (weekly)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-001",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "partner",
  "question": "What's my partner's name?",
  "gold_answer": "James",
  "n71_answer": "James",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-001",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "n71_answer": "\"We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.\"",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-001",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "vehicle",
  "question": "List all vehicles I've had over time, from earliest to latest.",
  "gold_answer": "Zyvanta Sedan, Therwyn Compact, Xylorim Scooter",
  "n71_answer": "Zyvanta Sedan, Therwyn Compact, Xylorim Scooter",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-001",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "medication",
  "question": "What medication am I taking?",
  "gold_answer": "multivitamin",
  "n71_answer": "multivitamin",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-001",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Velthari Studio",
  "n71_answer": "Velthari Studio",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-001",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "Uncertain — previously 'tutoring session (weekly)', but health_condition changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-001",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "partner",
  "question": "What's my partner's name?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-001",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is pottery, plays rock climbing, and is part of book club.",
  "n71_answer": "trivia night Wednesdays, pottery (previous), rock climbing (current)",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-001",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "n71_answer": "\"We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.\"",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-002",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "1.5 hours",
  "n71_answer": "1.5 hours",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-002",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "favorite_restaurant",
  "question": "What's my favorite restaurant?",
  "gold_answer": "Zefiro's Kitchen",
  "n71_answer": "Zefiro's Kitchen",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-002",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "sleep_pattern",
  "question": "What's my sleep schedule?",
  "gold_answer": "varies by shift",
  "n71_answer": "I don't have that information.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-002",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "n71_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-002",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "vehicle",
  "question": "List all vehicles I've had over time, from earliest to latest.",
  "gold_answer": "Therwyn Compact, Dranthis Pickup, Xylorim Scooter",
  "n71_answer": "Therwyn Compact, Dranthis Pickup, Xylorim Scooter",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-002",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "10 min",
  "n71_answer": "10 min",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-002",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "favorite_restaurant",
  "question": "What's my favorite restaurant?",
  "gold_answer": "Uncertain — previously 'Zefiro's Kitchen', but residence_location changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-002",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "sleep_pattern",
  "question": "What's my sleep schedule?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none — explicitly removed",
  "judge_pass": true,
  "pass_type": "trivial",
  "failure_class": ""
 },
 {
  "episode": "pl-002",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is knitting, plays basketball, and is part of board game group.",
  "n71_answer": "basketball, knitting",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-002",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "n71_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-003",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Zorathel Peak Gym",
  "n71_answer": "Zorathel Peak Gym",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-003",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "30 min",
  "n71_answer": "30 min",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-003",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "food_allergy",
  "question": "What food allergies do I have?",
  "gold_answer": "eggs",
  "n71_answer": "eggs",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-003",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-003",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "partner",
  "question": "List all partners I've had over time, from earliest to latest.",
  "gold_answer": "Sarah, Emily, Alex",
  "n71_answer": "Emily, Sarah, Alex",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-003",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Kaelstrom Fitness",
  "n71_answer": "Kaelstrom Fitness",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-003",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "Uncertain — previously '30 min', but residence_location changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-003",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "food_allergy",
  "question": "What food allergies do I have?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-003",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is guitar, plays rock climbing, and is part of hiking club.",
  "n71_answer": "pickup basketball on Saturdays, rock climbing, guitar, photography, hiking, camping",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-003",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-004",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "no travel planned",
  "n71_answer": "European Road Trip 2023 from London to Nice via Harwich, Amsterdam, Luxembourg, Lucerne, Lake Como, Lyon, and Reims.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-004",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "living_arrangement",
  "question": "Who do I live with?",
  "gold_answer": "alone",
  "n71_answer": "Alone",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-004",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "social_gathering",
  "question": "What's my regular social meetup?",
  "gold_answer": "trivia night Wednesdays",
  "n71_answer": "trivia night on Wednesdays",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-004",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-004",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "vehicle",
  "question": "List all vehicles I've had over time, from earliest to latest.",
  "gold_answer": "Therwyn Compact, Quorvel EV, Xylorim Scooter",
  "n71_answer": "Therwyn Compact, Quorvel EV, Xylorim Scooter",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-004",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "island getaway",
  "n71_answer": "European Road Trip 2023",
  "judge_pass": false,
  "pass_type": "never_knew",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-004",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "living_arrangement",
  "question": "Who do I live with?",
  "gold_answer": "Uncertain — previously 'alone', but relationship_status changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-004",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "social_gathering",
  "question": "What's my regular social meetup?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "trivia night on Wednesdays",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-004",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is guitar, plays pickleball, and is part of hiking club.",
  "n71_answer": "Playing pickleball, playing guitar.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-004",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-005",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "financial_goal",
  "question": "What's my current financial goal?",
  "gold_answer": "paying off credit card debt",
  "n71_answer": "paying off credit card debt",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-005",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "skill_acquisition",
  "question": "What am I currently learning?",
  "gold_answer": "Python programming",
  "n71_answer": "Python programming",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-005",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "planned_purchase",
  "question": "What big purchase am I planning?",
  "gold_answer": "engagement ring",
  "n71_answer": "Engagement Ring Purchase Project",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-005",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "n71_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-005",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "media_consumption",
  "question": "List all media I've consumed over time, from earliest to latest.",
  "gold_answer": "The Veldrakon Spiral (TV), Whispers of Zarinthea (TV), The Quorath Codex (book)",
  "n71_answer": "The Veldrakon Spiral (TV), Whispers of Zarinthea (TV), The Quorath Codex (book)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-005",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "financial_goal",
  "question": "What's my current financial goal?",
  "gold_answer": "saving for kids' education fund",
  "n71_answer": "saving for kids' education fund",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-005",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "skill_acquisition",
  "question": "What am I currently learning?",
  "gold_answer": "Uncertain — previously 'Python programming', but school changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-005",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "planned_purchase",
  "question": "What big purchase am I planning?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-005",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is knitting, plays basketball, and is part of hiking club.",
  "n71_answer": "Unwinding with hobbies, spending time outdoors, knitting (scarves and blankets), playing basketball, baking pastries, DIY coffee enthusiast.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-005",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "n71_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-006",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "dermatologist (monthly)",
  "n71_answer": "Dermatologist (monthly)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-006",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Zorathel Peak Gym",
  "n71_answer": "I don't have that information.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-006",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "medication",
  "question": "What medication am I taking?",
  "gold_answer": "Dyntheral",
  "n71_answer": "Dyntheral",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-006",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "skill_acquisition",
  "question": "What am I currently learning?",
  "gold_answer": "Spanish",
  "n71_answer": "Spanish",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-006",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "n71_answer": "\"The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.\"",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-006",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "media_consumption",
  "question": "List all media I've consumed over time, from earliest to latest.",
  "gold_answer": "Whispers of Zarinthea (TV), Thyrelion (anime), a history documentary series",
  "n71_answer": "Whispers of Zarinthea, The Witcher, Shadow and Bone, Stranger Things, The Mandalorian",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-006",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "physical therapy (2x/week)",
  "n71_answer": "physical therapy (2x/week)",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-006",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Kaelstrom Fitness",
  "n71_answer": "Zorathel Peak Gym",
  "judge_pass": false,
  "pass_type": "never_knew",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-006",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "medication",
  "question": "What medication am I taking?",
  "gold_answer": "Uncertain — previously 'Dyntheral', but health_condition changed",
  "n71_answer": "Dyntheral",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-006",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "skill_acquisition",
  "question": "What am I currently learning?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-006",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is woodworking, plays soccer, and is part of volunteer group.",
  "n71_answer": "Watch Thyrelion (anime)",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-006",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "n71_answer": "\"The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.\"",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-007",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "dermatologist (monthly)",
  "n71_answer": "dermatologist (monthly)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-007",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "20 min",
  "n71_answer": "about 20 min",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-007",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "subscription_service",
  "question": "What am I subscribed to?",
  "gold_answer": "Dranthazon",
  "n71_answer": "Dranthazon, Netflix, Disney+, Hulu, Apple TV+, Peacock, HBO, Amazon Prime Video",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-007",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "n71_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-007",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "media_consumption",
  "question": "List all media I've consumed over time, from earliest to latest.",
  "gold_answer": "Thyrelion (anime), a history documentary series, Xelvari Dawn (book)",
  "n71_answer": "I don't have that information.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-007",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "therapy (weekly)",
  "n71_answer": "therapy (weekly)",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-007",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "Uncertain — previously '20 min', but residence_location changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-007",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "subscription_service",
  "question": "What am I subscribed to?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-007",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is cooking, plays running club, and is part of book club.",
  "n71_answer": "Cooking, running three times a week, watching history documentary series, attending trivia night on Wednesdays, spending time with partner Sarah, experimenting with new recipes, trying different cuisines, and decorating new home.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-007",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "n71_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-008",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "work_project",
  "question": "What project am I working on?",
  "gold_answer": "backend migration",
  "n71_answer": "backend migration",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-008",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "commute_method",
  "question": "How do I get to work?",
  "gold_answer": "walking",
  "n71_answer": "Walking",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-008",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "insurance",
  "question": "What insurance do I have?",
  "gold_answer": "interim coverage plan",
  "n71_answer": "interim coverage plan",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-008",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "favorite_restaurant",
  "question": "What's my favorite restaurant?",
  "gold_answer": "Zefiro's Kitchen",
  "n71_answer": "Zefiro's Kitchen",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-008",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "vehicle",
  "question": "What car do I drive?",
  "gold_answer": "Xylorim Scooter",
  "n71_answer": "Xylorim Scooter",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-008",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-008",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "subscription_service",
  "question": "List all subscription services I've had over time, from earliest to latest.",
  "gold_answer": "Xelvoria, meal kit (Kelvimeal), Dranthazon",
  "n71_answer": "Kelvimeal",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-008",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "work_project",
  "question": "What project am I working on?",
  "gold_answer": "marketing campaign",
  "n71_answer": "Interactive Dinosaur Encyclopedia Project",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-008",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "commute_method",
  "question": "How do I get to work?",
  "gold_answer": "bus",
  "n71_answer": "bus",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-008",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "insurance",
  "question": "What insurance do I have?",
  "gold_answer": "Uncertain — previously 'interim coverage plan', but employer changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-008",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "favorite_restaurant",
  "question": "What's my favorite restaurant?",
  "gold_answer": "Uncertain — previously 'Zefiro's Kitchen', but work_location changed",
  "n71_answer": "Zefiro's Kitchen",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-008",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "vehicle",
  "question": "What car do I drive?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "I don't have that information.",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-008",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is cooking, plays cycling, and is part of board game group.",
  "n71_answer": "- Pickup basketball Saturdays  \n- Hosting weekly game nights with friends  \n- Cycling  \n- Fishing  \n- Hunting",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-008",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-009",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "dietary_restriction",
  "question": "What dietary restrictions do I have?",
  "gold_answer": "low-sodium",
  "n71_answer": "low-sodium",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-009",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Zorathel Peak Gym",
  "n71_answer": "Zorathel Peak Gym",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-009",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "exercise_routine",
  "question": "What's my exercise routine?",
  "gold_answer": "pilates 3x/week",
  "n71_answer": "pilates 3x/week",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-009",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "tutoring session (weekly)",
  "n71_answer": "weekly tutoring sessions",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-009",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "n71_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-009",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "vehicle",
  "question": "List all vehicles I've had over time, from earliest to latest.",
  "gold_answer": "Zyvanta Sedan, Quorvel EV, Vorantel SUV",
  "n71_answer": "Zyvanta Sedan, Quorvel EV",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-009",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "dietary_restriction",
  "question": "What dietary restrictions do I have?",
  "gold_answer": "no alcohol",
  "n71_answer": "no alcohol",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-009",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Crysthene Pool",
  "n71_answer": "Crysthene Pool",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-009",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "exercise_routine",
  "question": "What's my exercise routine?",
  "gold_answer": "Uncertain — previously 'pilates 3x/week', but health_condition changed",
  "n71_answer": "yoga 2x/week",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-009",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-009",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is cooking, plays running club, and is part of book club.",
  "n71_answer": "running club, cooking",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-009",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "n71_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-010",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "mountain ski trip",
  "n71_answer": "Mountain ski trip",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-010",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "living_arrangement",
  "question": "Who do I live with?",
  "gold_answer": "with roommate",
  "n71_answer": "roommate",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-010",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "social_gathering",
  "question": "What's my regular social meetup?",
  "gold_answer": "brunch with college friends monthly",
  "n71_answer": "brunch with college friends monthly",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-010",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-010",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "partner",
  "question": "List all partners I've had over time, from earliest to latest.",
  "gold_answer": "Alex, Jordan, Taylor",
  "n71_answer": "I don't have that information.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-010",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "Zynthera Bay beach trip (1 week)",
  "n71_answer": "Zynthera Bay beach trip (1 week)",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-010",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "living_arrangement",
  "question": "Who do I live with?",
  "gold_answer": "Uncertain — previously 'with roommate', but relationship_status changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-010",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "social_gathering",
  "question": "What's my regular social meetup?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "brunch with college friends monthly",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-010",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is video games, plays pickleball, and is part of board game group.",
  "n71_answer": "Listening to true crime podcasts, playing video games (RPGs), playing pickleball, listening to indie folk music on Spotify",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-010",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-011",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "dermatologist (monthly)",
  "n71_answer": "Dermatologist (monthly)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-011",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "none",
  "n71_answer": "Pyravar CrossFit",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-011",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "medication",
  "question": "What medication am I taking?",
  "gold_answer": "Korathril",
  "n71_answer": "Korathril",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-011",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "favorite_restaurant",
  "question": "What's my favorite restaurant?",
  "gold_answer": "Zefiro's Kitchen",
  "n71_answer": "Zefiro's Kitchen",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-011",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "n71_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-011",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "subscription_service",
  "question": "List all subscription services I've had over time, from earliest to latest.",
  "gold_answer": "Xelvoria, Daily Journal subscription, Pyrethon Plus",
  "n71_answer": "Daily Journal, Pyrethon Plus",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-011",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "therapy (weekly)",
  "n71_answer": "therapy (weekly)",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-011",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Pyravar CrossFit",
  "n71_answer": "walking daily",
  "judge_pass": false,
  "pass_type": "never_knew",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-011",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "medication",
  "question": "What medication am I taking?",
  "gold_answer": "Uncertain — previously 'Korathril', but health_condition changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-011",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "favorite_restaurant",
  "question": "What's my favorite restaurant?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "Zefiro's Kitchen",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-011",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is gardening, plays running club, and is part of wine tasting club.",
  "n71_answer": "Gardening, running club, trivia night Wednesdays",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-011",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "n71_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-012",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "living_arrangement",
  "question": "Who do I live with?",
  "gold_answer": "alone",
  "n71_answer": "Alone",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-012",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "housing_type",
  "question": "What type of home do I live in?",
  "gold_answer": "single-family house",
  "n71_answer": "single-family house",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-012",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "physical therapy (2x/week)",
  "n71_answer": "physical therapy 2 times a week",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-012",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "n71_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-012",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "vehicle",
  "question": "List all vehicles I've had over time, from earliest to latest.",
  "gold_answer": "Zyvanta Sedan, Dranthis Pickup, Vorantel SUV",
  "n71_answer": "Zyvanta Sedan, Dranthis Pickup, Vorantel SUV",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-012",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "living_arrangement",
  "question": "Who do I live with?",
  "gold_answer": "with partner",
  "n71_answer": "partner",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-012",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "housing_type",
  "question": "What type of home do I live in?",
  "gold_answer": "Uncertain — previously 'single-family house', but relationship_status changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-012",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-012",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is gardening, plays rock climbing, and is part of volunteer group.",
  "n71_answer": "rock climbing, pickup basketball on Saturdays, gardening",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-012",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "n71_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-013",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "work_schedule",
  "question": "What's my work schedule?",
  "gold_answer": "shift work (night)",
  "n71_answer": "shift work (night)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-013",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "40 min",
  "n71_answer": "40 min",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-013",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "work_project",
  "question": "What project am I working on?",
  "gold_answer": "infrastructure upgrade",
  "n71_answer": "infrastructure upgrade",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-013",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Pyravar CrossFit",
  "n71_answer": "Pyravar CrossFit",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-013",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "sleep_pattern",
  "question": "What's my sleep schedule?",
  "gold_answer": "10pm-6am",
  "n71_answer": "10pm to 6am",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-013",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-013",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "media_consumption",
  "question": "List all media I've consumed over time, from earliest to latest.",
  "gold_answer": "The Quorath Codex (book), Thyrelion (anime), Xelvari Dawn (book)",
  "n71_answer": "The Quorath Codex, Xelvari Dawn, Thyrelion (anime)",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-013",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "work_schedule",
  "question": "What's my work schedule?",
  "gold_answer": "4-day work week",
  "n71_answer": "4-day work week",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-013",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "10 min",
  "n71_answer": "around 10 min",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-013",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "work_project",
  "question": "What project am I working on?",
  "gold_answer": "Uncertain — previously 'infrastructure upgrade', but employer changed",
  "n71_answer": "infrastructure upgrade",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-013",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Uncertain — previously 'Pyravar CrossFit', but work_location changed",
  "n71_answer": "Lorwen & Sage",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-013",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "sleep_pattern",
  "question": "What's my sleep schedule?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "10pm to 6am",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-013",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is photography, plays golf, and is part of book club.",
  "n71_answer": "Photography, golf, trivia night on Wednesdays, solo hiking near Bay Area this weekend",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-013",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-014",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "dentist (every 6 months)",
  "n71_answer": "dentist (every 6 months)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-014",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Pyravar CrossFit",
  "n71_answer": "Pyravar CrossFit",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-014",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "medication",
  "question": "What medication am I taking?",
  "gold_answer": "Velnorath",
  "n71_answer": "Velnorath",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-014",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "insurance",
  "question": "What insurance do I have?",
  "gold_answer": "dental + vision bundle",
  "n71_answer": "dental and vision bundle",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-014",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "n71_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-014",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "subscription_service",
  "question": "List all subscription services I've had over time, from earliest to latest.",
  "gold_answer": "Daily Journal subscription, Dranthazon, Pyrethon Plus",
  "n71_answer": "Dranthazon",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-014",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "dermatologist (monthly)",
  "n71_answer": "dermatologist (monthly)",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-014",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Kaelstrom Fitness",
  "n71_answer": "Kaelstrom Fitness",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-014",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "medication",
  "question": "What medication am I taking?",
  "gold_answer": "Uncertain — previously 'Velnorath', but health_condition changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-014",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "insurance",
  "question": "What insurance do I have?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-014",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is guitar, plays basketball, and is part of running club.",
  "n71_answer": "- Watch history documentary series  \n- Play basketball  \n- Play guitar  \n- Watch musicals (e.g., La La Land, Hamilton)  \n- Attend game night with neighbors",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-014",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "n71_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-015",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "living_arrangement",
  "question": "Who do I live with?",
  "gold_answer": "with parents",
  "n71_answer": "parents",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-015",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "family_event",
  "question": "What family event do I have coming up?",
  "gold_answer": "sibling's wedding",
  "n71_answer": "sibling's wedding",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-015",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "dietary_restriction",
  "question": "What dietary restrictions do I have?",
  "gold_answer": "dairy-free",
  "n71_answer": "dairy-free",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-015",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-015",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "partner",
  "question": "List all partners I've had over time, from earliest to latest.",
  "gold_answer": "Jessica, Mike, James",
  "n71_answer": "I don't have that information.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-015",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "living_arrangement",
  "question": "Who do I live with?",
  "gold_answer": "with sibling",
  "n71_answer": "sibling",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-015",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "family_event",
  "question": "What family event do I have coming up?",
  "gold_answer": "Uncertain — previously 'sibling's wedding', but relationship_status changed",
  "n71_answer": "sibling's wedding",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-015",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "dietary_restriction",
  "question": "What dietary restrictions do I have?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-015",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is photography, plays running club, and is part of hiking club.",
  "n71_answer": "pickup basketball on Saturdays",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-015",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-016",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "cross-country road trip (2 weeks)",
  "n71_answer": "Cross-country road trip (2 weeks)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-016",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "financial_goal",
  "question": "What's my current financial goal?",
  "gold_answer": "paying off student loans",
  "n71_answer": "paying off student loans",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-016",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "health_condition",
  "question": "What health condition do I have?",
  "gold_answer": "lactose intolerance",
  "n71_answer": "lactose intolerance",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-016",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "n71_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-016",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "vehicle",
  "question": "List all vehicles I've had over time, from earliest to latest.",
  "gold_answer": "Therwyn Compact, Dranthis Pickup, Vorantel SUV",
  "n71_answer": "Therwyn Compact, Dranthis Pickup, Vorantel SUV, Subaru Outback",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-016",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "Zynthera Bay beach trip (1 week)",
  "n71_answer": "PCH Road Trip",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-016",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "financial_goal",
  "question": "What's my current financial goal?",
  "gold_answer": "Uncertain — previously 'paying off student loans', but relationship_status changed",
  "n71_answer": "paying off student loans",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-016",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "health_condition",
  "question": "What health condition do I have?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-016",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is pottery, plays golf, and is part of board game group.",
  "n71_answer": "Play golf, pottery, game night with neighbors",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-016",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "n71_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-017",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "physical therapy (2x/week)",
  "n71_answer": "physical therapy (2x/week)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-017",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Thandrel YMCA",
  "n71_answer": "Thandrel YMCA",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-017",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "health_condition",
  "question": "What health condition do I have?",
  "gold_answer": "high blood pressure",
  "n71_answer": "high blood pressure",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-017",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-017",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "vehicle",
  "question": "List all vehicles I've had over time, from earliest to latest.",
  "gold_answer": "Zyvanta Sedan, Therwyn Compact, Vorantel SUV",
  "n71_answer": "Zyvanta Sedan, Therwyn Compact",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-017",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "chiropractor (biweekly)",
  "n71_answer": "chiropractor (biweekly)",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-017",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Uncertain — previously 'Thandrel YMCA', but residence_location changed",
  "n71_answer": "Thandrel YMCA",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-017",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "health_condition",
  "question": "What health condition do I have?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none — explicitly removed",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-017",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is photography, plays running club, and is part of wine tasting club.",
  "n71_answer": "Watercolor painting, photography (landscape photography), cooking a lot at home lately, experimenting with different types of hot peppers, regular brunch with college friends monthly, watching \"Whispers of Zarinthea\" (TV).",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-017",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-018",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "commute_method",
  "question": "How do I get to work?",
  "gold_answer": "walking",
  "n71_answer": "walking to work",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-018",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "skill_acquisition",
  "question": "What am I currently learning?",
  "gold_answer": "Spanish",
  "n71_answer": "Spanish",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-018",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "planned_purchase",
  "question": "What big purchase am I planning?",
  "gold_answer": "stand-up desk",
  "n71_answer": "stand-up desk",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-018",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "n71_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-018",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "vehicle",
  "question": "List all vehicles I've had over time, from earliest to latest.",
  "gold_answer": "Zyvanta Sedan, Therwyn Compact, Dranthis Pickup",
  "n71_answer": "Zyvanta Sedan -> Therwyn Compact",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-018",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "commute_method",
  "question": "How do I get to work?",
  "gold_answer": "commuter rail",
  "n71_answer": "commuter rail",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-018",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "skill_acquisition",
  "question": "What am I currently learning?",
  "gold_answer": "Uncertain — previously 'Spanish', but school changed",
  "n71_answer": "JD",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-018",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "planned_purchase",
  "question": "What big purchase am I planning?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-018",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is pottery, plays volleyball, and is part of running club.",
  "n71_answer": "Volleyball",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-018",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "n71_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-019",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "financial_goal",
  "question": "What's my current financial goal?",
  "gold_answer": "maxing out retirement fund",
  "n71_answer": "maxing out retirement fund",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-019",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "housing_type",
  "question": "What type of home do I live in?",
  "gold_answer": "single-family house",
  "n71_answer": "single-family house",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-019",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "social_gathering",
  "question": "What's my regular social meetup?",
  "gold_answer": "pickup basketball Saturdays",
  "n71_answer": "pickup basketball on Saturdays",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-019",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-019",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "media_consumption",
  "question": "List all media I've consumed over time, from earliest to latest.",
  "gold_answer": "The Quorath Codex (book), a daily news podcast, Thyrelion (anime)",
  "n71_answer": "I don't have that information.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-019",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "financial_goal",
  "question": "What's my current financial goal?",
  "gold_answer": "saving for wedding",
  "n71_answer": "saving for wedding",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-019",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "housing_type",
  "question": "What type of home do I live in?",
  "gold_answer": "Uncertain — previously 'single-family house', but relationship_status changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-019",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "social_gathering",
  "question": "What's my regular social meetup?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "pickup basketball on Saturdays",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-019",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "travel_plan,upcoming_event,family_event",
  "question": "What do I have coming up? List everything you know.",
  "gold_answer": "The user is planning hiking trip out west, has college reunion coming up, and has Thanksgiving gathering.",
  "n71_answer": "Thanksgiving gathering, college reunion",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-019",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-020",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "20 min",
  "n71_answer": "about 20 min",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-020",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "dentist (every 6 months)",
  "n71_answer": "dentist (every 6 months)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-020",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "job_title",
  "question": "What is my job title?",
  "gold_answer": "team lead",
  "n71_answer": "team lead",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-020",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "n71_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-020",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "vehicle",
  "question": "List all vehicles I've had over time, from earliest to latest.",
  "gold_answer": "Zyvanta Sedan, Therwyn Compact, Xylorim Scooter",
  "n71_answer": "Zyvanta Sedan, Therwyn Compact, Xylorim Scooter",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-020",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "30 min",
  "n71_answer": "around 30 min",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-020",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "Uncertain — previously 'dentist (every 6 months)', but residence_location changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-020",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "job_title",
  "question": "What is my job title?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-020",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is woodworking, plays golf, and is part of running club.",
  "n71_answer": "playing golf, woodworking",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-020",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "n71_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-021",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "work_schedule",
  "question": "What's my work schedule?",
  "gold_answer": "flexible hours",
  "n71_answer": "flexible hours",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-021",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Pyravar CrossFit",
  "n71_answer": "Pyravar CrossFit",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-021",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "insurance",
  "question": "What insurance do I have?",
  "gold_answer": "marketplace health plan",
  "n71_answer": "marketplace health plan",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-021",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "favorite_restaurant",
  "question": "What's my favorite restaurant?",
  "gold_answer": "Kinthar Grill",
  "n71_answer": "Kinthar Grill",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-021",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "subscription_service",
  "question": "What am I subscribed to?",
  "gold_answer": "meal kit (Kelvimeal)",
  "n71_answer": "Kelvimeal",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-021",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "n71_answer": "\"We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.\"",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-021",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "vehicle",
  "question": "List all vehicles I've had over time, from earliest to latest.",
  "gold_answer": "Therwyn Compact, Vorantel SUV, Xylorim Scooter",
  "n71_answer": "Vorantel SUV -> Therwyn Compact -> Xylorim Scooter",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-021",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "work_schedule",
  "question": "What's my work schedule?",
  "gold_answer": "hybrid (3 office + 2 remote)",
  "n71_answer": "hybrid (3 office + 2 remote)",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-021",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Velthari Studio",
  "n71_answer": "Velthari Studio",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-021",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "insurance",
  "question": "What insurance do I have?",
  "gold_answer": "Uncertain — previously 'marketplace health plan', but employer changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-021",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "favorite_restaurant",
  "question": "What's my favorite restaurant?",
  "gold_answer": "Uncertain — previously 'Kinthar Grill', but work_location changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-021",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "subscription_service",
  "question": "What am I subscribed to?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-021",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "commute_method,commute_duration,work_location",
  "question": "Tell me everything you know about my commute.",
  "gold_answer": "The user commutes by subway for 50 min to Belvona Harbor.",
  "n71_answer": "- Commute duration: 50 minutes  \n- Commute method: Xylorim Scooter (latest)  \n- Previous commute method: subway  \n- Commute experience: previously found subway convenient but with occasional delays  \n- Vehicle used: Therwyn Compact (current)  \n- Vehicle model: Vorantel SUV (current)",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-021",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "n71_answer": "\"We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.\"",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-022",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "financial_goal",
  "question": "What's my current financial goal?",
  "gold_answer": "saving for kids' education fund",
  "n71_answer": "saving for kids' education fund",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-022",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "work_schedule",
  "question": "What's my work schedule?",
  "gold_answer": "9-to-5 fixed",
  "n71_answer": "9-to-5 fixed",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-022",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "residence_location",
  "question": "Where do I live?",
  "gold_answer": "Belvona Harbor",
  "n71_answer": "Belvona Harbor",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-022",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-022",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "partner",
  "question": "List all partners I've had over time, from earliest to latest.",
  "gold_answer": "James, Alex, Taylor",
  "n71_answer": "Alex",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-022",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "financial_goal",
  "question": "What's my current financial goal?",
  "gold_answer": "saving for wedding",
  "n71_answer": "saving for kids' education fund",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-022",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "work_schedule",
  "question": "What's my work schedule?",
  "gold_answer": "Uncertain — previously '9-to-5 fixed', but school changed",
  "n71_answer": "9-to-5 fixed",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-022",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "residence_location",
  "question": "Where do I live?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "None",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-022",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is woodworking, plays tennis, and is part of wine tasting club.",
  "n71_answer": "Stand-up comedy open mics, tennis, woodworking, wine tasting club, potluck dinner group, learning Japanese.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-022",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-023",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "sleep_pattern",
  "question": "What's my sleep schedule?",
  "gold_answer": "midnight-8am",
  "n71_answer": "midnight-8am",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-023",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Kaelstrom Fitness",
  "n71_answer": "Kaelstrom Fitness",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-023",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "dentist (every 6 months)",
  "n71_answer": "dentist (every 6 months)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-023",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "employer",
  "question": "Where do I work?",
  "gold_answer": "Caeltura Systems",
  "n71_answer": "Caeltura Systems",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-023",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-023",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "vehicle",
  "question": "List all vehicles I've had over time, from earliest to latest.",
  "gold_answer": "Zyvanta Sedan, Quorvel EV, Xylorim Scooter",
  "n71_answer": "Quorvel EV, Zyvanta Sedan, Xylorim Scooter",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-023",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "sleep_pattern",
  "question": "What's my sleep schedule?",
  "gold_answer": "irregular",
  "n71_answer": "irregular",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-023",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "home gym",
  "n71_answer": "home gym",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-023",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "Uncertain — previously 'dentist (every 6 months)', but health_condition changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-023",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "employer",
  "question": "Where do I work?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "None",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-023",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is video games, plays basketball, and is part of book club.",
  "n71_answer": "Playing basketball, playing strategy games.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-023",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-024",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "housing_type",
  "question": "What type of home do I live in?",
  "gold_answer": "condo",
  "n71_answer": "condo",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-024",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "family_event",
  "question": "What family event do I have coming up?",
  "gold_answer": "family reunion",
  "n71_answer": "family reunion",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-024",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "Zynthera Bay beach trip (1 week)",
  "n71_answer": "Zynthera Bay beach trip for 1 week",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-024",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-024",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "partner",
  "question": "List all partners I've had over time, from earliest to latest.",
  "gold_answer": "Sarah, Jessica, Taylor",
  "n71_answer": "Sarah, Jessica, Taylor",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-024",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "housing_type",
  "question": "What type of home do I live in?",
  "gold_answer": "townhouse",
  "n71_answer": "townhouse",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-024",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "family_event",
  "question": "What family event do I have coming up?",
  "gold_answer": "Uncertain — previously 'family reunion', but relationship_status changed",
  "n71_answer": "family reunion",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-024",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "Zynthera Bay beach trip (1 week)",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-024",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is video games, plays basketball, and is part of hiking club.",
  "n71_answer": "game night with neighbors, pickup games with friends, video games, attending music festivals, hiking, basketball",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-024",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-025",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "job_title",
  "question": "What is my job title?",
  "gold_answer": "marketing lead",
  "n71_answer": "marketing lead",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-025",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "commute_method",
  "question": "How do I get to work?",
  "gold_answer": "walking",
  "n71_answer": "walking to work",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-025",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "work_schedule",
  "question": "What's my work schedule?",
  "gold_answer": "hybrid (3 office + 2 remote)",
  "n71_answer": "hybrid (3 office + 2 remote)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-025",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "favorite_restaurant",
  "question": "What's my favorite restaurant?",
  "gold_answer": "Velmara Taqueria",
  "n71_answer": "Velmara Taqueria",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-025",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "relationship_status",
  "question": "What's my relationship status?",
  "gold_answer": "in a relationship",
  "n71_answer": "in a relationship",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-025",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-025",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "subscription_service",
  "question": "List all subscription services I've had over time, from earliest to latest.",
  "gold_answer": "Xelvoria, meal kit (Kelvimeal), Pyrethon Plus",
  "n71_answer": "Pyrethon Plus",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-025",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "job_title",
  "question": "What is my job title?",
  "gold_answer": "staff engineer",
  "n71_answer": "marketing lead",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-025",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "commute_method",
  "question": "How do I get to work?",
  "gold_answer": "bus",
  "n71_answer": "walking to work",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-025",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "work_schedule",
  "question": "What's my work schedule?",
  "gold_answer": "Uncertain — previously 'hybrid (3 office + 2 remote)', but employer changed",
  "n71_answer": "hybrid (3 office + 2 remote)",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-025",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "favorite_restaurant",
  "question": "What's my favorite restaurant?",
  "gold_answer": "Uncertain — previously 'Velmara Taqueria', but work_location changed",
  "n71_answer": "Velmara Taqueria",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-025",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "relationship_status",
  "question": "What's my relationship status?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-025",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is woodworking, plays soccer, and is part of book club.",
  "n71_answer": "soccer, woodworking",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-025",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-026",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "commute_method",
  "question": "How do I get to work?",
  "gold_answer": "bike",
  "n71_answer": "By bike",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-026",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Thandrel YMCA",
  "n71_answer": "Thandrel YMCA",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-026",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "tutoring session (weekly)",
  "n71_answer": "weekly tutoring session",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-026",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "n71_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-026",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "subscription_service",
  "question": "List all subscription services I've had over time, from earliest to latest.",
  "gold_answer": "Thyranote, meal kit (Kelvimeal), Pyrethon Plus",
  "n71_answer": "Thyranote -> Pyrethon Plus",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-026",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "commute_method",
  "question": "How do I get to work?",
  "gold_answer": "subway",
  "n71_answer": "Subway",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-026",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Uncertain — previously 'Thandrel YMCA', but residence_location changed",
  "n71_answer": "Thandrel YMCA",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-026",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-026",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is guitar, plays pickleball, and is part of cycling club.",
  "n71_answer": "engage in activities that help unwind, pursue interests, cycling, guitar, trivia night on Wednesdays",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-026",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "n71_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-027",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "10 min",
  "n71_answer": "10 min",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-027",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "physical therapy (2x/week)",
  "n71_answer": "physical therapy (2x/week)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-027",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "diet_preference",
  "question": "What's my diet?",
  "gold_answer": "keto",
  "n71_answer": "keto",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-027",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "n71_answer": "\"In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.\"",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-027",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "subscription_service",
  "question": "List all subscription services I've had over time, from earliest to latest.",
  "gold_answer": "Xelvoria, Thyranote, Zorvista Premium",
  "n71_answer": "Xelvoria, Zorvista Premium",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-027",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "15 min",
  "n71_answer": "10 min",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-027",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "Uncertain — previously 'physical therapy (2x/week)', but residence_location changed",
  "n71_answer": "physical therapy (2x/week)",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-027",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "diet_preference",
  "question": "What's my diet?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-027",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is photography, plays golf, and is part of hiking club.",
  "n71_answer": "Reading fiction short stories",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-027",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "n71_answer": "\"In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.\"",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-028",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "job_title",
  "question": "What is my job title?",
  "gold_answer": "staff engineer",
  "n71_answer": "staff engineer",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-028",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "10 min",
  "n71_answer": "10 min",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-028",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "insurance",
  "question": "What insurance do I have?",
  "gold_answer": "interim coverage plan",
  "n71_answer": "interim coverage plan",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-028",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "favorite_restaurant",
  "question": "What's my favorite restaurant?",
  "gold_answer": "Velmara Taqueria",
  "n71_answer": "Velmara Taqueria",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-028",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "exercise_routine",
  "question": "What's my exercise routine?",
  "gold_answer": "walking daily",
  "n71_answer": "walking daily",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-028",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-028",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "subscription_service",
  "question": "List all subscription services I've had over time, from earliest to latest.",
  "gold_answer": "Zorvista Premium, Dranthazon, Pyrethon Plus",
  "n71_answer": "Zorvista Premium",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-028",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "job_title",
  "question": "What is my job title?",
  "gold_answer": "designer",
  "n71_answer": "staff engineer",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-028",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "20 min",
  "n71_answer": "10 min",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-028",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "insurance",
  "question": "What insurance do I have?",
  "gold_answer": "Uncertain — previously 'interim coverage plan', but employer changed",
  "n71_answer": "interim coverage plan",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-028",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "favorite_restaurant",
  "question": "What's my favorite restaurant?",
  "gold_answer": "Uncertain — previously 'Velmara Taqueria', but work_location changed",
  "n71_answer": "Velmara Taqueria",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-028",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "exercise_routine",
  "question": "What's my exercise routine?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-028",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is gardening, plays basketball, and is part of hiking club.",
  "n71_answer": "hiking, gardening, basketball",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-028",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-029",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "work_schedule",
  "question": "What's my work schedule?",
  "gold_answer": "10-to-7 fixed",
  "n71_answer": "10-to-7 fixed",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-029",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "financial_goal",
  "question": "What's my current financial goal?",
  "gold_answer": "maxing out retirement fund",
  "n71_answer": "maxing out retirement fund",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-029",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "sleep_pattern",
  "question": "What's my sleep schedule?",
  "gold_answer": "11pm-7am",
  "n71_answer": "11pm to 7am",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-029",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "n71_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-029",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "media_consumption",
  "question": "List all media I've consumed over time, from earliest to latest.",
  "gold_answer": "Whispers of Zarinthea (TV), Thyrelion (anime), Xelvari Dawn (book)",
  "n71_answer": "Whispers of Zarinthea (TV) (2023-03-03) -> Xelvari Dawn (book) (2023-03-15)",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-029",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "work_schedule",
  "question": "What's my work schedule?",
  "gold_answer": "hybrid (3 office + 2 remote)",
  "n71_answer": "10-to-7 fixed",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-029",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "financial_goal",
  "question": "What's my current financial goal?",
  "gold_answer": "Uncertain — previously 'maxing out retirement fund', but school changed",
  "n71_answer": "maxing out retirement fund",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-029",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "sleep_pattern",
  "question": "What's my sleep schedule?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "11pm to 7am",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-029",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is video games, plays volleyball, and is part of wine tasting club.",
  "n71_answer": "Volleyball, reading fiction, reading news online during lunch break, video games, visiting Bali, experiencing Indonesian culture and landscapes.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-029",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "n71_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-030",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "work_schedule",
  "question": "What's my work schedule?",
  "gold_answer": "shift work (day)",
  "n71_answer": "shift work (day)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-030",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "10 min",
  "n71_answer": "about 10 min",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-030",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "insurance",
  "question": "What insurance do I have?",
  "gold_answer": "employer + supplemental",
  "n71_answer": "employer + supplemental",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-030",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "commute_method",
  "question": "How do I get to work?",
  "gold_answer": "remote (no commute)",
  "n71_answer": "About 10 min commute.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-030",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "financial_goal",
  "question": "What's my current financial goal?",
  "gold_answer": "paying off student loans",
  "n71_answer": "paying off student loans",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-030",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "n71_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-030",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "media_consumption",
  "question": "List all media I've consumed over time, from earliest to latest.",
  "gold_answer": "The Quorath Codex (book), Thyrelion (anime), Xelvari Dawn (book)",
  "n71_answer": "video games -> pickleball -> Thyrelion (anime) -> Xelvari Dawn (book)",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-030",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "work_schedule",
  "question": "What's my work schedule?",
  "gold_answer": "9-to-5 fixed",
  "n71_answer": "shift work (day)",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-030",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "10 min",
  "n71_answer": "about 10 min",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-030",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "insurance",
  "question": "What insurance do I have?",
  "gold_answer": "Uncertain — previously 'employer + supplemental', but employer changed",
  "n71_answer": "employer + supplemental",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-030",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "commute_method",
  "question": "How do I get to work?",
  "gold_answer": "Uncertain — previously 'remote (no commute)', but work_location changed",
  "n71_answer": "About 10 min commute",
  "judge_pass": false,
  "pass_type": "never_knew",
  "failure_class": ""
 },
 {
  "episode": "pl-030",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "financial_goal",
  "question": "What's my current financial goal?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-030",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is video games, plays pickleball, and is part of cycling club.",
  "n71_answer": "Playing pickleball, cycling, video games, reading books, watching anime, attending game night with neighbors.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-030",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "n71_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-031",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "tutoring session (weekly)",
  "n71_answer": "tutoring session (weekly)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-031",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Pyravar CrossFit",
  "n71_answer": "Pyravar CrossFit",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-031",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "sleep_pattern",
  "question": "What's my sleep schedule?",
  "gold_answer": "midnight-8am",
  "n71_answer": "midnight-8am",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-031",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "1.5 hours",
  "n71_answer": "1.5 hours",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-031",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-031",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "media_consumption",
  "question": "List all media I've consumed over time, from earliest to latest.",
  "gold_answer": "a true crime podcast, The Quorath Codex (book), Xelvari Dawn (book)",
  "n71_answer": "cooking, cycling, Xelvari Dawn, true crime podcast",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-031",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "dentist (every 6 months)",
  "n71_answer": "dentist (every 6 months)",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-031",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "home gym",
  "n71_answer": "home gym",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-031",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "sleep_pattern",
  "question": "What's my sleep schedule?",
  "gold_answer": "Uncertain — previously 'midnight-8am', but health_condition changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-031",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "I don't have that information.",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-031",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is cooking, plays cycling, and is part of volunteer group.",
  "n71_answer": "cycling, cooking, game night with neighbors",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-031",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-032",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "family_event",
  "question": "What family event do I have coming up?",
  "gold_answer": "baby shower",
  "n71_answer": "baby shower",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-032",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "living_arrangement",
  "question": "Who do I live with?",
  "gold_answer": "with roommate",
  "n71_answer": "roommate",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-032",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "planned_purchase",
  "question": "What big purchase am I planning?",
  "gold_answer": "new laptop",
  "n71_answer": "a new laptop",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-032",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-032",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "subscription_service",
  "question": "List all subscription services I've had over time, from earliest to latest.",
  "gold_answer": "Thyranote, Zorvista Premium, meal kit (Kelvimeal)",
  "n71_answer": "Thyranote, Zorvista Premium, Kelvimeal",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-032",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "family_event",
  "question": "What family event do I have coming up?",
  "gold_answer": "parent's birthday",
  "n71_answer": "parent's birthday",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-032",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "living_arrangement",
  "question": "Who do I live with?",
  "gold_answer": "Uncertain — previously 'with roommate', but relationship_status changed",
  "n71_answer": "Unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-032",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "planned_purchase",
  "question": "What big purchase am I planning?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-032",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is reading, plays basketball, and is part of running club.",
  "n71_answer": "Reading, running club, playing basketball (guard position)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-032",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-033",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "insurance",
  "question": "What insurance do I have?",
  "gold_answer": "dental + vision bundle",
  "n71_answer": "dental + vision bundle",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-033",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "favorite_restaurant",
  "question": "What's my favorite restaurant?",
  "gold_answer": "Lorwen & Sage",
  "n71_answer": "Lorwen & Sage",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-033",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "work_project",
  "question": "What project am I working on?",
  "gold_answer": "infrastructure upgrade",
  "n71_answer": "infrastructure upgrade",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-033",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "10 min",
  "n71_answer": "10 min",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-033",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "diet_preference",
  "question": "What's my diet?",
  "gold_answer": "vegetarian",
  "n71_answer": "vegetarian",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-033",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "n71_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-033",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "media_consumption",
  "question": "List all media I've consumed over time, from earliest to latest.",
  "gold_answer": "The Veldrakon Spiral (TV), a daily news podcast, Thyrelion (anime)",
  "n71_answer": "Thyrelion, daily news podcast",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-033",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "insurance",
  "question": "What insurance do I have?",
  "gold_answer": "employer-provided health plan",
  "n71_answer": "employer-provided health plan",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-033",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "favorite_restaurant",
  "question": "What's my favorite restaurant?",
  "gold_answer": "Velmara Taqueria",
  "n71_answer": "Velmara Taqueria",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-033",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "work_project",
  "question": "What project am I working on?",
  "gold_answer": "Uncertain — previously 'infrastructure upgrade', but employer changed",
  "n71_answer": "Our Project",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-033",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "Uncertain — previously '10 min', but work_location changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-033",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "diet_preference",
  "question": "What's my diet?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "vegetarian",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-033",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is pottery, plays pickleball, and is part of wine tasting club.",
  "n71_answer": "pickleball, grilling, experimenting with BBQ sauce flavors, making homemade strawberry jam, pottery",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-033",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "n71_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-034",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "exercise_routine",
  "question": "What's my exercise routine?",
  "gold_answer": "running 3x/week",
  "n71_answer": "running 3x/week",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-034",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Kaelstrom Fitness",
  "n71_answer": "Kaelstrom Fitness",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-034",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "medication",
  "question": "What medication am I taking?",
  "gold_answer": "Korathril",
  "n71_answer": "Korathril",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-034",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "work_location",
  "question": "Where is my office?",
  "gold_answer": "remote",
  "n71_answer": "remote",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-034",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-034",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "subscription_service",
  "question": "List all subscription services I've had over time, from earliest to latest.",
  "gold_answer": "Xelvoria, Thyranote, Zorvista Premium",
  "n71_answer": "Zorvista Premium",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-034",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "exercise_routine",
  "question": "What's my exercise routine?",
  "gold_answer": "weight training 4x/week",
  "n71_answer": "weight training 4x/week",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-034",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Pyravar CrossFit",
  "n71_answer": "Pyravar CrossFit",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-034",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "medication",
  "question": "What medication am I taking?",
  "gold_answer": "Uncertain — previously 'Korathril', but health_condition changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-034",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "work_location",
  "question": "Where is my office?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-034",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is woodworking, plays cycling, and is part of wine tasting club.",
  "n71_answer": "cycling, woodworking, pickup basketball on Saturdays",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-034",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-035",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "housing_type",
  "question": "What type of home do I live in?",
  "gold_answer": "dormitory",
  "n71_answer": "dormitory",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-035",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "family_event",
  "question": "What family event do I have coming up?",
  "gold_answer": "family reunion",
  "n71_answer": "Family reunion",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-035",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "employer",
  "question": "Where do I work?",
  "gold_answer": "Zypherix Labs",
  "n71_answer": "Zypherix Labs",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-035",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "n71_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-035",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "subscription_service",
  "question": "List all subscription services I've had over time, from earliest to latest.",
  "gold_answer": "Zorvista Premium, meal kit (Kelvimeal), Pyrethon Plus",
  "n71_answer": "Kelvimeal, Zorvista Premium, Pyrethon Plus",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-035",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "housing_type",
  "question": "What type of home do I live in?",
  "gold_answer": "two-bedroom apartment",
  "n71_answer": "two-bedroom apartment",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-035",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "family_event",
  "question": "What family event do I have coming up?",
  "gold_answer": "Uncertain — previously 'family reunion', but relationship_status changed",
  "n71_answer": "Family Reunion",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-035",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "employer",
  "question": "Where do I work?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "You don't have that information.",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-035",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is reading, plays running club, and is part of volunteer group.",
  "n71_answer": "Reading, running club",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-035",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "n71_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-036",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "30 min",
  "n71_answer": "30 min",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-036",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Zorathel Peak Gym",
  "n71_answer": "Zorathel Peak Gym",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-036",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "planned_purchase",
  "question": "What big purchase am I planning?",
  "gold_answer": "electric car",
  "n71_answer": "electric car",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-036",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-036",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "partner",
  "question": "List all partners I've had over time, from earliest to latest.",
  "gold_answer": "James, Jordan, Taylor",
  "n71_answer": "James, Jordan, Taylor",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-036",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "10 min",
  "n71_answer": "10 min",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-036",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Uncertain — previously 'Zorathel Peak Gym', but residence_location changed",
  "n71_answer": "Zorathel Peak Gym",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-036",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "planned_purchase",
  "question": "What big purchase am I planning?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "Electric car",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-036",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is drawing/painting, plays running club, and is part of wine tasting club.",
  "n71_answer": "Drawing/painting, game night with neighbors, wine tasting club, running club membership, reading The Quorath Codex (book)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-036",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-037",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "family_event",
  "question": "What family event do I have coming up?",
  "gold_answer": "child's school event",
  "n71_answer": "Child's school event",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-037",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "housing_type",
  "question": "What type of home do I live in?",
  "gold_answer": "shared house",
  "n71_answer": "shared house",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-037",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "employer",
  "question": "Where do I work?",
  "gold_answer": "Korvath & Co.",
  "n71_answer": "Korvath & Co.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-037",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "n71_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-037",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "media_consumption",
  "question": "List all media I've consumed over time, from earliest to latest.",
  "gold_answer": "a daily news podcast, Thyrelion (anime), a history documentary series",
  "n71_answer": "daily news podcast, history documentary series",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-037",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "family_event",
  "question": "What family event do I have coming up?",
  "gold_answer": "none currently",
  "n71_answer": "Child's school event",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-037",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "housing_type",
  "question": "What type of home do I live in?",
  "gold_answer": "Uncertain — previously 'shared house', but relationship_status changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-037",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "employer",
  "question": "Where do I work?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "I don't have that information.",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-037",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is gardening, plays volleyball, and is part of wine tasting club.",
  "n71_answer": "Volleyball, skiing, gardening",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-037",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "n71_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-038",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "cross-country road trip (2 weeks)",
  "n71_answer": "Cross-country road trip",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-038",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "family_event",
  "question": "What family event do I have coming up?",
  "gold_answer": "sibling's wedding",
  "n71_answer": "Sibling's wedding",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-038",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "work_location",
  "question": "Where is my office?",
  "gold_answer": "Thornvale Crossing",
  "n71_answer": "Thornvale Crossing",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-038",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "n71_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-038",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "vehicle",
  "question": "List all vehicles I've had over time, from earliest to latest.",
  "gold_answer": "Therwyn Compact, Quorvel EV, Vorantel SUV",
  "n71_answer": "Quorvel EV, Vorantel SUV, Therwyn Compact",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-038",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "island getaway",
  "n71_answer": "Cross-country road trip (2 weeks)",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-038",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "family_event",
  "question": "What family event do I have coming up?",
  "gold_answer": "Uncertain — previously 'sibling's wedding', but relationship_status changed",
  "n71_answer": "sibling's wedding",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-038",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "work_location",
  "question": "Where is my office?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "Thornvale Crossing",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-038",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is photography, plays running club, and is part of cycling club.",
  "n71_answer": "photography, running",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-038",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "n71_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-039",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "skill_acquisition",
  "question": "What am I currently learning?",
  "gold_answer": "Spanish",
  "n71_answer": "Spanish",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-039",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "commute_method",
  "question": "How do I get to work?",
  "gold_answer": "bike",
  "n71_answer": "By bike",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-039",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "dietary_restriction",
  "question": "What dietary restrictions do I have?",
  "gold_answer": "no caffeine",
  "n71_answer": "no caffeine",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-039",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "\"Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.\"",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-039",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "media_consumption",
  "question": "List all media I've consumed over time, from earliest to latest.",
  "gold_answer": "Whispers of Zarinthea (TV), a true crime podcast, Xelvari Dawn (book)",
  "n71_answer": "Whispers of Zarinthea (TV), Xelvari Dawn (book)",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-039",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "skill_acquisition",
  "question": "What am I currently learning?",
  "gold_answer": "Japanese",
  "n71_answer": "Spanish",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-039",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "commute_method",
  "question": "How do I get to work?",
  "gold_answer": "Uncertain — previously 'bike', but school changed",
  "n71_answer": "By bike",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-039",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "dietary_restriction",
  "question": "What dietary restrictions do I have?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "no caffeine",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-039",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is knitting, plays soccer, and is part of running club.",
  "n71_answer": "knitting, playing soccer, attending trivia night on Wednesdays",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-039",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "\"Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.\"",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-040",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "dietary_restriction",
  "question": "What dietary restrictions do I have?",
  "gold_answer": "dairy-free",
  "n71_answer": "dairy-free",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-040",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "none",
  "n71_answer": "Pyravar CrossFit",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-040",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "medication",
  "question": "What medication am I taking?",
  "gold_answer": "Velnorath",
  "n71_answer": "Velnorath",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-040",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "commute_method",
  "question": "How do I get to work?",
  "gold_answer": "bus",
  "n71_answer": "taking the bus",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-040",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "n71_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-040",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "vehicle",
  "question": "List all vehicles I've had over time, from earliest to latest.",
  "gold_answer": "Zyvanta Sedan, Quorvel EV, Vorantel SUV",
  "n71_answer": "Zyvanta Sedan, Quorvel EV, Vorantel SUV",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-040",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "dietary_restriction",
  "question": "What dietary restrictions do I have?",
  "gold_answer": "nut-free",
  "n71_answer": "nut-free",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-040",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Pyravar CrossFit",
  "n71_answer": "Pyravar CrossFit",
  "judge_pass": true,
  "pass_type": "trivial",
  "failure_class": ""
 },
 {
  "episode": "pl-040",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "medication",
  "question": "What medication am I taking?",
  "gold_answer": "Uncertain — previously 'Velnorath', but health_condition changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-040",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "commute_method",
  "question": "How do I get to work?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "You don't have a commute method currently set.",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-040",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is woodworking, plays volleyball, and is part of cycling club.",
  "n71_answer": "- Volleyball  \n- Collecting rare items  \n- Woodworking  \n- Watching \"Whispers of Zarinthea\" (TV)  \n- Watching \"Servant\" on Apple TV+  \n- Watching \"The Crown\" on Netflix  \n- Game night with neighbors",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-040",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "n71_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-041",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "medication",
  "question": "What medication am I taking?",
  "gold_answer": "Korathril",
  "n71_answer": "Korathril",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-041",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Zorathel Peak Gym",
  "n71_answer": "Zorathel Peak Gym",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-041",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "exercise_routine",
  "question": "What's my exercise routine?",
  "gold_answer": "yoga 2x/week",
  "n71_answer": "yoga 2x/week",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-041",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "hiking trip out west",
  "n71_answer": "California Coast Road Trip",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-041",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "n71_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-041",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "vehicle",
  "question": "List all vehicles I've had over time, from earliest to latest.",
  "gold_answer": "Dranthis Pickup, Vorantel SUV, Xylorim Scooter",
  "n71_answer": "Dranthis Pickup, Vorantel SUV",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-041",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "medication",
  "question": "What medication am I taking?",
  "gold_answer": "Dyntheral",
  "n71_answer": "Korathril",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-041",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Velthari Studio",
  "n71_answer": "Velthari Studio",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-041",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "exercise_routine",
  "question": "What's my exercise routine?",
  "gold_answer": "Uncertain — previously 'yoga 2x/week', but health_condition changed",
  "n71_answer": "stretching della bandelletta ileotibiale",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-041",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "California Coast Road Trip",
  "judge_pass": false,
  "pass_type": "never_knew",
  "failure_class": ""
 },
 {
  "episode": "pl-041",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is photography, plays pickleball, and is part of book club.",
  "n71_answer": "Photography, pickleball, sugar-free baking",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-041",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "n71_answer": "In three words I can sum up everything I've learned about life: it goes on. You'll have bad times, but it will always wake you up to the stuff you weren't paying attention to.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-042",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "exercise_routine",
  "question": "What's my exercise routine?",
  "gold_answer": "walking daily",
  "n71_answer": "walking daily",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-042",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Velthari Studio",
  "n71_answer": "Velthari Studio",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-042",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "medication",
  "question": "What medication am I taking?",
  "gold_answer": "Velnorath",
  "n71_answer": "Velnorath",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-042",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "Zynthera Bay beach trip (1 week)",
  "n71_answer": "Zynthera Bay beach trip for 1 week",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-042",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "n71_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-042",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "vehicle",
  "question": "List all vehicles I've had over time, from earliest to latest.",
  "gold_answer": "Zyvanta Sedan, Therwyn Compact, Xylorim Scooter",
  "n71_answer": "Zyvanta Sedan, Xylorim Scooter, Therwyn Compact",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-042",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "exercise_routine",
  "question": "What's my exercise routine?",
  "gold_answer": "swimming 2x/week",
  "n71_answer": "swimming 2x/week",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-042",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "none",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-042",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "medication",
  "question": "What medication am I taking?",
  "gold_answer": "Uncertain — previously 'Velnorath', but health_condition changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-042",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "Zynthera Bay beach trip (1 week)",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-042",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is guitar, plays volleyball, and is part of running club.",
  "n71_answer": "play volleyball, practice guitar, cook at home",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-042",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "n71_answer": "Twenty years from now you will be more disappointed by the things that you didn't do than by the ones you did do. So throw off the bowlines, sail away from safe harbor, catch the trade winds in your sails. Explore, dream, discover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-043",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "no travel planned",
  "n71_answer": "no travel planned",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-043",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "family_event",
  "question": "What family event do I have coming up?",
  "gold_answer": "baby shower",
  "n71_answer": "baby shower",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-043",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "chiropractor (biweekly)",
  "n71_answer": "chiropractor every two weeks",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-043",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "n71_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-043",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "subscription_service",
  "question": "List all subscription services I've had over time, from earliest to latest.",
  "gold_answer": "Xelvoria, Thyranote, Zorvista Premium",
  "n71_answer": "Xelvoria, Zorvista Premium",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-043",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "island getaway",
  "n71_answer": "island getaway",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-043",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "family_event",
  "question": "What family event do I have coming up?",
  "gold_answer": "Uncertain — previously 'baby shower', but relationship_status changed",
  "n71_answer": "baby shower",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-043",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-043",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is knitting, plays rock climbing, and is part of book club.",
  "n71_answer": "rock climbing, knitting, pickup basketball on Saturdays",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-043",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "n71_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-044",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "sleep_pattern",
  "question": "What's my sleep schedule?",
  "gold_answer": "varies by shift",
  "n71_answer": "varies by shift",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-044",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Velthari Studio",
  "n71_answer": "Velthari Studio",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-044",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "exercise_routine",
  "question": "What's my exercise routine?",
  "gold_answer": "walking daily",
  "n71_answer": "walking daily",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-044",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "social_gathering",
  "question": "What's my regular social meetup?",
  "gold_answer": "game night with neighbors",
  "n71_answer": "game night with neighbors",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-044",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "n71_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-044",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "subscription_service",
  "question": "List all subscription services I've had over time, from earliest to latest.",
  "gold_answer": "Xelvoria, Zorvista Premium, meal kit (Kelvimeal)",
  "n71_answer": "Kelvimeal, Xelvoria",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-044",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "sleep_pattern",
  "question": "What's my sleep schedule?",
  "gold_answer": "1am-9am",
  "n71_answer": "1am-9am",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-044",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Thandrel YMCA",
  "n71_answer": "Velthari Studio",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-044",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "exercise_routine",
  "question": "What's my exercise routine?",
  "gold_answer": "Uncertain — previously 'walking daily', but health_condition changed",
  "n71_answer": "yoga 2x/week",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-044",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "social_gathering",
  "question": "What's my regular social meetup?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "game night with neighbors",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-044",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is knitting, plays basketball, and is part of running club.",
  "n71_answer": "knitting, basketball, running, experimenting with sourdough bread flavor combinations, watching a baking competition show on Netflix",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-044",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "n71_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-045",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "mountain ski trip",
  "n71_answer": "mountain ski trip",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-045",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "living_arrangement",
  "question": "Who do I live with?",
  "gold_answer": "with partner and child",
  "n71_answer": "partner and child",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-045",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "dietary_restriction",
  "question": "What dietary restrictions do I have?",
  "gold_answer": "no alcohol",
  "n71_answer": "no alcohol",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-045",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-045",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "subscription_service",
  "question": "List all subscription services I've had over time, from earliest to latest.",
  "gold_answer": "Xelvoria, Zorvista Premium, Daily Journal subscription",
  "n71_answer": "Daily Journal subscription, Zorvista Premium",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-045",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "travel_plan",
  "question": "What trip am I planning?",
  "gold_answer": "no travel planned",
  "n71_answer": "mountain ski trip",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-045",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "living_arrangement",
  "question": "Who do I live with?",
  "gold_answer": "Uncertain — previously 'with partner and child', but relationship_status changed",
  "n71_answer": "James",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-045",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "dietary_restriction",
  "question": "What dietary restrictions do I have?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "no alcohol",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-045",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is cooking, plays volleyball, and is part of wine tasting club.",
  "n71_answer": "cooking, volleyball, wine tasting club",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-045",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "n71_answer": "Most people never run far enough on their first wind to find out they've got a second. Give your dreams all you've got and you'll be amazed at the energy that comes out of you.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-046",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "dietary_restriction",
  "question": "What dietary restrictions do I have?",
  "gold_answer": "dairy-free",
  "n71_answer": "dairy-free",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-046",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Pyravar CrossFit",
  "n71_answer": "Pyravar CrossFit",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-046",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "sleep_pattern",
  "question": "What's my sleep schedule?",
  "gold_answer": "11pm-5:30am (early riser)",
  "n71_answer": "11pm-5:30am",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-046",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "school",
  "question": "Where am I studying?",
  "gold_answer": "Quivira Springs Community College",
  "n71_answer": "Quivira Springs Community College",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-046",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "n71_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-046",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "subscription_service",
  "question": "List all subscription services I've had over time, from earliest to latest.",
  "gold_answer": "Xelvoria, meal kit (Kelvimeal), Dranthazon",
  "n71_answer": "Xelvoria, Dranthazon",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-046",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "dietary_restriction",
  "question": "What dietary restrictions do I have?",
  "gold_answer": "nut-free",
  "n71_answer": "nut-free",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-046",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Crysthene Pool",
  "n71_answer": "Crysthene Pool",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-046",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "sleep_pattern",
  "question": "What's my sleep schedule?",
  "gold_answer": "Uncertain — previously '11pm-5:30am (early riser)', but health_condition changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-046",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "school",
  "question": "Where am I studying?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "Quivira Springs Community College",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-046",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is pottery, plays pickleball, and is part of book club.",
  "n71_answer": "Pickleball, pottery",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-046",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "n71_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-047",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "job_title",
  "question": "What is my job title?",
  "gold_answer": "designer",
  "n71_answer": "designer",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-047",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "1 hour",
  "n71_answer": "1 hour",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-047",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "work_location",
  "question": "Where is my office?",
  "gold_answer": "Tesswick Bay",
  "n71_answer": "Tesswick Bay",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-047",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "commute_method",
  "question": "How do I get to work?",
  "gold_answer": "bike",
  "n71_answer": "bike",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-047",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "social_gathering",
  "question": "What's my regular social meetup?",
  "gold_answer": "pickup basketball Saturdays",
  "n71_answer": "I don't have that information.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-047",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "n71_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-047",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "vehicle",
  "question": "List all vehicles I've had over time, from earliest to latest.",
  "gold_answer": "Therwyn Compact, Vorantel SUV, Xylorim Scooter",
  "n71_answer": "Therwyn Compact, Vorantel SUV",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-047",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "job_title",
  "question": "What is my job title?",
  "gold_answer": "product manager",
  "n71_answer": "product manager",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-047",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "1.5 hours",
  "n71_answer": "1.5 hours",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-047",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "work_location",
  "question": "Where is my office?",
  "gold_answer": "Uncertain — previously 'Tesswick Bay', but employer changed",
  "n71_answer": "Quivira Springs",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-047",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "commute_method",
  "question": "How do I get to work?",
  "gold_answer": "Uncertain — previously 'bike', but work_location changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-047",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "social_gathering",
  "question": "What's my regular social meetup?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "trivial",
  "failure_class": ""
 },
 {
  "episode": "pl-047",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is gardening, plays basketball, and is part of book club.",
  "n71_answer": "collecting vintage video games, collecting Pokémon cards, basketball",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-047",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "n71_answer": "We are what we repeatedly do. Excellence, then, is not an act, but a habit. The whole is greater than the sum of its parts. We must become the change we wish to see in the world.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-048",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "dietary_restriction",
  "question": "What dietary restrictions do I have?",
  "gold_answer": "low-sugar",
  "n71_answer": "low-sugar",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-048",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Velthari Studio",
  "n71_answer": "Velthari Studio",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-048",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "sleep_pattern",
  "question": "What's my sleep schedule?",
  "gold_answer": "midnight-8am",
  "n71_answer": "midnight-8am",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-048",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "physical therapy (2x/week)",
  "n71_answer": "physical therapy",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-048",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-048",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "partner",
  "question": "List all partners I've had over time, from earliest to latest.",
  "gold_answer": "Jessica, Alex, Chris",
  "n71_answer": "Jessica, Alex, Chris",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-048",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "dietary_restriction",
  "question": "What dietary restrictions do I have?",
  "gold_answer": "no processed food",
  "n71_answer": "no processed food",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-048",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Kaelstrom Fitness",
  "n71_answer": "Kaelstrom Fitness",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-048",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "sleep_pattern",
  "question": "What's my sleep schedule?",
  "gold_answer": "Uncertain — previously 'midnight-8am', but health_condition changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-048",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "regular_appointment",
  "question": "What regular appointments do I have?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "trivial",
  "failure_class": ""
 },
 {
  "episode": "pl-048",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is woodworking, plays cycling, and is part of cycling club.",
  "n71_answer": "Cycling, woodworking, attending trivia night Wednesdays, watching the TV show Whispers of Zarinthea.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-048",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-049",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "insurance",
  "question": "What insurance do I have?",
  "gold_answer": "employer + supplemental",
  "n71_answer": "employer + supplemental",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-049",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Velthari Studio",
  "n71_answer": "Velthari Studio",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-049",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "work_project",
  "question": "What project am I working on?",
  "gold_answer": "mobile app development",
  "n71_answer": "mobile app development",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-049",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "favorite_restaurant",
  "question": "What's my favorite restaurant?",
  "gold_answer": "Zefiro's Kitchen",
  "n71_answer": "Zefiro's Kitchen",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-049",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "family_event",
  "question": "What family event do I have coming up?",
  "gold_answer": "anniversary dinner",
  "n71_answer": "anniversary dinner",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-049",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "n71_answer": "\"The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.\"",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-049",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "subscription_service",
  "question": "List all subscription services I've had over time, from earliest to latest.",
  "gold_answer": "Xelvoria, Dranthazon, Pyrethon Plus",
  "n71_answer": "Xelvoria",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-049",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "insurance",
  "question": "What insurance do I have?",
  "gold_answer": "dental + vision bundle",
  "n71_answer": "dental + vision bundle",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-049",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Kaelstrom Fitness",
  "n71_answer": "Kaelstrom Fitness",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-049",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "work_project",
  "question": "What project am I working on?",
  "gold_answer": "Uncertain — previously 'mobile app development', but employer changed",
  "n71_answer": "InDesign Cover Project",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": "reasoning_failure"
 },
 {
  "episode": "pl-049",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "favorite_restaurant",
  "question": "What's my favorite restaurant?",
  "gold_answer": "Uncertain — previously 'Zefiro's Kitchen', but work_location changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-049",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "family_event",
  "question": "What family event do I have coming up?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-049",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is guitar, plays golf, and is part of book club.",
  "n71_answer": "fantasy football",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-049",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "n71_answer": "The only way to do great work is to love what you do. If you haven't found it yet, keep looking. Don't settle. As with all matters of the heart, you'll know when you find it.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-050",
  "domain": "pl",
  "phase": "before",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Pyravar CrossFit",
  "n71_answer": "Pyravar CrossFit",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-050",
  "domain": "pl",
  "phase": "before",
  "task": "Abs",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "50 min",
  "n71_answer": "50 min",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-050",
  "domain": "pl",
  "phase": "before",
  "task": "Del",
  "entity": "housing_type",
  "question": "What type of home do I live in?",
  "gold_answer": "single-family house",
  "n71_answer": "single-family house",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-050",
  "domain": "pl",
  "phase": "before",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-050",
  "domain": "pl",
  "phase": "after",
  "task": "Tr",
  "entity": "subscription_service",
  "question": "List all subscription services I've had over time, from earliest to latest.",
  "gold_answer": "Thyranote, meal kit (Kelvimeal), Dranthazon",
  "n71_answer": "Dranthazon, Kelvimeal",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-050",
  "domain": "pl",
  "phase": "after",
  "task": "Cas",
  "entity": "fitness_facility",
  "question": "Where do I work out?",
  "gold_answer": "Zorathel Peak Gym",
  "n71_answer": "Zorathel Peak Gym",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-050",
  "domain": "pl",
  "phase": "after",
  "task": "Abs",
  "entity": "commute_duration",
  "question": "How long is my commute?",
  "gold_answer": "Uncertain — previously '50 min', but residence_location changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "pl-050",
  "domain": "pl",
  "phase": "after",
  "task": "Del",
  "entity": "housing_type",
  "question": "What type of home do I live in?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "single-family house",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "pl-050",
  "domain": "pl",
  "phase": "after",
  "task": "Agg",
  "entity": "hobby,sports,club_membership",
  "question": "What do I do in my free time? List everything you know.",
  "gold_answer": "The user's hobby is woodworking, plays basketball, and is part of volunteer group.",
  "n71_answer": "woodworking, cooking, baking, basketball",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "pl-050",
  "domain": "pl",
  "phase": "after",
  "task": "ER",
  "entity": "life_philosophy",
  "question": "Recite my life philosophy verbatim.",
  "gold_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "n71_answer": "Try not to become a man of success, but rather try to become a man of value. Look around at how people want to get more out of life than they put in. A man of value will give more than he receives.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-001",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Zelthra Pack",
  "n71_answer": "Zelthra Pack",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-001",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "build_command",
  "question": "What's the build command?",
  "gold_answer": "zelthra pack --sourcemap",
  "n71_answer": "zelthra pack --sourcemap",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-001",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "project_structure",
  "question": "What's our project structure?",
  "gold_answer": "single-app with /src and /lib",
  "n71_answer": "single-app with /src and /lib",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-001",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "test_command",
  "question": "What's the test command?",
  "gold_answer": "tarvex verify --integration",
  "n71_answer": "tarvex verify --integration",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-001",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "documentation_url",
  "question": "Where are our docs?",
  "gold_answer": "github.com/aurora-team/docs",
  "n71_answer": "github.com/aurora-team/docs",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-001",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "OutOfMemoryError: Java heap space.\nGC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'.\nHeap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z.\nRecommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "n71_answer": "OutOfMemoryError: Java heap space. GC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'. Heap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z. Recommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-001",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "sprint_deadline",
  "question": "List all sprint deadlines we've had, from earliest to latest.",
  "gold_answer": "2025 April 11, 2025 May 23, 2025 June 20",
  "n71_answer": "2025 April 11, 2025 May 23, 2025 June 20",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-001",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Quorix Forge",
  "n71_answer": "Zelthra Pack",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-001",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "build_command",
  "question": "What's the build command?",
  "gold_answer": "quorix forge --target prod",
  "n71_answer": "zelthra pack --sourcemap",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-001",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "project_structure",
  "question": "What's our project structure?",
  "gold_answer": "Uncertain — previously 'single-app with /src and /lib', but framework changed",
  "n71_answer": "single-app with /src and /lib",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-001",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "test_command",
  "question": "What's the test command?",
  "gold_answer": "Uncertain — previously 'tarvex verify --integration', but test_framework changed",
  "n71_answer": "tarvex verify --integration",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-001",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "documentation_url",
  "question": "Where are our docs?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "github.com/aurora-team/docs",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-001",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "meeting_day,slack_channel,standup_format",
  "question": "How does our team coordinate? Tell me everything you know.",
  "gold_answer": "Team meets every Monday 11am, uses #aurora-dev for chat, and standups are async written updates in Slack.",
  "n71_answer": "- The team coordinates using async written updates in Slack for standups.  \n- The team channel for discussions and project updates is #aurora-dev.  \n- Team meetings are scheduled every Monday at 11am with an agenda focused on progress and blockers.  \n- The on-call rotation is volunteer-based with weekly signup, encouraging ownership among team members.  \n- For smooth on-call transitions, a handoff process with shared documents or checklists is used to transfer knowledge.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-001",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "OutOfMemoryError: Java heap space.\nGC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'.\nHeap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z.\nRecommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "n71_answer": "OutOfMemoryError: Java heap space. GC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'. Heap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z. Recommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-002",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "ci_config",
  "question": "What's our CI configuration?",
  "gold_answer": "orinthal-build.yml",
  "n71_answer": "orinthal-build.yml",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-002",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "deploy/Dockerfile.korinth",
  "n71_answer": "deploy/Dockerfile.korinth",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-002",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "stream.velturis.io/aurora",
  "n71_answer": "stream.velturis.io/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-002",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "orinthal publish --tier production",
  "n71_answer": "orinthal publish --tier production",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-002",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "default_branch",
  "question": "What's our default branch?",
  "gold_answer": "develop",
  "n71_answer": "develop",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-002",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-002",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "sprint_deadline",
  "question": "List all sprint deadlines we've had, from earliest to latest.",
  "gold_answer": "2025 April 25, 2025 June 6, 2025 June 20",
  "n71_answer": "2025 April 25, 2025 June 6, 2025 June 20",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-002",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "ci_config",
  "question": "What's our CI configuration?",
  "gold_answer": "korinth-deploy.yaml",
  "n71_answer": "orinthal-build.yml",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-002",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "deploy/Dockerfile.zephyra",
  "n71_answer": "deploy/Dockerfile.korinth",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-002",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "Uncertain — previously 'stream.velturis.io/aurora', but deploy_target changed",
  "n71_answer": "stream.velturis.io/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-002",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "Uncertain — previously 'orinthal publish --tier production', but ci_config changed",
  "n71_answer": "orinthal publish --tier production",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-002",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "default_branch",
  "question": "What's our default branch?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "develop",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-002",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "design_system,test_coverage_target,code_review_policy",
  "question": "What are our code quality standards? List everything you know.",
  "gold_answer": "We use Synthari Elements v1.2, target 90% for core modules test coverage, and require lead approval for core/ for reviews.",
  "n71_answer": "Lead approval for the core/ directory",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-002",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-003",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "drain.zephyra.io/proj-42",
  "n71_answer": "drain.zephyra.io/proj-42",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-003",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "thandrel ship --channel beta",
  "n71_answer": "thandrel ship --channel beta",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-003",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "staging_url",
  "question": "What's our staging URL?",
  "gold_answer": "sandbox.thandrel.io/aurora",
  "n71_answer": "sandbox.thandrel.io/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-003",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "infra/zephyra/Dockerfile",
  "n71_answer": "infra/zephyra/Dockerfile",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-003",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "team_lead",
  "question": "Who's the team lead?",
  "gold_answer": "Sarah Chen",
  "n71_answer": "Sarah Chen",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-003",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-003",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Karnex Pack, Quorix Hub, Synthari Modules",
  "n71_answer": "Quorix Hub -> Synthari Modules",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-003",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "sink.orinthal.dev/aurora",
  "n71_answer": "drain.zephyra.io/proj-42",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-003",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "narvex deploy --env prod",
  "n71_answer": "thandrel ship --channel beta",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-003",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "staging_url",
  "question": "What's our staging URL?",
  "gold_answer": "Uncertain — previously 'sandbox.thandrel.io/aurora', but deploy_target changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-003",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "Uncertain — previously 'infra/zephyra/Dockerfile', but docker_image changed",
  "n71_answer": "infra/zephyra/Dockerfile",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-003",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "team_lead",
  "question": "Who's the team lead?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "Sarah Chen",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-003",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "branch_strategy,default_branch,release_cadence",
  "question": "What do you know about our release process? Tell me everything.",
  "gold_answer": "We use trunk-based with feature flags branching on production, releasing monthly first Monday.",
  "n71_answer": "- Release schedule: monthly on the first Monday  \n- Default branch: production  \n- Test coverage target: 90% for core modules  \n- Deploy command: currently \"thandrel ship --channel beta\" (depends on CI config \"thandrel-flow.toml\")  \n- CI config: \"thandrel-flow.toml\" (depends on infrastructure and deploy targets)  \n- Package manager: Synthari Modules (current)  \n- Build tool: Python (current)  \n- Monitoring dashboard URL: observe.velturis.io/aurora  \n- Incident channel: #zenith-incidents  \n- Infrastructure: Thandrel Infra (latest)  \n- On-call rotation: follow-the-sun across timezones  \n- Authentication and authorization services: Amazon Cognito user pools  \n- Database: SQLite3  \n- API integration: OpenAI GPT API  \n- Team lead: Sarah Chen  \n- Status: Final stages (published, featured, archived)  \n- Logs drain: unknown currently; previous \"drain.zephyra.io/proj-42\" may be invalid due to infrastructure changes  \n- Docker image: unknown currently; previous \"zephyra-hub/aurora:prod\" may be invalid due to infrastructure changes  \n- Staging URL: unknown currently; previous \"sandbox.thandrel.io/aurora\" may be invalid due to infrastructure changes  \n- Dockerfile path: unknown currently; previous \"infra/zephyra/Dockerfile\" may be invalid due to docker image changes  \n- Deploy command changes if CI config changes; if deploy targets change, CI config changes to \"narvex-ci.yml\" and deploy command to \"narvex deploy --env prod\" (previous value may be invalid)  \n- Release process tips include automating testing and deployment, creating checklists for code reviews, merging, documentation updates, and holding sync meetings before releases.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-003",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-004",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "code_reviewer",
  "question": "Who reviews our code?",
  "gold_answer": "David Lee",
  "n71_answer": "David Lee",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-004",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "weekly_report_recipient",
  "question": "Who receives the weekly report?",
  "gold_answer": "Taehyun Kim",
  "n71_answer": "Taehyun Kim",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-004",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "test_coverage_target",
  "question": "What's our test coverage target?",
  "gold_answer": "90% for core modules",
  "n71_answer": "90%",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-004",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-004",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "sprint_deadline",
  "question": "List all sprint deadlines we've had, from earliest to latest.",
  "gold_answer": "2025 March 28, 2025 April 11, 2025 June 6",
  "n71_answer": "2025 March 28, 2025 April 11, 2025 June 6",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-004",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "code_reviewer",
  "question": "Who reviews our code?",
  "gold_answer": "Seoyun Choi",
  "n71_answer": "Seoyun Choi",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-004",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "weekly_report_recipient",
  "question": "Who receives the weekly report?",
  "gold_answer": "Uncertain — previously 'Taehyun Kim', but team_lead changed",
  "n71_answer": "Taehyun Kim",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-004",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "test_coverage_target",
  "question": "What's our test coverage target?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "trivial",
  "failure_class": ""
 },
 {
  "episode": "sw-004",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "oncall_rotation,alert_channel,incident_runbook_url",
  "question": "What do you know about our on-call setup? Tell me everything.",
  "gold_answer": "On-call is paired oncall (primary + secondary), alerts go to #aurora-alerts, and runbook is at github.com/aurora-team/runbooks.",
  "n71_answer": "Our Project — on_call_roles: primary and a secondary oncall (current)  \nOur Project — on_call_rotation_type: paired oncall (current)",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-004",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-005",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "monitoring_url",
  "question": "What's our monitoring URL?",
  "gold_answer": "monitor.narvex.io/proj-aurora",
  "n71_answer": "monitor.narvex.io/proj-aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-005",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "narvex deploy --env prod",
  "n71_answer": "narvex deploy --env prod",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-005",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "staging_url",
  "question": "What's our staging URL?",
  "gold_answer": "dev.orinthal.dev/aurora",
  "n71_answer": "dev.orinthal.dev/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-005",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "infra/zephyra/Dockerfile",
  "n71_answer": "infra/zephyra/Dockerfile",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-005",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "meeting_day",
  "question": "When's our team meeting?",
  "gold_answer": "every Thursday 2pm",
  "n71_answer": "every Thursday at 2pm",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-005",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-005",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "sprint_deadline",
  "question": "List all sprint deadlines we've had, from earliest to latest.",
  "gold_answer": "2025 April 25, 2025 May 23, 2025 June 6",
  "n71_answer": "2025 April 25, 2025 May 23, 2025 June 6",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-005",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "monitoring_url",
  "question": "What's our monitoring URL?",
  "gold_answer": "observe.velturis.io/aurora",
  "n71_answer": "monitor.narvex.io/proj-aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-005",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "orinthal publish --tier sandbox",
  "n71_answer": "narvex deploy --env prod",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-005",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "staging_url",
  "question": "What's our staging URL?",
  "gold_answer": "Uncertain — previously 'dev.orinthal.dev/aurora', but deploy_target changed",
  "n71_answer": "dev.orinthal.dev/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-005",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "Uncertain — previously 'infra/zephyra/Dockerfile', but docker_image changed",
  "n71_answer": "infra/zephyra/Dockerfile",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-005",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "meeting_day",
  "question": "When's our team meeting?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "Every Thursday at 2pm",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-005",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "documentation_url,onboarding_guide,changelog_location",
  "question": "Where can I find project documentation? List everything you know.",
  "gold_answer": "Docs are at confluence.aurora.dev/aurora, onboarding guide at wiki.aurora.dev/getting-started, and changelog at GitHub Releases page.",
  "n71_answer": "confluence.aurora.dev/aurora",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-005",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-006",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "weekly_report_recipient",
  "question": "Who receives the weekly report?",
  "gold_answer": "James Lee",
  "n71_answer": "James Lee",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-006",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "code_reviewer",
  "question": "Who reviews our code?",
  "gold_answer": "Seoyun Choi",
  "n71_answer": "Seoyun Choi",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-006",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "alert_channel",
  "question": "What's our alert channel?",
  "gold_answer": "#aurora-alerts",
  "n71_answer": "#aurora-alerts",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-006",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-006",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Cryptara Vault, Selvex Secrets, Pyronos Crypt",
  "n71_answer": "Cryptara Vault -> Selvex Secrets -> Pyronos Crypt",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-006",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "weekly_report_recipient",
  "question": "Who receives the weekly report?",
  "gold_answer": "Helen Chen",
  "n71_answer": "Helen Chen",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-006",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "code_reviewer",
  "question": "Who reviews our code?",
  "gold_answer": "Uncertain — previously 'Seoyun Choi', but team_lead changed",
  "n71_answer": "Seoyun Choi",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-006",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "alert_channel",
  "question": "What's our alert channel?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-006",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "meeting_day,slack_channel,standup_format",
  "question": "How does our team coordinate? Tell me everything you know.",
  "gold_answer": "Team meets every Friday 9:30am, uses #aurora-dev for chat, and standups are async written updates in Slack.",
  "n71_answer": "Our team coordinates using async written updates in Slack as the standup format.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-006",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-007",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "orm_library",
  "question": "What ORM do we use?",
  "gold_answer": "Crysthene ORM",
  "n71_answer": "Crysthene ORM",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-007",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Crysthene.define('Model', {fields})",
  "n71_answer": "Crysthene.define('Model', {fields})",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-007",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "connection_string",
  "question": "What's the database connection string?",
  "gold_answer": "zyndra://base-main:9042/aurora",
  "n71_answer": "zyndra://base-main:9042/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-007",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "stream.velturis.io/aurora",
  "n71_answer": "stream.velturis.io/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-007",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-007",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Cryptara Vault, Verisigil Lock, Sigilix Manager",
  "n71_answer": "Cryptara Vault -> Sigilix Manager",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-007",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "orm_library",
  "question": "What ORM do we use?",
  "gold_answer": "Thalnex Query",
  "n71_answer": "Thalnex Query",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-007",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Thalnex.document('Model', {fields})",
  "n71_answer": "Thalnex.document('Model', {fields})",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-007",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "connection_string",
  "question": "What's the database connection string?",
  "gold_answer": "Uncertain — previously 'zyndra://base-main:9042/aurora', but database changed",
  "n71_answer": "zyndra://base-main:9042/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-007",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-007",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "oncall_rotation,alert_channel,incident_runbook_url",
  "question": "What do you know about our on-call setup? Tell me everything.",
  "gold_answer": "On-call is weekly Mon-Sun rotation, alerts go to #aurora-oncall-alerts, and runbook is at runbooks.aurora.internal/playbooks.",
  "n71_answer": "weekly Mon-Sun rotation",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-007",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-008",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "token_format",
  "question": "What token format do we use?",
  "gold_answer": "Verithos-JWT-v2",
  "n71_answer": "Verithos-JWT-v2",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-008",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "verithos-oauth-middleware v2.1",
  "n71_answer": "verithos-oauth-middleware v2.1",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-008",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "login_endpoint",
  "question": "What's the login endpoint?",
  "gold_answer": "/auth/kryptal/signin",
  "n71_answer": "/\tauth/kryptal/signin",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-008",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "monitoring_url",
  "question": "What's our monitoring URL?",
  "gold_answer": "watch.korinth.dev/aurora",
  "n71_answer": "watch.korinth.dev/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-008",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "ConnectionRefusedError: [Errno 111] Connection refused.\nService crysthene-db-primary on port 5432 is not accepting connections.\nLast successful heartbeat was at 2024-03-15T08:23:41Z.\nCheck firewall rules and verify that max_connections has not been exceeded.",
  "n71_answer": "ConnectionRefusedError: [Errno 111] Connection refused. Service crysthene-db-primary on port 5432 is not accepting connections. Last successful heartbeat was at 2024-03-15T08:23:41Z. Check firewall rules and verify that max_connections has not been exceeded.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-008",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "sprint_deadline",
  "question": "List all sprint deadlines we've had, from earliest to latest.",
  "gold_answer": "2025 April 25, 2025 May 9, 2025 June 20",
  "n71_answer": "2025 April 25, 2025 May 9, 2025 June 20",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-008",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "token_format",
  "question": "What token format do we use?",
  "gold_answer": "Oathkeeper-mTLS-cert",
  "n71_answer": "Verithos-JWT-v2",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-008",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "phalanx-key-guard v1.3",
  "n71_answer": "phalanx-key-guard v1.3",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-008",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "login_endpoint",
  "question": "What's the login endpoint?",
  "gold_answer": "Uncertain — previously '/auth/kryptal/signin', but auth_provider changed",
  "n71_answer": "/ auth/kryptal/signin",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-008",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "monitoring_url",
  "question": "What's our monitoring URL?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none — explicitly removed",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-008",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "branch_strategy,default_branch,release_cadence",
  "question": "What do you know about our release process? Tell me everything.",
  "gold_answer": "We use GitFlow with release branches branching on release, releasing sprint-end release every 2 weeks.",
  "n71_answer": "- The team uses GitFlow with release branches branching as the branching strategy.  \n- The default branch is release, where all stable code lives before being pushed to production.  \n- The team releases a sprint-end release every 2 weeks, maintaining a regular release cadence.  \n- Sprint deadlines have been updated over time, with the latest deadline set for 2025 June 20.  \n- The sprint release cadence is every 2 weeks.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-008",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "ConnectionRefusedError: [Errno 111] Connection refused.\nService crysthene-db-primary on port 5432 is not accepting connections.\nLast successful heartbeat was at 2024-03-15T08:23:41Z.\nCheck firewall rules and verify that max_connections has not been exceeded.",
  "n71_answer": "ConnectionRefusedError: [Errno 111] Connection refused. Service crysthene-db-primary on port 5432 is not accepting connections. Last successful heartbeat was at 2024-03-15T08:23:41Z. Check firewall rules and verify that max_connections has not been exceeded.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-009",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "ci_config",
  "question": "What's our CI configuration?",
  "gold_answer": "korinth-deploy.yaml",
  "n71_answer": "korinth-deploy.yaml",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-009",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "korinth release --canary",
  "n71_answer": "korinth release --canary",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-009",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "docker_image",
  "question": "What Docker image do we use?",
  "gold_answer": "velturis-cr/aurora:main",
  "n71_answer": "velturis-cr/aurora:main",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-009",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "deploy/Dockerfile.velturis",
  "n71_answer": "deploy/Dockerfile.velturis",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-009",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "changelog_location",
  "question": "Where's the changelog?",
  "gold_answer": "CHANGELOG.md in repo root",
  "n71_answer": "CHANGELOG.md in the repo root",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-009",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "ConnectionRefusedError: [Errno 111] Connection refused.\nService crysthene-db-primary on port 5432 is not accepting connections.\nLast successful heartbeat was at 2024-03-15T08:23:41Z.\nCheck firewall rules and verify that max_connections has not been exceeded.",
  "n71_answer": "ConnectionRefusedError: [Errno 111] Connection refused. Service crysthene-db-primary on port 5432 is not accepting connections. Last successful heartbeat was at 2024-03-15T08:23:41Z. Check firewall rules and verify that max_connections has not been exceeded.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-009",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Karnex Pack, Pyronis Bundle, Quorix Hub",
  "n71_answer": "Karnex Pack -> Pyronis Bundle -> Quorix Hub",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-009",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "ci_config",
  "question": "What's our CI configuration?",
  "gold_answer": "zephyra-pipeline.json",
  "n71_answer": "zephyra-pipeline.json",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-009",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "zephyra push --target preview",
  "n71_answer": "zephyra push --target preview",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-009",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "docker_image",
  "question": "What Docker image do we use?",
  "gold_answer": "Uncertain — previously 'velturis-cr/aurora:main', but deploy_target changed",
  "n71_answer": "velturis-cr/aurora:main",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-009",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "Uncertain — previously 'deploy/Dockerfile.velturis', but docker_image changed",
  "n71_answer": "deploy/Dockerfile.velturis",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-009",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "changelog_location",
  "question": "Where's the changelog?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "CHANGELOG.md in repo root",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-009",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "design_system,test_coverage_target,code_review_policy",
  "question": "What are our code quality standards? List everything you know.",
  "gold_answer": "We use Quenthar Design v1, target no strict target, PR-reviewed test coverage, and require 2 approvals + security review for auth/ for reviews.",
  "n71_answer": "Our Project — test_coverage_target: no strict target; it's PR-reviewed",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-009",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "ConnectionRefusedError: [Errno 111] Connection refused.\nService crysthene-db-primary on port 5432 is not accepting connections.\nLast successful heartbeat was at 2024-03-15T08:23:41Z.\nCheck firewall rules and verify that max_connections has not been exceeded.",
  "n71_answer": "ConnectionRefusedError: [Errno 111] Connection refused. Service crysthene-db-primary on port 5432 is not accepting connections. Last successful heartbeat was at 2024-03-15T08:23:41Z. Check firewall rules and verify that max_connections has not been exceeded.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-010",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "ci_config",
  "question": "What's our CI configuration?",
  "gold_answer": "zephyra-pipeline.json",
  "n71_answer": "zephyra-pipeline.json",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-010",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "zephyra push --target preview",
  "n71_answer": "zephyra push --target preview",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-010",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "docker_image",
  "question": "What Docker image do we use?",
  "gold_answer": "zephyra-hub/aurora:prod",
  "n71_answer": "zephyra-hub/aurora:prod",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-010",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "deploy/Dockerfile.zephyra",
  "n71_answer": "deploy/Dockerfile.zephyra",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-010",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "logs.narvex.io/aurora/drain",
  "n71_answer": "logs.narvex.io/aurora/drain",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-010",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-010",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Verisigil Lock, Kerberos Trove, Orinthal Sealed",
  "n71_answer": "Verisigil Lock -> Kerberos Trove -> Orinthal Sealed",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-010",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "ci_config",
  "question": "What's our CI configuration?",
  "gold_answer": "thandrel-flow.toml",
  "n71_answer": "zephyra-pipeline.json",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-010",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "thandrel ship --channel beta",
  "n71_answer": "zephyra push --target preview",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-010",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "docker_image",
  "question": "What Docker image do we use?",
  "gold_answer": "Uncertain — previously 'zephyra-hub/aurora:prod', but deploy_target changed",
  "n71_answer": "zephyra-hub/aurora:prod",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-010",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "Uncertain — previously 'deploy/Dockerfile.zephyra', but docker_image changed",
  "n71_answer": "deploy/Dockerfile.zephyra",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-010",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "logs.narvex.io/aurora/drain",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-010",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "branch_strategy,default_branch,release_cadence",
  "question": "What do you know about our release process? Tell me everything.",
  "gold_answer": "We use GitHub Flow with PR reviews branching on main, releasing monthly first Monday.",
  "n71_answer": "monthly on the first Monday",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-010",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-011",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "dev_server_command",
  "question": "How do I run the dev server?",
  "gold_answer": "korvel dev --open",
  "n71_answer": "korvel dev --open",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-011",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "test_command",
  "question": "What's the test command?",
  "gold_answer": "karnex test --watch",
  "n71_answer": "karnex test --watch",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-011",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Karnex Bundler",
  "n71_answer": "Karnex Bundler",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-011",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "build_command",
  "question": "What's the build command?",
  "gold_answer": "karnex build --analyze",
  "n71_answer": "karnex build --analyze",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-011",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "Session Cookie + CSRF",
  "n71_answer": "Session Cookie + CSRF",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-011",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-011",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Karnex Pack, Dranith Stack, Synthari Modules",
  "n71_answer": "Karnex Pack, Dranith Stack, Synthari Modules",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-011",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "dev_server_command",
  "question": "How do I run the dev server?",
  "gold_answer": "moxira serve --hot",
  "n71_answer": "moxira serve --hot",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-011",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "test_command",
  "question": "What's the test command?",
  "gold_answer": "quorix spec --run smoke",
  "n71_answer": "quorix spec --run smoke",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-011",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Uncertain — previously 'Karnex Bundler', but framework changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-011",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "build_command",
  "question": "What's the build command?",
  "gold_answer": "Uncertain — previously 'karnex build --analyze', but build_tool changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-011",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "Session Cookie + CSRF",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-011",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "design_system,test_coverage_target,code_review_policy",
  "question": "What are our code quality standards? List everything you know.",
  "gold_answer": "We use Quenthar Design v1, target no strict target, PR-reviewed test coverage, and require lead approval for core/ for reviews.",
  "n71_answer": "- code_review_policy: lead approval for the core/ directory  \n- test_coverage_target: no strict target; PR-reviewed",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-011",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-012",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "orm_library",
  "question": "What ORM do we use?",
  "gold_answer": "Pyravar Mapper",
  "n71_answer": "Pyravar Mapper",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-012",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Pyravar.entity('Model', attrs)",
  "n71_answer": "Pyravar.entity('Model', attrs)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-012",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "migration_tool",
  "question": "What migration tool do we use?",
  "gold_answer": "Zyndra Alter",
  "n71_answer": "Zyndra Alter",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-012",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "token_format",
  "question": "What token format do we use?",
  "gold_answer": "Cerberix-SAML-assertion",
  "n71_answer": "Cerberix-SAML-assertion",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-012",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-012",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Cryptara Vault, Selvex Secrets, Pyronos Crypt",
  "n71_answer": "Cryptara Vault -> Selvex Secrets -> Pyronos Crypt",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-012",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "orm_library",
  "question": "What ORM do we use?",
  "gold_answer": "Vorathis ODM",
  "n71_answer": "Pyravar Mapper",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-012",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Vorathis.collection('Model', spec)",
  "n71_answer": "Pyravar.entity('Model', attrs)",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-012",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "migration_tool",
  "question": "What migration tool do we use?",
  "gold_answer": "Uncertain — previously 'Zyndra Alter', but database changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-012",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "token_format",
  "question": "What token format do we use?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-012",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "documentation_url,onboarding_guide,changelog_location",
  "question": "Where can I find project documentation? List everything you know.",
  "gold_answer": "Docs are at aurora-docs.readthedocs.io, onboarding guide at notion.so/aurora-team/new-hire, and changelog at GitHub Releases page.",
  "n71_answer": "aurora-docs.readthedocs.io",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-012",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-013",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "user_session_ttl",
  "question": "What's the session TTL?",
  "gold_answer": "15 minutes",
  "n71_answer": "15 minutes",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-013",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "verithos-oauth-middleware v2.1",
  "n71_answer": "verithos-oauth-middleware v2.1",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-013",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "SAML 2.0 SSO",
  "n71_answer": "SAML 2.0 SSO",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-013",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "package_manager",
  "question": "What package manager do we use?",
  "gold_answer": "Dranith Stack",
  "n71_answer": "Dranith Stack",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-013",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-013",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "sprint_deadline",
  "question": "List all sprint deadlines we've had, from earliest to latest.",
  "gold_answer": "2025 May 23, 2025 June 20, 2025 July 4",
  "n71_answer": "2025 May 23, 2025 June 20, 2025 July 4",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-013",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "user_session_ttl",
  "question": "What's the session TTL?",
  "gold_answer": "4 hours",
  "n71_answer": "15 minutes",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-013",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "oathkeeper-cert-check v3.2",
  "n71_answer": "verithos-oauth-middleware v2.1",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-013",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "Uncertain — previously 'SAML 2.0 SSO', but auth_provider changed",
  "n71_answer": "SAML 2.0 SSO",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-013",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "package_manager",
  "question": "What package manager do we use?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-013",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "oncall_rotation,alert_channel,incident_runbook_url",
  "question": "What do you know about our on-call setup? Tell me everything.",
  "gold_answer": "On-call is paired oncall (primary + secondary), alerts go to #aurora-oncall-alerts, and runbook is at wiki.aurora.dev/incident-response.",
  "n71_answer": "paired oncall (primary + secondary)",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-013",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-014",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "escalation_contact",
  "question": "Who's the escalation contact?",
  "gold_answer": "Kevin Park",
  "n71_answer": "Kevin Park",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-014",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "approval_authority",
  "question": "Who has approval authority?",
  "gold_answer": "CTO Soojung Lim",
  "n71_answer": "CTO Soojung Lim",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-014",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "thandrel ship --channel beta",
  "n71_answer": "thandrel ship --channel beta",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-014",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-014",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Cryptara Vault, Selvex Secrets, Kerberos Trove",
  "n71_answer": "Cryptara Vault -> Selvex Secrets -> Kerberos Trove",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-014",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "escalation_contact",
  "question": "Who's the escalation contact?",
  "gold_answer": "Rachel Yoo",
  "n71_answer": "Kevin Park",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-014",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "approval_authority",
  "question": "Who has approval authority?",
  "gold_answer": "Uncertain — previously 'CTO Soojung Lim', but team_lead changed",
  "n71_answer": "CTO Soojung Lim",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-014",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none — explicitly removed",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-014",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "design_system,test_coverage_target,code_review_policy",
  "question": "What are our code quality standards? List everything you know.",
  "gold_answer": "We use Dravex Blocks v2.4, target no strict target, PR-reviewed test coverage, and require 2 approvals required for reviews.",
  "n71_answer": "- 2 approvals for a pull request to be merged",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-014",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-015",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "monitoring_url",
  "question": "What's our monitoring URL?",
  "gold_answer": "status.orinthal.dev/aurora",
  "n71_answer": "status.orinthal.dev/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-015",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "narvex deploy --env prod",
  "n71_answer": "narvex deploy --env prod",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-015",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "staging_url",
  "question": "What's our staging URL?",
  "gold_answer": "preview.zephyra.io/aurora",
  "n71_answer": "preview.zephyra.io/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-015",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "deploy/Dockerfile.orinthal",
  "n71_answer": "deploy/Dockerfile.orinthal",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-015",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Pyronis Build",
  "n71_answer": "Pyronis Build",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-015",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-015",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Tarvex Mod, Zelthra Lib, Dranith Stack",
  "n71_answer": "Tarvex Mod -> Zelthra Lib -> Dranith Stack",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-015",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "monitoring_url",
  "question": "What's our monitoring URL?",
  "gold_answer": "metrics.thandrel.io/aurora-prod",
  "n71_answer": "status.orinthal.dev/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-015",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "thandrel ship --channel beta",
  "n71_answer": "narvex deploy --env prod",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-015",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "staging_url",
  "question": "What's our staging URL?",
  "gold_answer": "Uncertain — previously 'preview.zephyra.io/aurora', but deploy_target changed",
  "n71_answer": "preview.zephyra.io/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-015",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "Uncertain — previously 'deploy/Dockerfile.orinthal', but docker_image changed",
  "n71_answer": "deploy/Dockerfile.orinthal",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-015",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-015",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "oncall_rotation,alert_channel,incident_runbook_url",
  "question": "What do you know about our on-call setup? Tell me everything.",
  "gold_answer": "On-call is follow-the-sun across timezones, alerts go to #aurora-pager, and runbook is at wiki.aurora.dev/incident-response.",
  "n71_answer": "follow-the-sun across timezones",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-015",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-016",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "dev_server_command",
  "question": "How do I run the dev server?",
  "gold_answer": "korvel dev --open",
  "n71_answer": "korvel dev --open",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-016",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "build_command",
  "question": "What's the build command?",
  "gold_answer": "pyronis pack --optimize",
  "n71_answer": "pyronis pack --optimize",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-016",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Karnex Bundler",
  "n71_answer": "Karnex Bundler",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-016",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "test_command",
  "question": "What's the test command?",
  "gold_answer": "pyronis check --snapshot",
  "n71_answer": "pyronis check --snapshot",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-016",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "logs.narvex.io/aurora/drain",
  "n71_answer": "logs.narvex.io/aurora/drain",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-016",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "OutOfMemoryError: Java heap space.\nGC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'.\nHeap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z.\nRecommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "n71_answer": "OutOfMemoryError: Java heap space. GC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'. Heap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z. Recommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-016",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Zelthra Lib, Dranith Stack, Veltrion Pkg",
  "n71_answer": "Zelthra Lib, Dranith Stack, Veltrion Pkg",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-016",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "dev_server_command",
  "question": "How do I run the dev server?",
  "gold_answer": "veltrion dev --port 3000",
  "n71_answer": "veltrion dev --port 3000",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-016",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "build_command",
  "question": "What's the build command?",
  "gold_answer": "tarvex compile --debug",
  "n71_answer": "tarvex compile --debug",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-016",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Uncertain — previously 'Karnex Bundler', but framework changed",
  "n71_answer": "Tarvex CLI",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-016",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "test_command",
  "question": "What's the test command?",
  "gold_answer": "Uncertain — previously 'pyronis check --snapshot', but test_framework changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-016",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-016",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "branch_strategy,default_branch,release_cadence",
  "question": "What do you know about our release process? Tell me everything.",
  "gold_answer": "We use GitFlow with release branches branching on main, releasing sprint-end release every 2 weeks.",
  "n71_answer": "Release cadence: every 2 weeks  \nBranching strategy: GitFlow with release branches branching  \nCode review policy: 2 approvals + security review for auth/  \nDefault branch: main",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-016",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "OutOfMemoryError: Java heap space.\nGC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'.\nHeap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z.\nRecommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "n71_answer": "OutOfMemoryError: Java heap space. GC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'. Heap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z. Recommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-017",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "API Key + HMAC",
  "n71_answer": "API Key + HMAC",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-017",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "phalanx-key-guard v1.3",
  "n71_answer": "phalanx-key-guard v1.3",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-017",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "login_endpoint",
  "question": "What's the login endpoint?",
  "gold_answer": "/zenthos/id/authenticate",
  "n71_answer": "/zenthos/id/authenticate",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-017",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "framework",
  "question": "What framework are we using?",
  "gold_answer": "Moxira",
  "n71_answer": "React",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-017",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "ConnectionRefusedError: [Errno 111] Connection refused.\nService crysthene-db-primary on port 5432 is not accepting connections.\nLast successful heartbeat was at 2024-03-15T08:23:41Z.\nCheck firewall rules and verify that max_connections has not been exceeded.",
  "n71_answer": "ConnectionRefusedError: [Errno 111] Connection refused. Service crysthene-db-primary on port 5432 is not accepting connections. Last successful heartbeat was at 2024-03-15T08:23:41Z. Check firewall rules and verify that max_connections has not been exceeded.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-017",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "sprint_deadline",
  "question": "List all sprint deadlines we've had, from earliest to latest.",
  "gold_answer": "2025 April 11, 2025 May 9, 2025 May 23",
  "n71_answer": "2025 April 11, 2025 May 9, 2025 May 23",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-017",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "SAML 2.0 SSO",
  "n71_answer": "API Key + HMAC",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-017",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "cerberix-sso-bridge v1.9",
  "n71_answer": "phalanx-key-guard v1.3",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-017",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "login_endpoint",
  "question": "What's the login endpoint?",
  "gold_answer": "Uncertain — previously '/zenthos/id/authenticate', but auth_provider changed",
  "n71_answer": "/zenthos/id/authenticate",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-017",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "framework",
  "question": "What framework are we using?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none — explicitly removed",
  "judge_pass": true,
  "pass_type": "trivial",
  "failure_class": ""
 },
 {
  "episode": "sw-017",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "oncall_rotation,alert_channel,incident_runbook_url",
  "question": "What do you know about our on-call setup? Tell me everything.",
  "gold_answer": "On-call is paired oncall (primary + secondary), alerts go to #vortex-alerts, and runbook is at github.com/aurora-team/runbooks.",
  "n71_answer": "paired oncall (primary and secondary)",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-017",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "ConnectionRefusedError: [Errno 111] Connection refused.\nService crysthene-db-primary on port 5432 is not accepting connections.\nLast successful heartbeat was at 2024-03-15T08:23:41Z.\nCheck firewall rules and verify that max_connections has not been exceeded.",
  "n71_answer": "ConnectionRefusedError: [Errno 111] Connection refused. Service crysthene-db-primary on port 5432 is not accepting connections. Last successful heartbeat was at 2024-03-15T08:23:41Z. Check firewall rules and verify that max_connections has not been exceeded.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-018",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "connection_string",
  "question": "What's the database connection string?",
  "gold_answer": "crysthene://replica-db:5432/aurora-ro",
  "n71_answer": "crysthene://replica-db:5432/aurora-ro",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-018",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Keldaris.object('Model', props)",
  "n71_answer": "Keldaris.object('Model', props)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-018",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "backup_schedule",
  "question": "What's our backup schedule?",
  "gold_answer": "twice daily at 2am and 2pm UTC",
  "n71_answer": "twice daily at 2am and 2pm UTC",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-018",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "release_cadence",
  "question": "How often do we release?",
  "gold_answer": "biweekly Friday afternoon",
  "n71_answer": "biweekly on Friday afternoons",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-018",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "OutOfMemoryError: Java heap space.\nGC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'.\nHeap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z.\nRecommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "n71_answer": "OutOfMemoryError: Java heap space. GC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'. Heap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z. Recommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-018",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Zelthra Lib, Quorix Hub, Dranith Stack",
  "n71_answer": "Zelthra Lib, Quorix Hub, Dranith Stack",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-018",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "connection_string",
  "question": "What's the database connection string?",
  "gold_answer": "vorathis://cluster-main:27017/aurora",
  "n71_answer": "unknown",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-018",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Pyravar.record('Model', schema)",
  "n71_answer": "Keldaris.object('Model', props)",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-018",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "backup_schedule",
  "question": "What's our backup schedule?",
  "gold_answer": "Uncertain — previously 'twice daily at 2am and 2pm UTC', but database changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-018",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "release_cadence",
  "question": "How often do we release?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "biweekly on Friday afternoons",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-018",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "oncall_rotation,alert_channel,incident_runbook_url",
  "question": "What do you know about our on-call setup? Tell me everything.",
  "gold_answer": "On-call is volunteer-based with weekly signup, alerts go to #aurora-pager, and runbook is at wiki.aurora.dev/incident-response.",
  "n71_answer": "volunteer-based with weekly signup",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-018",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "OutOfMemoryError: Java heap space.\nGC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'.\nHeap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z.\nRecommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "n71_answer": "OutOfMemoryError: Java heap space. GC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'. Heap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z. Recommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-019",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "drain.zephyra.io/proj-42",
  "n71_answer": "drain.zephyra.io/proj-42",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-019",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "korinth release --prod",
  "n71_answer": "korinth release --prod",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-019",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "monitoring_url",
  "question": "What's our monitoring URL?",
  "gold_answer": "observe.velturis.io/aurora",
  "n71_answer": "observe.velturis.io/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-019",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "infra/korinth/Dockerfile",
  "n71_answer": "infra/korinth/Dockerfile",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-019",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "ci_config",
  "question": "What's our CI configuration?",
  "gold_answer": "orinthal-build.yml",
  "n71_answer": "orinthal-build.yml",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-019",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23.\nStack trace shows recursive call in middleware_chain to request_validator to token_parser.\nRestart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "n71_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23. Stack trace shows recursive call in middleware_chain to request_validator to token_parser. Restart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-019",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Karnex Pack, Veltrion Pkg, Synthari Modules",
  "n71_answer": "Karnex Pack -> Veltrion Pkg -> Synthari Modules",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-019",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "logs.korinth.dev/aurora",
  "n71_answer": "drain.zephyra.io/proj-42",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-019",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "zephyra push --target prod",
  "n71_answer": "zephyra push --target prod",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-019",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "monitoring_url",
  "question": "What's our monitoring URL?",
  "gold_answer": "Uncertain — previously 'observe.velturis.io/aurora', but deploy_target changed",
  "n71_answer": "observe.velturis.io/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-019",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "Uncertain — previously 'infra/korinth/Dockerfile', but docker_image changed",
  "n71_answer": "infra/korinth/Dockerfile",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-019",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "ci_config",
  "question": "What's our CI configuration?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none — explicitly removed",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-019",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "oncall_rotation,alert_channel,incident_runbook_url",
  "question": "What do you know about our on-call setup? Tell me everything.",
  "gold_answer": "On-call is volunteer-based with weekly signup, alerts go to #aurora-pager, and runbook is at wiki.aurora.dev/incident-response.",
  "n71_answer": "Volunteer-based with weekly signup",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-019",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23.\nStack trace shows recursive call in middleware_chain to request_validator to token_parser.\nRestart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "n71_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23. Stack trace shows recursive call in middleware_chain to request_validator to token_parser. Restart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-020",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "monitoring_url",
  "question": "What's our monitoring URL?",
  "gold_answer": "watch.korinth.dev/aurora",
  "n71_answer": "watch.korinth.dev/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-020",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "deploy/Dockerfile.thandrel",
  "n71_answer": "deploy/Dockerfile.thandrel",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-020",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "docker_image",
  "question": "What Docker image do we use?",
  "gold_answer": "zephyra-hub/aurora:prod",
  "n71_answer": "zephyra-hub/aurora:prod",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-020",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "thandrel ship --channel stable",
  "n71_answer": "thandrel ship --channel stable",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-020",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Thalnex.document('Model', {fields})",
  "n71_answer": "Thalnex.document('Model', {fields})",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-020",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-020",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "sprint_deadline",
  "question": "List all sprint deadlines we've had, from earliest to latest.",
  "gold_answer": "2025 April 11, 2025 June 20, 2025 July 4",
  "n71_answer": "2025 April 11  \n2025 June 20  \n2025 July 4",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-020",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "monitoring_url",
  "question": "What's our monitoring URL?",
  "gold_answer": "observe.velturis.io/aurora",
  "n71_answer": "unknown",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-020",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "deploy/Dockerfile.korinth",
  "n71_answer": "deploy/Dockerfile.thandrel",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-020",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "docker_image",
  "question": "What Docker image do we use?",
  "gold_answer": "Uncertain — previously 'zephyra-hub/aurora:prod', but deploy_target changed",
  "n71_answer": "zephyra-hub/aurora:prod",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-020",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "Uncertain — previously 'thandrel ship --channel stable', but ci_config changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-020",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-020",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "documentation_url,onboarding_guide,changelog_location",
  "question": "Where can I find project documentation? List everything you know.",
  "gold_answer": "Docs are at notion.so/aurora-team/docs, onboarding guide at ONBOARDING.md in repo root, and changelog at CHANGELOG.md in repo root.",
  "n71_answer": "notion.so/aurora-team/docs",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-020",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-021",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "escalation_contact",
  "question": "Who's the escalation contact?",
  "gold_answer": "Rachel Yoo",
  "n71_answer": "Rachel Yoo",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-021",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "weekly_report_recipient",
  "question": "Who receives the weekly report?",
  "gold_answer": "Taehyun Kim",
  "n71_answer": "Taehyun Kim",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-021",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "incident_runbook_url",
  "question": "Where's the incident runbook?",
  "gold_answer": "runbooks.aurora.internal/playbooks",
  "n71_answer": "runbooks.aurora.internal/playbooks",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-021",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23.\nStack trace shows recursive call in middleware_chain to request_validator to token_parser.\nRestart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "n71_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23. Stack trace shows recursive call in middleware_chain to request_validator to token_parser. Restart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-021",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Quorix Hub, Dranith Stack, Synthari Modules",
  "n71_answer": "Quorix Hub -> Synthari Modules -> Dranith Stack",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-021",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "escalation_contact",
  "question": "Who's the escalation contact?",
  "gold_answer": "Seungmin Oh",
  "n71_answer": "Seungmin Oh",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-021",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "weekly_report_recipient",
  "question": "Who receives the weekly report?",
  "gold_answer": "Uncertain — previously 'Taehyun Kim', but team_lead changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-021",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "incident_runbook_url",
  "question": "Where's the incident runbook?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "runbooks.aurora.internal/playbooks",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-021",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "documentation_url,onboarding_guide,changelog_location",
  "question": "Where can I find project documentation? List everything you know.",
  "gold_answer": "Docs are at confluence.aurora.dev/aurora, onboarding guide at github.com/aurora-team/onboarding, and changelog at docs.aurora.internal/changelog.",
  "n71_answer": "confluence.aurora.dev/aurora",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-021",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23.\nStack trace shows recursive call in middleware_chain to request_validator to token_parser.\nRestart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "n71_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23. Stack trace shows recursive call in middleware_chain to request_validator to token_parser. Restart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-022",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "user_session_ttl",
  "question": "What's the session TTL?",
  "gold_answer": "4 hours",
  "n71_answer": "4 hours",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-022",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "verithos-oauth-middleware v2.1",
  "n71_answer": "verithos-oauth-middleware v2.1",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-022",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "token_format",
  "question": "What token format do we use?",
  "gold_answer": "Cerberix-SAML-assertion",
  "n71_answer": "Cerberix-SAML-assertion",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-022",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "release_cadence",
  "question": "How often do we release?",
  "gold_answer": "continuous deploy on merge",
  "n71_answer": "Continuously on merge",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-022",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "ConnectionRefusedError: [Errno 111] Connection refused.\nService crysthene-db-primary on port 5432 is not accepting connections.\nLast successful heartbeat was at 2024-03-15T08:23:41Z.\nCheck firewall rules and verify that max_connections has not been exceeded.",
  "n71_answer": "ConnectionRefusedError: [Errno 111] Connection refused. Service crysthene-db-primary on port 5432 is not accepting connections. Last successful heartbeat was at 2024-03-15T08:23:41Z. Check firewall rules and verify that max_connections has not been exceeded.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-022",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "sprint_deadline",
  "question": "List all sprint deadlines we've had, from earliest to latest.",
  "gold_answer": "2025 April 25, 2025 May 9, 2025 July 4",
  "n71_answer": "2025 April 25, 2025 May 9, 2025 July 4",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-022",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "user_session_ttl",
  "question": "What's the session TTL?",
  "gold_answer": "1 hour",
  "n71_answer": "4 hours",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-022",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "kryptal-bearer-check v2.2",
  "n71_answer": "verithos-oauth-middleware v2.1",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-022",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "token_format",
  "question": "What token format do we use?",
  "gold_answer": "Uncertain — previously 'Cerberix-SAML-assertion', but auth_provider changed",
  "n71_answer": "Cerberix-SAML-assertion",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-022",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "release_cadence",
  "question": "How often do we release?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-022",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "oncall_rotation,alert_channel,incident_runbook_url",
  "question": "What do you know about our on-call setup? Tell me everything.",
  "gold_answer": "On-call is follow-the-sun across timezones, alerts go to #aurora-pager, and runbook is at runbooks.aurora.internal/playbooks.",
  "n71_answer": "follow-the-sun across timezones",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-022",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "ConnectionRefusedError: [Errno 111] Connection refused.\nService crysthene-db-primary on port 5432 is not accepting connections.\nLast successful heartbeat was at 2024-03-15T08:23:41Z.\nCheck firewall rules and verify that max_connections has not been exceeded.",
  "n71_answer": "ConnectionRefusedError: [Errno 111] Connection refused. Service crysthene-db-primary on port 5432 is not accepting connections. Last successful heartbeat was at 2024-03-15T08:23:41Z. Check firewall rules and verify that max_connections has not been exceeded.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-023",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "approval_authority",
  "question": "Who has approval authority?",
  "gold_answer": "VP Helen Chen",
  "n71_answer": "VP Helen Chen",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-023",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "code_reviewer",
  "question": "Who reviews our code?",
  "gold_answer": "Seoyun Choi",
  "n71_answer": "Seoyun Choi",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-023",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "database",
  "question": "What database are we using?",
  "gold_answer": "Thalnex Store",
  "n71_answer": "I don't have that information.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-023",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "I don't have that information.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-023",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Pyronis Bundle, Zelthra Lib, Synthari Modules",
  "n71_answer": "Zelthra Lib -> Pyronis Bundle -> Synthari Modules",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-023",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "approval_authority",
  "question": "Who has approval authority?",
  "gold_answer": "VP Taehyun Kim",
  "n71_answer": "VP Helen Chen",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-023",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "code_reviewer",
  "question": "Who reviews our code?",
  "gold_answer": "Uncertain — previously 'Seoyun Choi', but team_lead changed",
  "n71_answer": "Seoyun Choi",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-023",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "database",
  "question": "What database are we using?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none — explicitly removed",
  "judge_pass": true,
  "pass_type": "trivial",
  "failure_class": ""
 },
 {
  "episode": "sw-023",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "oncall_rotation,alert_channel,incident_runbook_url",
  "question": "What do you know about our on-call setup? Tell me everything.",
  "gold_answer": "On-call is paired oncall (primary + secondary), alerts go to #aurora-alerts, and runbook is at runbooks.aurora.internal/playbooks.",
  "n71_answer": "The team's on-call rotation is paired oncall (primary + secondary). This way, there is someone to back up the primary in case things get hectic.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-023",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-024",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "mTLS Client Cert",
  "n71_answer": "mTLS Client Cert",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-024",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "oathkeeper-mtls-proxy v4.1",
  "n71_answer": "kryptal-bearer-check v2.2",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-024",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "user_session_ttl",
  "question": "What's the session TTL?",
  "gold_answer": "4 hours",
  "n71_answer": "4 hours",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-024",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "oncall_rotation",
  "question": "What's our on-call rotation?",
  "gold_answer": "follow-the-sun across timezones",
  "n71_answer": "follow-the-sun across timezones",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-024",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-024",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Karnex Pack, Quorix Hub, Veltrion Pkg",
  "n71_answer": "Karnex Pack -> Quorix Hub -> Veltrion Pkg",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-024",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "JWT Bearer",
  "n71_answer": "JWT Bearer",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-024",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "kryptal-bearer-check v2.2",
  "n71_answer": "kryptal-bearer-check v2.2",
  "judge_pass": true,
  "pass_type": "trivial",
  "failure_class": ""
 },
 {
  "episode": "sw-024",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "user_session_ttl",
  "question": "What's the session TTL?",
  "gold_answer": "Uncertain — previously '4 hours', but auth_provider changed",
  "n71_answer": "4 hours",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-024",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "oncall_rotation",
  "question": "What's our on-call rotation?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "follow-the-sun across timezones",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-024",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "branch_strategy,default_branch,release_cadence",
  "question": "What do you know about our release process? Tell me everything.",
  "gold_answer": "We use GitHub Flow with PR reviews branching on trunk, releasing sprint-end release every 2 weeks.",
  "n71_answer": "sprint-end releases every 2 weeks",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-024",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-025",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "escalation_contact",
  "question": "Who's the escalation contact?",
  "gold_answer": "Kevin Park",
  "n71_answer": "Kevin Park",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-025",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "code_reviewer",
  "question": "Who reviews our code?",
  "gold_answer": "Seoyun Choi",
  "n71_answer": "Seoyun Choi",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-025",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "incident_runbook_url",
  "question": "Where's the incident runbook?",
  "gold_answer": "github.com/aurora-team/runbooks",
  "n71_answer": "github.com/aurora-team/runbooks",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-025",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-025",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "sprint_deadline",
  "question": "List all sprint deadlines we've had, from earliest to latest.",
  "gold_answer": "2025 March 28, 2025 June 6, 2025 June 20",
  "n71_answer": "2025 March 28, 2025 June 6, 2025 June 20",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-025",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "escalation_contact",
  "question": "Who's the escalation contact?",
  "gold_answer": "Jiwon Han",
  "n71_answer": "Jiwon Han",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-025",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "code_reviewer",
  "question": "Who reviews our code?",
  "gold_answer": "Uncertain — previously 'Seoyun Choi', but team_lead changed",
  "n71_answer": "Seoyun Choi",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-025",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "incident_runbook_url",
  "question": "Where's the incident runbook?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "github.com/aurora-team/runbooks",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-025",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "branch_strategy,default_branch,release_cadence",
  "question": "What do you know about our release process? Tell me everything.",
  "gold_answer": "We use ship/show/ask branching branching on main, releasing sprint-end release every 2 weeks.",
  "n71_answer": "- The team releases sprint-end releases every 2 weeks.  \n- The release cadence is every 2 weeks (current).  \n- This approach keeps things fresh and allows for quick iteration.  \n- For release planning, it is helpful to establish clear goals for each sprint to prioritize work.  \n- Using visual task management tools enhances communication and tracking by showing real-time progress and allowing adjustments.  \n- Regular check-ins help keep the team aligned and address roadblocks promptly.  \n- As deadlines approach, prioritizing tasks based on impact and urgency ensures critical items are tackled first.  \n- Frequent reviews or stand-ups are beneficial to monitor progress and discuss potential roadblocks.  \n- Tools like Kanban boards or Gantt charts can visually represent task status and timelines to keep everyone on track.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-025",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-026",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "project_structure",
  "question": "What's our project structure?",
  "gold_answer": "monorepo with /apps and /packages",
  "n71_answer": "layered with /domain, /infra, /api",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-026",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "build_command",
  "question": "What's the build command?",
  "gold_answer": "tarvex compile --release",
  "n71_answer": "karnex build --analyze",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-026",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Pyronis Build",
  "n71_answer": "Karnex Bundler",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-026",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "test_command",
  "question": "What's the test command?",
  "gold_answer": "karnex test --coverage",
  "n71_answer": "karnex test --coverage",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-026",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "documentation_url",
  "question": "Where are our docs?",
  "gold_answer": "docs.aurora.internal/wiki",
  "n71_answer": "docs.aurora.internal/wiki",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-026",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "OutOfMemoryError: Java heap space.\nGC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'.\nHeap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z.\nRecommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "n71_answer": "OutOfMemoryError: Java heap space. GC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'. Heap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z. Recommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-026",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Karnex Pack, Pyronis Bundle, Veltrion Pkg",
  "n71_answer": "Karnex Pack, Pyronis Bundle, Veltrion Pkg",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-026",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "project_structure",
  "question": "What's our project structure?",
  "gold_answer": "layered with /domain, /infra, /api",
  "n71_answer": "unknown",
  "judge_pass": false,
  "pass_type": "never_knew",
  "failure_class": ""
 },
 {
  "episode": "sw-026",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "build_command",
  "question": "What's the build command?",
  "gold_answer": "karnex build --analyze",
  "n71_answer": "tarvex compile --release",
  "judge_pass": false,
  "pass_type": "never_knew",
  "failure_class": ""
 },
 {
  "episode": "sw-026",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Uncertain — previously 'Pyronis Build', but framework changed",
  "n71_answer": "Veltrion Pkg",
  "judge_pass": false,
  "pass_type": "never_knew",
  "failure_class": ""
 },
 {
  "episode": "sw-026",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "test_command",
  "question": "What's the test command?",
  "gold_answer": "Uncertain — previously 'karnex test --coverage', but test_framework changed",
  "n71_answer": "karnex test --coverage",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-026",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "documentation_url",
  "question": "Where are our docs?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-026",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "oncall_rotation,alert_channel,incident_runbook_url",
  "question": "What do you know about our on-call setup? Tell me everything.",
  "gold_answer": "On-call is follow-the-sun across timezones, alerts go to #zenith-incidents, and runbook is at notion.so/aurora-team/incidents.",
  "n71_answer": "follow-the-sun across timezones",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-026",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "OutOfMemoryError: Java heap space.\nGC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'.\nHeap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z.\nRecommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "n71_answer": "MEME sw026 session013.md: # Conversation session â 2023/03/19 (Sun) 17:33\n(session id: sharegpt_24708)\n\n**User:** can you give me tips on to program a plc PLC Course w/ Motion Control Kinetix Allen Bradle, the plc allen bradley Motion Control rockwell automation with the most advanced features knows on the planet as of right now\n\n**Assistant:** <div class=\"markdown prose w-full break-words dark:prose-invert light\"><p>Here are some tips for programming a PLC with Allen Bradley Motion Control:</",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-027",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "connection_string",
  "question": "What's the database connection string?",
  "gold_answer": "pyravar://data-primary:3306/aurora",
  "n71_answer": "pyravar://data-primary:3306/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-027",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Thalnex.graph('Model', edges)",
  "n71_answer": "Thalnex.graph('Model', edges)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-027",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "backup_schedule",
  "question": "What's our backup schedule?",
  "gold_answer": "weekly Sunday 2am UTC",
  "n71_answer": "Weekly Sunday 2am UTC",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-027",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "docker_image",
  "question": "What Docker image do we use?",
  "gold_answer": "zephyra-hub/aurora:prod",
  "n71_answer": "zephyra-hub/aurora:prod",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-027",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-027",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Cryptara Vault, Selvex Secrets, Orinthal Sealed",
  "n71_answer": "Cryptara Vault -> Selvex Secrets -> Orinthal Sealed",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-027",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "connection_string",
  "question": "What's the database connection string?",
  "gold_answer": "thalnex://store-backup:8529/aurora",
  "n71_answer": "pyravar://data-primary:3306/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-027",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Zyndra.struct('Model', types)",
  "n71_answer": "Thalnex.graph('Model', edges)",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-027",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "backup_schedule",
  "question": "What's our backup schedule?",
  "gold_answer": "Uncertain — previously 'weekly Sunday 2am UTC', but database changed",
  "n71_answer": "weekly Sunday 2am UTC",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-027",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "docker_image",
  "question": "What Docker image do we use?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none — explicitly removed",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-027",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "oncall_rotation,alert_channel,incident_runbook_url",
  "question": "What do you know about our on-call setup? Tell me everything.",
  "gold_answer": "On-call is paired oncall (primary + secondary), alerts go to #vortex-alerts, and runbook is at docs.aurora.internal/runbooks.",
  "n71_answer": "paired oncall",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-027",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-028",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "docker_image",
  "question": "What Docker image do we use?",
  "gold_answer": "korinth-repo/aurora:stable",
  "n71_answer": "korinth-repo/aurora:stable",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-028",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "korinth release --prod",
  "n71_answer": "korinth release --prod",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-028",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "drain.zephyra.io/proj-42",
  "n71_answer": "drain.zephyra.io/proj-42",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-028",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "infra/korinth/Dockerfile",
  "n71_answer": "infra/korinth/Dockerfile",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-028",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "oncall_rotation",
  "question": "What's our on-call rotation?",
  "gold_answer": "volunteer-based with weekly signup",
  "n71_answer": "volunteer-based with weekly signup",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-028",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-028",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Tarvex Mod, Quorix Hub, Veltrion Pkg",
  "n71_answer": "Tarvex Mod, Quorix Hub, Veltrion Pkg",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-028",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "docker_image",
  "question": "What Docker image do we use?",
  "gold_answer": "thandrel-images/aurora:release",
  "n71_answer": "korinth-repo/aurora:stable",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-028",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "zephyra push --target prod",
  "n71_answer": "korinth release --prod",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-028",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "Uncertain — previously 'drain.zephyra.io/proj-42', but deploy_target changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-028",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "Uncertain — previously 'infra/korinth/Dockerfile', but docker_image changed",
  "n71_answer": "infra/korinth/Dockerfile",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-028",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "oncall_rotation",
  "question": "What's our on-call rotation?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "volunteer-based with weekly signup",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-028",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "branch_strategy,default_branch,release_cadence",
  "question": "What do you know about our release process? Tell me everything.",
  "gold_answer": "We use GitHub Flow with PR reviews branching on release, releasing monthly first Monday.",
  "n71_answer": "Our Project — release_cadence: monthly on the first Monday (current)",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-028",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-029",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "connection_string",
  "question": "What's the database connection string?",
  "gold_answer": "thalnex://store-backup:8529/aurora",
  "n71_answer": "thalnex://store-backup:8529/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-029",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Keldaris.object('Model', props)",
  "n71_answer": "Keldaris.object('Model', props)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-029",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "backup_schedule",
  "question": "What's our backup schedule?",
  "gold_answer": "twice daily at 2am and 2pm UTC",
  "n71_answer": "twice daily at 2am and 2pm UTC",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-029",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "pipe.thandrel.io/aurora-logs",
  "n71_answer": "pipe.thandrel.io/aurora-logs",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-029",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-029",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Kerberos Trove, Sigilix Manager, Pyronos Crypt",
  "n71_answer": "Kerberos Trove -> Pyronos Crypt -> Sigilix Manager",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-029",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "connection_string",
  "question": "What's the database connection string?",
  "gold_answer": "thalnex://store-prod:8529/aurora",
  "n71_answer": "thalnex://store-prod:8529/aurora",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-029",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Thalnex.document('Model', {fields})",
  "n71_answer": "Thalnex.document('Model', {fields})",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-029",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "backup_schedule",
  "question": "What's our backup schedule?",
  "gold_answer": "Uncertain — previously 'twice daily at 2am and 2pm UTC', but database changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-029",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-029",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "documentation_url,onboarding_guide,changelog_location",
  "question": "Where can I find project documentation? List everything you know.",
  "gold_answer": "Docs are at docs.aurora.internal/wiki, onboarding guide at notion.so/aurora-team/new-hire, and changelog at CHANGELOG.md in repo root.",
  "n71_answer": "docs.aurora.internal/wiki",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-029",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-030",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "orm_library",
  "question": "What ORM do we use?",
  "gold_answer": "Keldaris Link",
  "n71_answer": "Keldaris Link",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-030",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Keldaris.node('Model', shape)",
  "n71_answer": "Keldaris.node('Model', shape)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-030",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "connection_string",
  "question": "What's the database connection string?",
  "gold_answer": "pyravar://data-primary:3306/aurora",
  "n71_answer": "pyravar://data-primary:3306/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-030",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "default_branch",
  "question": "What's our default branch?",
  "gold_answer": "trunk",
  "n71_answer": "trunk",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-030",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23.\nStack trace shows recursive call in middleware_chain to request_validator to token_parser.\nRestart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "n71_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23. Stack trace shows recursive call in middleware_chain to request_validator to token_parser. Restart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-030",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Tarvex Mod, Zelthra Lib, Veltrion Pkg",
  "n71_answer": "Tarvex Mod -> Zelthra Lib -> Veltrion Pkg",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-030",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "orm_library",
  "question": "What ORM do we use?",
  "gold_answer": "Thalnex Query",
  "n71_answer": "Thalnex Query",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-030",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Thalnex.document('Model', {fields})",
  "n71_answer": "Thalnex.document('Model', {fields})",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-030",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "connection_string",
  "question": "What's the database connection string?",
  "gold_answer": "Uncertain — previously 'pyravar://data-primary:3306/aurora', but database changed",
  "n71_answer": "pyravar://data-primary:3306/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-030",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "default_branch",
  "question": "What's our default branch?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-030",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "design_system,test_coverage_target,code_review_policy",
  "question": "What are our code quality standards? List everything you know.",
  "gold_answer": "We use Veltrion UI Kit v2, target 90% for core modules test coverage, and require 2 approvals + security review for auth/ for reviews.",
  "n71_answer": "- test_coverage_target: 90% for core modules  \n- code_review_policy: 2 approvals + a security review for the auth/ directory",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-030",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23.\nStack trace shows recursive call in middleware_chain to request_validator to token_parser.\nRestart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "n71_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23. Stack trace shows recursive call in middleware_chain to request_validator to token_parser. Restart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-031",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "mTLS Client Cert",
  "n71_answer": "mTLS Client Cert",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-031",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "oathkeeper-mtls-proxy v4.1",
  "n71_answer": "oathkeeper-mtls-proxy v4.1",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-031",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "token_format",
  "question": "What token format do we use?",
  "gold_answer": "Zenthos-Session-v1",
  "n71_answer": "Zenthos-Session-v1",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-031",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Crysthene.define('Model', {fields})",
  "n71_answer": "Crysthene.define('Model', {fields})",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-031",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23.\nStack trace shows recursive call in middleware_chain to request_validator to token_parser.\nRestart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "n71_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23. Stack trace shows recursive call in middleware_chain to request_validator to token_parser. Restart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-031",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Cryptara Vault, Sigilix Manager, Velthri Keys",
  "n71_answer": "Sigilix Manager -> Cryptara Vault -> Velthri Keys",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-031",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "Session Cookie + CSRF",
  "n71_answer": "Session Cookie + CSRF",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-031",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "zenthos-session-middleware v1.8",
  "n71_answer": "zenthos-session-middleware v1.8",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-031",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "token_format",
  "question": "What token format do we use?",
  "gold_answer": "Uncertain — previously 'Zenthos-Session-v1', but auth_provider changed",
  "n71_answer": "Zenthos-Session-v1",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-031",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-031",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "documentation_url,onboarding_guide,changelog_location",
  "question": "Where can I find project documentation? List everything you know.",
  "gold_answer": "Docs are at docs.aurora.internal/wiki, onboarding guide at wiki.aurora.dev/getting-started, and changelog at docs.aurora.internal/changelog.",
  "n71_answer": "docs.aurora.internal/wiki",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-031",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23.\nStack trace shows recursive call in middleware_chain to request_validator to token_parser.\nRestart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "n71_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23. Stack trace shows recursive call in middleware_chain to request_validator to token_parser. Restart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-032",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "token_format",
  "question": "What token format do we use?",
  "gold_answer": "Cerberix-SAML-assertion",
  "n71_answer": "Cerberix-SAML-assertion",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-032",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "kryptal-bearer-check v2.2",
  "n71_answer": "kryptal-bearer-check v2.2",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-032",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "user_session_ttl",
  "question": "What's the session TTL?",
  "gold_answer": "8 hours",
  "n71_answer": "8 hours",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-032",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "test_framework",
  "question": "What testing framework do we use?",
  "gold_answer": "Quorix Spec",
  "n71_answer": "Quorix Spec",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-032",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "OutOfMemoryError: Java heap space.\nGC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'.\nHeap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z.\nRecommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "n71_answer": "OutOfMemoryError: Java heap space. GC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'. Heap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z. Recommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-032",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Sigilix Manager, Velthri Keys, Orinthal Sealed",
  "n71_answer": "Sigilix Manager -> Orinthal Sealed -> Velthri Keys",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-032",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "token_format",
  "question": "What token format do we use?",
  "gold_answer": "Oathkeeper-mTLS-cert",
  "n71_answer": "Oathkeeper-mTLS-cert",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-032",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "verithos-pkce-guard v1.4",
  "n71_answer": "verithos-pkce-guard v1.4",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-032",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "user_session_ttl",
  "question": "What's the session TTL?",
  "gold_answer": "Uncertain — previously '8 hours', but auth_provider changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-032",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "test_framework",
  "question": "What testing framework do we use?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-032",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "oncall_rotation,alert_channel,incident_runbook_url",
  "question": "What do you know about our on-call setup? Tell me everything.",
  "gold_answer": "On-call is paired oncall (primary + secondary), alerts go to #aurora-alerts, and runbook is at runbooks.aurora.internal/playbooks.",
  "n71_answer": "Our Project — on_call_rotation_type: paired oncall (current)",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-032",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "OutOfMemoryError: Java heap space.\nGC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'.\nHeap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z.\nRecommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "n71_answer": "OutOfMemoryError: Java heap space. GC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'. Heap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z. Recommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-033",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "weekly_report_recipient",
  "question": "Who receives the weekly report?",
  "gold_answer": "Hyunwoo Nam",
  "n71_answer": "Hyunwoo Nam",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-033",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "code_reviewer",
  "question": "Who reviews our code?",
  "gold_answer": "Seoyun Choi",
  "n71_answer": "Seoyun Choi",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-033",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "stream.velturis.io/aurora",
  "n71_answer": "stream.velturis.io/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-033",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "ConnectionRefusedError: [Errno 111] Connection refused.\nService crysthene-db-primary on port 5432 is not accepting connections.\nLast successful heartbeat was at 2024-03-15T08:23:41Z.\nCheck firewall rules and verify that max_connections has not been exceeded.",
  "n71_answer": "I don't have that information.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-033",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Karnex Pack, Dranith Stack, Synthari Modules",
  "n71_answer": "Karnex Pack -> Dranith Stack -> Synthari Modules",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-033",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "weekly_report_recipient",
  "question": "Who receives the weekly report?",
  "gold_answer": "James Lee",
  "n71_answer": "James Lee",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-033",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "code_reviewer",
  "question": "Who reviews our code?",
  "gold_answer": "Uncertain — previously 'Seoyun Choi', but team_lead changed",
  "n71_answer": "Seoyun Choi",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-033",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "stream.velturis.io/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-033",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "oncall_rotation,alert_channel,incident_runbook_url",
  "question": "What do you know about our on-call setup? Tell me everything.",
  "gold_answer": "On-call is paired oncall (primary + secondary), alerts go to #aurora-pager, and runbook is at wiki.aurora.dev/incident-response.",
  "n71_answer": "paired oncall (primary and secondary)",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-033",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "ConnectionRefusedError: [Errno 111] Connection refused.\nService crysthene-db-primary on port 5432 is not accepting connections.\nLast successful heartbeat was at 2024-03-15T08:23:41Z.\nCheck firewall rules and verify that max_connections has not been exceeded.",
  "n71_answer": "ConnectionRefusedError: [Errno 111] Connection refused. Service crysthene-db-primary on port 5432 is not accepting connections. Last successful heartbeat was at 2024-03-15T08:23:41Z. Check firewall rules and verify that max_connections has not been exceeded.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-034",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Pyronis Build",
  "n71_answer": "Dranith Compiler",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-034",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "build_command",
  "question": "What's the build command?",
  "gold_answer": "pyronis pack --quick",
  "n71_answer": "dranith compile --emit bundle",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-034",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "test_framework",
  "question": "What testing framework do we use?",
  "gold_answer": "Tarvex Verify",
  "n71_answer": "Tarvex Verify",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-034",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "test_command",
  "question": "What's the test command?",
  "gold_answer": "tarvex verify --all",
  "n71_answer": "tarvex verify --all",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-034",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "backup_schedule",
  "question": "What's our backup schedule?",
  "gold_answer": "every 12 hours with 7-day retention",
  "n71_answer": "every 12 hours",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-034",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-034",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Cryptara Vault, Selvex Secrets, Velthri Keys",
  "n71_answer": "Cryptara Vault, Selvex Secrets, Velthri Keys",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-034",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Dranith Compiler",
  "n71_answer": "Dranith Compiler",
  "judge_pass": true,
  "pass_type": "trivial",
  "failure_class": ""
 },
 {
  "episode": "sw-034",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "build_command",
  "question": "What's the build command?",
  "gold_answer": "dranith compile --emit bundle",
  "n71_answer": "dranith compile --emit bundle",
  "judge_pass": true,
  "pass_type": "trivial",
  "failure_class": ""
 },
 {
  "episode": "sw-034",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "test_framework",
  "question": "What testing framework do we use?",
  "gold_answer": "Uncertain — previously 'Tarvex Verify', but framework changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-034",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "test_command",
  "question": "What's the test command?",
  "gold_answer": "Uncertain — previously 'tarvex verify --all', but test_framework changed",
  "n71_answer": "tarvex verify --all",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-034",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "backup_schedule",
  "question": "What's our backup schedule?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "trivial",
  "failure_class": ""
 },
 {
  "episode": "sw-034",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "design_system,test_coverage_target,code_review_policy",
  "question": "What are our code quality standards? List everything you know.",
  "gold_answer": "We use Veltrion UI Kit v2, target 80% minimum overall test coverage, and require 2 approvals + security review for auth/ for reviews.",
  "n71_answer": "80% minimum overall test coverage",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-034",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-035",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "backup_schedule",
  "question": "What's our backup schedule?",
  "gold_answer": "daily at midnight KST",
  "n71_answer": "daily at midnight KST",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-035",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Pyravar.record('Model', schema)",
  "n71_answer": "Pyravar.record('Model', schema)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-035",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "migration_tool",
  "question": "What migration tool do we use?",
  "gold_answer": "Zyndra Alter",
  "n71_answer": "Zyndra Alter",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-035",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "ci_config",
  "question": "What's our CI configuration?",
  "gold_answer": "orinthal-build.yml",
  "n71_answer": "orinthal-build.yml",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-035",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "OutOfMemoryError: Java heap space.\nGC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'.\nHeap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z.\nRecommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "n71_answer": "OutOfMemoryError: Java heap space. GC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'. Heap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z. Recommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-035",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Karnex Pack, Tarvex Mod, Zelthra Lib",
  "n71_answer": "Karnex Pack, Tarvex Mod, Zelthra Lib",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-035",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "backup_schedule",
  "question": "What's our backup schedule?",
  "gold_answer": "daily at 3am UTC",
  "n71_answer": "daily at 3am UTC",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-035",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Keldaris.node('Model', shape)",
  "n71_answer": "Keldaris.node('Model', shape)",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-035",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "migration_tool",
  "question": "What migration tool do we use?",
  "gold_answer": "Uncertain — previously 'Zyndra Alter', but database changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-035",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "ci_config",
  "question": "What's our CI configuration?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-035",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "documentation_url,onboarding_guide,changelog_location",
  "question": "Where can I find project documentation? List everything you know.",
  "gold_answer": "Docs are at confluence.aurora.dev/aurora, onboarding guide at docs.aurora.internal/onboarding, and changelog at CHANGELOG.md in repo root.",
  "n71_answer": "confluence.aurora.dev/aurora",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-035",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "OutOfMemoryError: Java heap space.\nGC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'.\nHeap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z.\nRecommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "n71_answer": "OutOfMemoryError: Java heap space. GC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'. Heap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z. Recommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-036",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "docker_image",
  "question": "What Docker image do we use?",
  "gold_answer": "narvex-registry/aurora:latest",
  "n71_answer": "narvex-registry/aurora:latest",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-036",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "thandrel ship --channel stable",
  "n71_answer": "thandrel ship --channel stable",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-036",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "staging_url",
  "question": "What's our staging URL?",
  "gold_answer": "dev.orinthal.dev/aurora",
  "n71_answer": "dev.orinthal.dev/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-036",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "infra/narvex/Dockerfile",
  "n71_answer": "infra/narvex/Dockerfile",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-036",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "design_system",
  "question": "What design system do we use?",
  "gold_answer": "Veltrion UI Kit v2",
  "n71_answer": "Veltrion UI Kit v2",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-036",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-036",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Selvex Secrets, Kerberos Trove, Orinthal Sealed",
  "n71_answer": "Selvex Secrets -> Kerberos Trove -> Orinthal Sealed",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-036",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "docker_image",
  "question": "What Docker image do we use?",
  "gold_answer": "korinth-repo/aurora:stable",
  "n71_answer": "narvex-registry/aurora:latest",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-036",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "orinthal publish --tier sandbox",
  "n71_answer": "thandrel ship --channel stable",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-036",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "staging_url",
  "question": "What's our staging URL?",
  "gold_answer": "Uncertain — previously 'dev.orinthal.dev/aurora', but deploy_target changed",
  "n71_answer": "dev.orinthal.dev/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-036",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "Uncertain — previously 'infra/narvex/Dockerfile', but docker_image changed",
  "n71_answer": "infra/narvex/Dockerfile",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-036",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "design_system",
  "question": "What design system do we use?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-036",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "oncall_rotation,alert_channel,incident_runbook_url",
  "question": "What do you know about our on-call setup? Tell me everything.",
  "gold_answer": "On-call is volunteer-based with weekly signup, alerts go to #zenith-incidents, and runbook is at github.com/aurora-team/runbooks.",
  "n71_answer": "The team's on-call rotation is volunteer-based with weekly signup. It is pretty flexible.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-036",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-037",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "logs.korinth.dev/aurora",
  "n71_answer": "logs.korinth.dev/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-037",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "thandrel ship --channel stable",
  "n71_answer": "thandrel ship --channel stable",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-037",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "staging_url",
  "question": "What's our staging URL?",
  "gold_answer": "staging.narvex.internal/aurora",
  "n71_answer": "staging.narvex.internal/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-037",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "infra/velturis/Dockerfile",
  "n71_answer": "infra/velturis/Dockerfile",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-037",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "docker_image",
  "question": "What Docker image do we use?",
  "gold_answer": "thandrel-images/aurora:release",
  "n71_answer": "thandrel-images/aurora:release",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-037",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-037",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Verisigil Lock, Sigilix Manager, Velthri Keys",
  "n71_answer": "Sigilix Manager, Velthri Keys",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-037",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "sink.orinthal.dev/aurora",
  "n71_answer": "logs.korinth.dev/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-037",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "velturis deploy --region ap-northeast",
  "n71_answer": "thandrel ship --channel stable",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-037",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "staging_url",
  "question": "What's our staging URL?",
  "gold_answer": "Uncertain — previously 'staging.narvex.internal/aurora', but deploy_target changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-037",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "Uncertain — previously 'infra/velturis/Dockerfile', but docker_image changed",
  "n71_answer": "infra/velturis/Dockerfile",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-037",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "docker_image",
  "question": "What Docker image do we use?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-037",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "meeting_day,slack_channel,standup_format",
  "question": "How does our team coordinate? Tell me everything you know.",
  "gold_answer": "Team meets every Friday 9:30am, uses #aurora-dev for chat, and standups are Monday/Thursday sync, async rest.",
  "n71_answer": "The team coordinates using a standup format of Monday/Thursday sync meetings with asynchronous communication for the rest of the time. They use the #aurora-dev channel for team communication. The onboarding guide is centralized at wiki.aurora.dev/getting-started to help new team members integrate. The branching strategy is trunk-based with feature flags. Deployment is managed on the Zephyra Platform with the deploy command \"thandrel ship --channel stable\" and CI config \"thandrel-flow.toml.\" The secret manager currently in use is Velthri Keys. Monitoring is done via the dashboard at status.orinthal.dev/aurora.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-037",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-038",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "project_structure",
  "question": "What's our project structure?",
  "gold_answer": "modular with /modules and /shared",
  "n71_answer": "monorepo with /apps and /packages",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-038",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "test_command",
  "question": "What's the test command?",
  "gold_answer": "karnex test --ci",
  "n71_answer": "zelthra assert --suite all",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-038",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Karnex Bundler",
  "n71_answer": "unknown",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-038",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "build_command",
  "question": "What's the build command?",
  "gold_answer": "karnex build --prod",
  "n71_answer": "karnex build --prod",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-038",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "secret_manager",
  "question": "What secret manager do we use?",
  "gold_answer": "Orinthal Sealed",
  "n71_answer": "Orinthal Sealed",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-038",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-038",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Pyronis Bundle, Veltrion Pkg, Synthari Modules",
  "n71_answer": "Pyronis Bundle, Veltrion Pkg, Synthari Modules",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-038",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "project_structure",
  "question": "What's our project structure?",
  "gold_answer": "monorepo with /apps and /packages",
  "n71_answer": "/modular with /modules and /shared",
  "judge_pass": false,
  "pass_type": "never_knew",
  "failure_class": ""
 },
 {
  "episode": "sw-038",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "test_command",
  "question": "What's the test command?",
  "gold_answer": "zelthra assert --suite all",
  "n71_answer": "karnex test --ci",
  "judge_pass": false,
  "pass_type": "never_knew",
  "failure_class": ""
 },
 {
  "episode": "sw-038",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Uncertain — previously 'Karnex Bundler', but framework changed",
  "n71_answer": "Karnex Bundler",
  "judge_pass": false,
  "pass_type": "never_knew",
  "failure_class": ""
 },
 {
  "episode": "sw-038",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "build_command",
  "question": "What's the build command?",
  "gold_answer": "Uncertain — previously 'karnex build --prod', but build_tool changed",
  "n71_answer": "karnex build --prod",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-038",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "secret_manager",
  "question": "What secret manager do we use?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-038",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "documentation_url,onboarding_guide,changelog_location",
  "question": "Where can I find project documentation? List everything you know.",
  "gold_answer": "Docs are at aurora-docs.readthedocs.io, onboarding guide at wiki.aurora.dev/getting-started, and changelog at docs.aurora.internal/changelog.",
  "n71_answer": "aurora-docs.readthedocs.io",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-038",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so.\nBacktrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select.\nCore dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z.\nLast known good build was commit 7a4f9e2; bisect against recent migrations.",
  "n71_answer": "SegmentationFault at 0x00007fff5a3b2c40 in vorathis_query_planner.so. Backtrace: frame 0 query_optimize, frame 1 plan_join, frame 2 execute_select. Core dump saved to /var/crash/core.aurora.18923 on 2024-06-12T03:55:09Z. Last known good build was commit 7a4f9e2; bisect against recent migrations.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-039",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "monitoring_url",
  "question": "What's our monitoring URL?",
  "gold_answer": "status.orinthal.dev/aurora",
  "n71_answer": "status.orinthal.dev/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-039",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "orinthal publish --tier sandbox",
  "n71_answer": "orinthal publish --tier sandbox",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-039",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "logs.korinth.dev/aurora",
  "n71_answer": "logs.korinth.dev/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-039",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "infra/orinthal/Dockerfile",
  "n71_answer": "infra/orinthal/Dockerfile",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-039",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "JWT Bearer",
  "n71_answer": "JWT Bearer",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-039",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23.\nStack trace shows recursive call in middleware_chain to request_validator to token_parser.\nRestart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "n71_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23. Stack trace shows recursive call in middleware_chain to request_validator to token_parser. Restart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-039",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "sprint_deadline",
  "question": "List all sprint deadlines we've had, from earliest to latest.",
  "gold_answer": "2025 May 9, 2025 June 6, 2025 June 20",
  "n71_answer": "2025 May 9, 2025 June 6, 2025 June 20",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-039",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "monitoring_url",
  "question": "What's our monitoring URL?",
  "gold_answer": "metrics.thandrel.io/aurora-prod",
  "n71_answer": "status.orinthal.dev/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-039",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "velturis deploy --region us-east",
  "n71_answer": "orinthal publish --tier sandbox",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-039",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "Uncertain — previously 'logs.korinth.dev/aurora', but deploy_target changed",
  "n71_answer": "logs.korinth.dev/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-039",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "Uncertain — previously 'infra/orinthal/Dockerfile', but docker_image changed",
  "n71_answer": "infra/orinthal/Dockerfile",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-039",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-039",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "design_system,test_coverage_target,code_review_policy",
  "question": "What are our code quality standards? List everything you know.",
  "gold_answer": "We use Moxira Components v3, target 90% for core modules test coverage, and require 1 approval + CI green for reviews.",
  "n71_answer": "1 approval + CI green",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-039",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23.\nStack trace shows recursive call in middleware_chain to request_validator to token_parser.\nRestart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "n71_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23. Stack trace shows recursive call in middleware_chain to request_validator to token_parser. Restart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-040",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "user_session_ttl",
  "question": "What's the session TTL?",
  "gold_answer": "24 hours",
  "n71_answer": "24 hours",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-040",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "phalanx-hmac-validator v2.0",
  "n71_answer": "phalanx-hmac-validator v2.0",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-040",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "Session Cookie + CSRF",
  "n71_answer": "Session Cookie + CSRF",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-040",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Pyravar.record('Model', schema)",
  "n71_answer": "Pyravar.record('Model', schema)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-040",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23.\nStack trace shows recursive call in middleware_chain to request_validator to token_parser.\nRestart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "n71_answer": "I don't have that information.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-040",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Quorix Hub, Dranith Stack, Synthari Modules",
  "n71_answer": "Quorix Hub, Dranith Stack, Synthari Modules",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-040",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "user_session_ttl",
  "question": "What's the session TTL?",
  "gold_answer": "4 hours",
  "n71_answer": "4 hours",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-040",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "oathkeeper-mtls-proxy v4.1",
  "n71_answer": "unknown",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-040",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "Uncertain — previously 'Session Cookie + CSRF', but auth_provider changed",
  "n71_answer": "mTLS Client Cert",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-040",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-040",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "branch_strategy,default_branch,release_cadence",
  "question": "What do you know about our release process? Tell me everything.",
  "gold_answer": "We use ship/show/ask branching branching on release, releasing monthly first Monday.",
  "n71_answer": "The team releases monthly on the first Monday.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-040",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23.\nStack trace shows recursive call in middleware_chain to request_validator to token_parser.\nRestart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "n71_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23. Stack trace shows recursive call in middleware_chain to request_validator to token_parser. Restart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-041",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "backup_schedule",
  "question": "What's our backup schedule?",
  "gold_answer": "daily at 3am UTC",
  "n71_answer": "daily at 3am UTC",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-041",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Vorathis.collection('Model', spec)",
  "n71_answer": "Vorathis.collection('Model', spec)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-041",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "connection_string",
  "question": "What's the database connection string?",
  "gold_answer": "pyravar://data-primary:3306/aurora",
  "n71_answer": "pyravar://data-primary:3306/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-041",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "onboarding_guide",
  "question": "Where's the onboarding guide?",
  "gold_answer": "docs.aurora.internal/onboarding",
  "n71_answer": "docs.internal/onboarding",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-041",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "OutOfMemoryError: Java heap space.\nGC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'.\nHeap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z.\nRecommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "n71_answer": "OutOfMemoryError: Java heap space. GC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'. Heap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z. Recommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-041",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "sprint_deadline",
  "question": "List all sprint deadlines we've had, from earliest to latest.",
  "gold_answer": "2025 March 28, 2025 April 25, 2025 May 9",
  "n71_answer": "2023-03-02, 2023-03-28, 2023-04-25, 2025-04-25, 2025-05-09",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-041",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "backup_schedule",
  "question": "What's our backup schedule?",
  "gold_answer": "daily at midnight KST",
  "n71_answer": "daily at midnight KST",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-041",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Pyravar.entity('Model', attrs)",
  "n71_answer": "Pyravar.entity('Model', attrs)",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-041",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "connection_string",
  "question": "What's the database connection string?",
  "gold_answer": "Uncertain — previously 'pyravar://data-primary:3306/aurora', but database changed",
  "n71_answer": "pyravar://data-primary:3306/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-041",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "onboarding_guide",
  "question": "Where's the onboarding guide?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "docs.aurora.internal/onboarding",
  "judge_pass": false,
  "pass_type": "never_knew",
  "failure_class": ""
 },
 {
  "episode": "sw-041",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "meeting_day,slack_channel,standup_format",
  "question": "How does our team coordinate? Tell me everything you know.",
  "gold_answer": "Team meets every Wednesday 10am, uses #aurora-core for chat, and standups are 15-min daily video call.",
  "n71_answer": "- Team Meeting: every Wednesday at 10am  \n- Team Meeting Purpose: sync up on progress and blockers  \n- Team Standup Format: 15-minute daily video call  \n- Team Channel: #aurora-core  \n- Code Review Policy: any team member can approve  \n- Branching Strategy: trunk-based with feature flags  \n- On-call Rotation: biweekly rotation",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-041",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "OutOfMemoryError: Java heap space.\nGC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'.\nHeap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z.\nRecommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "n71_answer": "OutOfMemoryError: Java heap space. GC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'. Heap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z. Recommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-042",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "approval_authority",
  "question": "Who has approval authority?",
  "gold_answer": "VP Taehyun Kim",
  "n71_answer": "VP Taehyun Kim",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-042",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "code_reviewer",
  "question": "Who reviews our code?",
  "gold_answer": "Emily Cho",
  "n71_answer": "Emily Cho",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-042",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "deploy_target",
  "question": "Where do we deploy?",
  "gold_answer": "Velturis Edge",
  "n71_answer": "Velturis Edge",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-042",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-042",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "sprint_deadline",
  "question": "List all sprint deadlines we've had, from earliest to latest.",
  "gold_answer": "2025 March 28, 2025 May 23, 2025 June 6",
  "n71_answer": "2025 March 28  \n2025 May 23  \n2025 June 6",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-042",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "approval_authority",
  "question": "Who has approval authority?",
  "gold_answer": "Director Hyunwoo Nam",
  "n71_answer": "Director Hyunwoo Nam",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-042",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "code_reviewer",
  "question": "Who reviews our code?",
  "gold_answer": "Uncertain — previously 'Emily Cho', but team_lead changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-042",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "deploy_target",
  "question": "Where do we deploy?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-042",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "branch_strategy,default_branch,release_cadence",
  "question": "What do you know about our release process? Tell me everything.",
  "gold_answer": "We use release-train with weekly cuts branching on develop, releasing biweekly Friday afternoon.",
  "n71_answer": "- The team uses a release-train model with weekly cuts branching.  \n- The release cadence is biweekly on Friday afternoon.  \n- The default branch is develop, used for all ongoing work.  \n- Feature branches are created off develop for new work.  \n- Regular merging from develop into feature branches is practiced to avoid conflicts.  \n- Code reviews are required before merging back into develop to maintain code quality.  \n- Changelogs are auto-generated from commit messages, with conventions encouraged for clarity (e.g., prefixes like \"feat:\" and \"fix:\").  \n- As release day approaches, tasks are prioritized focusing on critical features and blockers.  \n- Regular check-ins or stand-ups occur in the days leading up to release to keep alignment and identify issues early.  \n- A freeze period is considered where only essential fixes are made before release.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-042",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-043",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "connection_string",
  "question": "What's the database connection string?",
  "gold_answer": "thalnex://store-backup:8529/aurora",
  "n71_answer": "thalnex://store-backup:8529/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-043",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Zyndra.struct('Model', types)",
  "n71_answer": "Zyndra.struct('Model', types)",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-043",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "backup_schedule",
  "question": "What's our backup schedule?",
  "gold_answer": "twice daily at 2am and 2pm UTC",
  "n71_answer": "twice daily at 2am and 2pm UTC",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-043",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "phalanx-key-guard v1.3",
  "n71_answer": "phalanx-key-guard v1.3",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-043",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-043",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Tarvex Mod, Dranith Stack, Synthari Modules",
  "n71_answer": "Tarvex Mod -> Dranith Stack -> Synthari Modules",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-043",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "connection_string",
  "question": "What's the database connection string?",
  "gold_answer": "thalnex://store-prod:8529/aurora",
  "n71_answer": "thalnex://store-prod:8529/aurora",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-043",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "model_syntax",
  "question": "What's the model syntax?",
  "gold_answer": "Thalnex.document('Model', {fields})",
  "n71_answer": "Thalnex.document('Model', {fields})",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-043",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "backup_schedule",
  "question": "What's our backup schedule?",
  "gold_answer": "Uncertain — previously 'twice daily at 2am and 2pm UTC', but database changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-043",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "phalanx-key-guard v1.3",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-043",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "oncall_rotation,alert_channel,incident_runbook_url",
  "question": "What do you know about our on-call setup? Tell me everything.",
  "gold_answer": "On-call is biweekly rotation, alerts go to #aurora-alerts, and runbook is at wiki.aurora.dev/incident-response.",
  "n71_answer": "biweekly",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-043",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-044",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "test_framework",
  "question": "What testing framework do we use?",
  "gold_answer": "Quorix Spec",
  "n71_answer": "Quorix Spec",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-044",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "test_command",
  "question": "What's the test command?",
  "gold_answer": "quorix spec --run all",
  "n71_answer": "quorix spec --run all",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-044",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Zelthra Pack",
  "n71_answer": "C#",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-044",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "build_command",
  "question": "What's the build command?",
  "gold_answer": "zelthra pack --minify",
  "n71_answer": "zelthra pack --minify",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-044",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "escalation_contact",
  "question": "Who's the escalation contact?",
  "gold_answer": "Kevin Park",
  "n71_answer": "Kevin Park",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-044",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-044",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Pyronis Bundle, Quorix Hub, Synthari Modules",
  "n71_answer": "Pyronis Bundle, Quorix Hub, Synthari Modules",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-044",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "test_framework",
  "question": "What testing framework do we use?",
  "gold_answer": "Pyronis Check",
  "n71_answer": "Pyronis Check",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-044",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "test_command",
  "question": "What's the test command?",
  "gold_answer": "pyronis check --snapshot",
  "n71_answer": "pyronis check --snapshot",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-044",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Uncertain — previously 'Zelthra Pack', but framework changed",
  "n71_answer": "C#",
  "judge_pass": false,
  "pass_type": "never_knew",
  "failure_class": ""
 },
 {
  "episode": "sw-044",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "build_command",
  "question": "What's the build command?",
  "gold_answer": "Uncertain — previously 'zelthra pack --minify', but build_tool changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-044",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "escalation_contact",
  "question": "Who's the escalation contact?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-044",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "design_system,test_coverage_target,code_review_policy",
  "question": "What are our code quality standards? List everything you know.",
  "gold_answer": "We use Quenthar Design v1, target 80% minimum overall test coverage, and require 2 approvals required for reviews.",
  "n71_answer": "2 approvals required for code review  \nTest coverage target overall: 80% minimum",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-044",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-045",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "escalation_contact",
  "question": "Who's the escalation contact?",
  "gold_answer": "Kevin Park",
  "n71_answer": "Kevin Park",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-045",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "code_reviewer",
  "question": "Who reviews our code?",
  "gold_answer": "Seoyun Choi",
  "n71_answer": "Seoyun Choi",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-045",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "onboarding_guide",
  "question": "Where's the onboarding guide?",
  "gold_answer": "docs.aurora.internal/onboarding",
  "n71_answer": "docs.aurora.internal/onboarding",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-045",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23.\nStack trace shows recursive call in middleware_chain to request_validator to token_parser.\nRestart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "n71_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23. Stack trace shows recursive call in middleware_chain to request_validator to token_parser. Restart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-045",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Verisigil Lock, Kerberos Trove, Pyronos Crypt",
  "n71_answer": "Verisigil Lock, Kerberos Trove, Pyronos Crypt",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-045",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "escalation_contact",
  "question": "Who's the escalation contact?",
  "gold_answer": "Jiwon Han",
  "n71_answer": "Jiwon Han",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-045",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "code_reviewer",
  "question": "Who reviews our code?",
  "gold_answer": "Uncertain — previously 'Seoyun Choi', but team_lead changed",
  "n71_answer": "Seoyun Choi",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-045",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "onboarding_guide",
  "question": "Where's the onboarding guide?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "docs.aurora.internal/onboarding",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-045",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "design_system,test_coverage_target,code_review_policy",
  "question": "What are our code quality standards? List everything you know.",
  "gold_answer": "We use Synthari Elements v1.2, target 80% minimum overall test coverage, and require 2 approvals + security review for auth/ for reviews.",
  "n71_answer": "2 approvals + a security review for the auth/ directory",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-045",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23.\nStack trace shows recursive call in middleware_chain to request_validator to token_parser.\nRestart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "n71_answer": "PanicException: thread 'tokio-runtime-worker' panicked at 'index out of bounds: the len is 0 but the index is 5', src/handlers/auth.rs:142:23. Stack trace shows recursive call in middleware_chain to request_validator to token_parser. Restart triggered automatic failover at 2024-04-02T14:18:33Z.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-046",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Tarvex CLI",
  "n71_answer": "Tarvex CLI",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-046",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "test_command",
  "question": "What's the test command?",
  "gold_answer": "dranith probe --mode fast",
  "n71_answer": "dranith probe --mode fast",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-046",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "project_structure",
  "question": "What's our project structure?",
  "gold_answer": "workspace with /services and /common",
  "n71_answer": "/ services and / common",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-046",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "build_command",
  "question": "What's the build command?",
  "gold_answer": "tarvex compile --debug",
  "n71_answer": "tarvex compile --debug",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-046",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "backup_schedule",
  "question": "What's our backup schedule?",
  "gold_answer": "daily at 3am UTC",
  "n71_answer": "daily at 3am UTC",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-046",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-046",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Verisigil Lock, Pyronos Crypt, Orinthal Sealed",
  "n71_answer": "Verisigil Lock -> Pyronos Crypt -> Orinthal Sealed",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-046",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Karnex Bundler",
  "n71_answer": "Karnex Bundler",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-046",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "test_command",
  "question": "What's the test command?",
  "gold_answer": "zelthra assert --suite all",
  "n71_answer": "zelthra assert --suite all",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-046",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "project_structure",
  "question": "What's our project structure?",
  "gold_answer": "Uncertain — previously 'workspace with /services and /common', but framework changed",
  "n71_answer": "unknown",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-046",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "build_command",
  "question": "What's the build command?",
  "gold_answer": "Uncertain — previously 'tarvex compile --debug', but build_tool changed",
  "n71_answer": "zelthra assert --suite all",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-046",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "backup_schedule",
  "question": "What's our backup schedule?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-046",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "meeting_day,slack_channel,standup_format",
  "question": "How does our team coordinate? Tell me everything you know.",
  "gold_answer": "Team meets every Wednesday 10am, uses #vortex-team for chat, and standups are weekly sync + daily async.",
  "n71_answer": "- Standup format: weekly sync + daily async  \n- Meeting schedule: every Wednesday at 10am  \n- On-call rotation: follow-the-sun across timezones  \n- Team channel: #vortex-team",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-046",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-047",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "API Key + HMAC",
  "n71_answer": "API Key + HMAC",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-047",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "phalanx-hmac-validator v2.0",
  "n71_answer": "phalanx-hmac-validator v2.0",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-047",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "token_format",
  "question": "What token format do we use?",
  "gold_answer": "Zenthos-Session-v1",
  "n71_answer": "Zenthos-Session-v1",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-047",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "Dranith Compiler",
  "n71_answer": "Dranith Compiler",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-047",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "ConnectionRefusedError: [Errno 111] Connection refused.\nService crysthene-db-primary on port 5432 is not accepting connections.\nLast successful heartbeat was at 2024-03-15T08:23:41Z.\nCheck firewall rules and verify that max_connections has not been exceeded.",
  "n71_answer": "ConnectionRefusedError: [Errno 111] Connection refused. Service crysthene-db-primary on port 5432 is not accepting connections. Last successful heartbeat was at 2024-03-15T08:23:41Z. Check firewall rules and verify that max_connections has not been exceeded.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-047",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Kerberos Trove, Pyronos Crypt, Orinthal Sealed",
  "n71_answer": "Kerberos Trove -> Orinthal Sealed -> Pyronos Crypt",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-047",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "mTLS Client Cert",
  "n71_answer": "API Key + HMAC",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-047",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "oathkeeper-mtls-proxy v4.1",
  "n71_answer": "phalanx-hmac-validator v2.0",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-047",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "token_format",
  "question": "What token format do we use?",
  "gold_answer": "Uncertain — previously 'Zenthos-Session-v1', but auth_provider changed",
  "n71_answer": "Zenthos-Session-v1",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-047",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "build_tool",
  "question": "What's our build tool?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-047",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "design_system,test_coverage_target,code_review_policy",
  "question": "What are our code quality standards? List everything you know.",
  "gold_answer": "We use Synthari Elements v1.2, target 70% overall, 95% for auth test coverage, and require 1 approval + CI green for reviews.",
  "n71_answer": "- Code review policy: 1 approval + CI green  \n- Prehooks to prevent code upload if it does not pass compilation, tests, and validation  \n- Test coverage target overall: 70%  \n- Test coverage target for authentication: 95%  \n- Use of pytest testing framework  \n- Continuous integration on main branch in staging environment  \n- Use of trunk based development git workflow  \n- Branch by abstraction deployment strategy (feature/integration/abstraction flags)",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-047",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "ConnectionRefusedError: [Errno 111] Connection refused.\nService crysthene-db-primary on port 5432 is not accepting connections.\nLast successful heartbeat was at 2024-03-15T08:23:41Z.\nCheck firewall rules and verify that max_connections has not been exceeded.",
  "n71_answer": "ConnectionRefusedError: [Errno 111] Connection refused. Service crysthene-db-primary on port 5432 is not accepting connections. Last successful heartbeat was at 2024-03-15T08:23:41Z. Check firewall rules and verify that max_connections has not been exceeded.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-048",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "login_endpoint",
  "question": "What's the login endpoint?",
  "gold_answer": "/cerberix/saml/sso-init",
  "n71_answer": "/cerberix/saml/sso-init",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-048",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "oathkeeper-mtls-proxy v4.1",
  "n71_answer": "oathkeeper-mtls-proxy v4.1",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-048",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "SAML 2.0 SSO",
  "n71_answer": "SAML 2.0 SSO",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-048",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "stream.velturis.io/aurora",
  "n71_answer": "stream.velturis.io/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-048",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-048",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "package_manager",
  "question": "List all package managers we've used over time, from earliest to latest.",
  "gold_answer": "Quorix Hub, Dranith Stack, Veltrion Pkg",
  "n71_answer": "Quorix Hub -> Veltrion Pkg -> yo @microsoft/sharepoint",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-048",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "login_endpoint",
  "question": "What's the login endpoint?",
  "gold_answer": "/auth/kryptal/signin",
  "n71_answer": "/api/reset-password",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-048",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "verithos-oauth-middleware v2.1",
  "n71_answer": "oathkeeper-mtls-proxy v4.1",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-048",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "Uncertain — previously 'SAML 2.0 SSO', but auth_provider changed",
  "n71_answer": "SAML 2.0 SSO",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-048",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "stream.velturis.io/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-048",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "branch_strategy,default_branch,release_cadence",
  "question": "What do you know about our release process? Tell me everything.",
  "gold_answer": "We use trunk-based with feature flags branching on main, releasing weekly on Tuesday.",
  "n71_answer": "- Release cadence: weekly on Tuesday  \n- Changelog location: GitHub Releases page  \n- The team uses trunk-based development with feature flags for branching  \n- Releases are monitored and adjusted based on sprint feedback to improve the process  \n- Gathering insights from the team after each release is recommended to identify issues or bottlenecks  \n- Evaluating release stability and incorporating more testing before deployment is advised if frequent bugs or hotfixes occur  \n- Keeping open communication with the team about what works and what doesn’t is important for improvements",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-048",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders.\nQuery 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'.\nQuery 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'.\nFirst seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "n71_answer": "DeadlockDetected: transactions 4821 and 4823 are deadlocked on table aurora_orders. Query 1 was UPDATE orders SET status = 'shipped' WHERE region = 'us-east'. Query 2 was DELETE FROM orders WHERE created_at < NOW() - INTERVAL '30 days'. First seen at 2024-08-04T22:33:18Z; recommend serializable isolation review.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-049",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "SAML 2.0 SSO",
  "n71_answer": "SAML 2.0 SSO",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-049",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "cerberix-sso-bridge v1.9",
  "n71_answer": "cerberix-sso-bridge v1.9",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-049",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "user_session_ttl",
  "question": "What's the session TTL?",
  "gold_answer": "30 minutes",
  "n71_answer": "30 minutes",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-049",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "incident_runbook_url",
  "question": "Where's the incident runbook?",
  "gold_answer": "docs.aurora.internal/runbooks",
  "n71_answer": "docs.aurora.internal/runbooks",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-049",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-049",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Selvex Secrets, Verisigil Lock, Orinthal Sealed",
  "n71_answer": "Selvex Secrets -> Verisigil Lock -> Orinthal Sealed",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-049",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "auth_method",
  "question": "What authentication method do we use?",
  "gold_answer": "Session Cookie + CSRF",
  "n71_answer": "SAML 2.0 SSO",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-049",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "middleware_config",
  "question": "What's the auth middleware config?",
  "gold_answer": "zenthos-csrf-guard v1.1",
  "n71_answer": "cerberix-sso-bridge v1.9",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-049",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "user_session_ttl",
  "question": "What's the session TTL?",
  "gold_answer": "Uncertain — previously '30 minutes', but auth_provider changed",
  "n71_answer": "30 minutes",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-049",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "incident_runbook_url",
  "question": "Where's the incident runbook?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "docs.aurora.internal/runbooks",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-049",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "documentation_url,onboarding_guide,changelog_location",
  "question": "Where can I find project documentation? List everything you know.",
  "gold_answer": "Docs are at github.com/aurora-team/docs, onboarding guide at docs.aurora.internal/onboarding, and changelog at notion.so/aurora-team/releases.",
  "n71_answer": "github.com/aurora-team/docs  \ndocs.aurora.internal/onboarding",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-049",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal.\nSubject CN does not match expected hostname; chain depth 2 unverified.\nObserved at 2024-07-22T11:09:47Z by tls_handshake_listener.\nRotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "n71_answer": "CertificateVerificationError: certificate chain validation failed for crysthene-db-primary.internal. Subject CN does not match expected hostname; chain depth 2 unverified. Observed at 2024-07-22T11:09:47Z by tls_handshake_listener. Rotate the cert via Cryptara Vault and restart the connection pool to recover.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-050",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "staging_url",
  "question": "What's our staging URL?",
  "gold_answer": "dev.orinthal.dev/aurora",
  "n71_answer": "dev.orinthal.dev/aurora",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-050",
  "domain": "sw",
  "phase": "before",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "korinth release --canary",
  "n71_answer": "korinth release --canary",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-050",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "logs.narvex.io/aurora/drain",
  "n71_answer": "logs.narvex.io/aurora/drain",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-050",
  "domain": "sw",
  "phase": "before",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "infra/korinth/Dockerfile",
  "n71_answer": "infra/korinth/Dockerfile",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-050",
  "domain": "sw",
  "phase": "before",
  "task": "Del",
  "entity": "design_system",
  "question": "What design system do we use?",
  "gold_answer": "Moxira Components v3",
  "n71_answer": "Moxira Components v3",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-050",
  "domain": "sw",
  "phase": "before",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "OutOfMemoryError: Java heap space.\nGC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'.\nHeap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z.\nRecommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "n71_answer": "OutOfMemoryError: Java heap space. GC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'. Heap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z. Recommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "judge_pass": false,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-050",
  "domain": "sw",
  "phase": "after",
  "task": "Tr",
  "entity": "secret_manager",
  "question": "List all secret managers we've used over time, from earliest to latest.",
  "gold_answer": "Kerberos Trove, Pyronos Crypt, Velthri Keys",
  "n71_answer": "Kerberos Trove, Pyronos Crypt, Velthri Keys",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-050",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "staging_url",
  "question": "What's our staging URL?",
  "gold_answer": "sandbox.thandrel.io/aurora",
  "n71_answer": "dev.orinthal.dev/aurora",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-050",
  "domain": "sw",
  "phase": "after",
  "task": "Cas",
  "entity": "deploy_command",
  "question": "What's the deploy command?",
  "gold_answer": "narvex deploy --env prod",
  "n71_answer": "korinth release --canary",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-050",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "log_drain_endpoint",
  "question": "What's our log drain endpoint?",
  "gold_answer": "Uncertain — previously 'logs.narvex.io/aurora/drain', but deploy_target changed",
  "n71_answer": "logs.narvex.io/aurora/drain",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-050",
  "domain": "sw",
  "phase": "after",
  "task": "Abs",
  "entity": "dockerfile_path",
  "question": "Where's the Dockerfile?",
  "gold_answer": "Uncertain — previously 'infra/korinth/Dockerfile', but docker_image changed",
  "n71_answer": "infra/korinth/Dockerfile",
  "judge_pass": false,
  "pass_type": "knew_but_failed",
  "failure_class": ""
 },
 {
  "episode": "sw-050",
  "domain": "sw",
  "phase": "after",
  "task": "Del",
  "entity": "design_system",
  "question": "What design system do we use?",
  "gold_answer": "No — explicitly removed",
  "n71_answer": "none",
  "judge_pass": true,
  "pass_type": "real",
  "failure_class": ""
 },
 {
  "episode": "sw-050",
  "domain": "sw",
  "phase": "after",
  "task": "Agg",
  "entity": "documentation_url,onboarding_guide,changelog_location",
  "question": "Where can I find project documentation? List everything you know.",
  "gold_answer": "Docs are at aurora-docs.readthedocs.io, onboarding guide at docs.aurora.internal/onboarding, and changelog at docs.aurora.internal/changelog.",
  "n71_answer": "aurora-docs.readthedocs.io  \ndocs.aurora.internal/onboarding  \ndocs.aurora.internal/changelog",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 },
 {
  "episode": "sw-050",
  "domain": "sw",
  "phase": "after",
  "task": "ER",
  "entity": "error_log",
  "question": "Recite the exact error log I shared.",
  "gold_answer": "OutOfMemoryError: Java heap space.\nGC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'.\nHeap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z.\nRecommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "n71_answer": "OutOfMemoryError: Java heap space. GC overhead limit exceeded after 8 consecutive failed collections in pool 'PS Old Gen'. Heap dump written to /var/log/aurora/heap-2024-05-08.hprof at 18:42:11Z. Recommend increasing -Xmx beyond 4G or investigating memory leak in OrderProcessor.",
  "judge_pass": true,
  "pass_type": "",
  "failure_class": ""
 }
]