local multiturn sql lab

import%20marimo%0A%0A__generated_with%20%3D%20%220.23.8%22%0Aapp%20%3D%20marimo.App(width%3D%22wide%22)%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20import%20marimo%20as%20mo%0A%20%20%20%20import%20pandas%20as%20pd%0A%0A%20%20%20%20from%20notebooks.labs.local_multiturn_sql_lab_support%20import%20(%0A%20%20%20%20%20%20%20%20lab_turns%2C%0A%20%20%20%20%20%20%20%20run_multiturn_lab%2C%0A%20%20%20%20)%0A%0A%20%20%20%20return%20lab_turns%2C%20mo%2C%20pd%2C%20run_multiturn_lab%0A%0A%0A%40app.cell%0Adef%20_(mo)%3A%0A%20%20%20%20runtime_choice%20%3D%20mo.ui.dropdown(%0A%20%20%20%20%20%20%20%20options%3D%5B%22auto%22%2C%20%22cpu%22%2C%20%22cuda%22%2C%20%22mps%22%2C%20%22xpu%22%5D%2C%0A%20%20%20%20%20%20%20%20value%3D%22auto%22%2C%0A%20%20%20%20%20%20%20%20label%3D%22Runtime%22%2C%0A%20%20%20%20)%0A%20%20%20%20mo.vstack(%0A%20%20%20%20%20%20%20%20%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20mo.md(%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%23%20Local%20multi-turn%20SQL%20lab%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20This%20is%20the%20runnable%20companion%20to%20the%20blog%20post.%20It%20starts%20from%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20the%20same%20question%20as%20the%20project%3A%20single-turn%20SQL%20benchmark%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20performance%20is%20improving%2C%20but%20multi-turn%20data%20analysis%20still%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20fails%20when%20state%2C%20semantic%20concepts%2C%20metric%20intent%2C%20and%20recovery%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20matter.%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%23%23%201.%20Runtime%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20The%20lab%20uses%20%60auto%60%20by%20default%3A%20auto-select%20CUDA%2C%20MPS%2C%20or%20XPU%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20when%20PyTorch%20detects%20one%2C%20otherwise%20fall%20back%20to%20CPU.%20Force%20CPU%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20when%20you%20want%20the%20most%20portable%20run.%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20runtime_choice%2C%0A%20%20%20%20%20%20%20%20%5D%0A%20%20%20%20)%0A%20%20%20%20return%20(runtime_choice%2C)%0A%0A%0A%40app.cell%0Adef%20_(run_multiturn_lab%2C%20runtime_choice)%3A%0A%20%20%20%20report%20%3D%20run_multiturn_lab(device_preference%3Druntime_choice.value)%0A%20%20%20%20return%20(report%2C)%0A%0A%0A%40app.cell%0Adef%20_(mo%2C%20pd%2C%20report)%3A%0A%20%20%20%20device%20%3D%20report%5B%22device%22%5D%0A%20%20%20%20detected%20%3D%20report%5B%22detected_accelerator%22%5D%0A%20%20%20%20runtime_summary%20%3D%20pd.DataFrame(%0A%20%20%20%20%20%20%20%20%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20%7B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22requested_device%22%3A%20report%5B%22runtime_policy%22%5D%5B%22device_preference%22%5D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22selected_device%22%3A%20device.kind%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22selected_label%22%3A%20device.label%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22detected_accelerator%22%3A%20detected.label%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22fallback%22%3A%20report%5B%22runtime_policy%22%5D%5B%22fallback%22%5D%20or%20%22none%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22scenario_hash%22%3A%20report%5B%22scenario_contract%22%5D%5B%22shared_input_sha256%22%5D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7D%0A%20%20%20%20%20%20%20%20%5D%0A%20%20%20%20)%0A%20%20%20%20mo.vstack(%0A%20%20%20%20%20%20%20%20%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20mo.ui.table(runtime_summary%2C%20label%3D%22Runtime%20selected%20by%20the%20lab%22)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20mo.ui.table(%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20report%5B%22accelerator_report%22%5D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20label%3D%22CUDA%2FMPS%2FXPU%20visibility%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20)%2C%0A%20%20%20%20%20%20%20%20%5D%0A%20%20%20%20)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(lab_turns%2C%20mo%2C%20pd)%3A%0A%20%20%20%20turns%20%3D%20pd.DataFrame(%0A%20%20%20%20%20%20%20%20%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20%7B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22turn_id%22%3A%20turn.turn_id%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22question%22%3A%20turn.question%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22context_note%22%3A%20turn.context_note%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22requires_recovery%22%3A%20turn.requires_recovery%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22reference_sql%22%3A%20turn.reference_sql%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7D%0A%20%20%20%20%20%20%20%20%20%20%20%20for%20turn%20in%20lab_turns()%0A%20%20%20%20%20%20%20%20%5D%0A%20%20%20%20)%0A%20%20%20%20mo.vstack(%0A%20%20%20%20%20%20%20%20%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20mo.md(%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%23%23%202.%20Multi-turn%20task%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20The%20toy%20warehouse%20has%20customers%20and%20orders.%20The%20four%20turns%20keep%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20the%20metric%20alive%2C%20add%20a%20follow-up%20filter%2C%20change%20the%20grain%2C%20and%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20ask%20the%20system%20to%20recover%20after%20an%20empty%20result.%20This%20is%20small%20on%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20purpose%3A%20the%20behavior%20is%20visible%20without%20a%20model%20download.%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20mo.ui.table(turns%2C%20label%3D%22Conversation%20turns%20and%20reference%20SQL%22)%2C%0A%20%20%20%20%20%20%20%20%5D%0A%20%20%20%20)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(mo%2C%20pd%2C%20report)%3A%0A%20%20%20%20method_matrix%20%3D%20pd.DataFrame(report%5B%22method_matrix%22%5D)%0A%20%20%20%20mo.vstack(%0A%20%20%20%20%20%20%20%20%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20mo.md(%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%23%23%203.%20Candidate%20training%20targets%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20The%20lab%20compares%20direct%20SQL%20with%20four%20richer%20targets%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20planner-first%20SQL%2C%20semantic%20value%20grounding%2C%20a%20%60MEASURE()%60-%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20preserving%20DSL%2C%20and%20behavior%2Frecovery%20tuning.%20The%20goal%20is%20to%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20decide%20what%20a%20small%20specialized%20model%20should%20learn%20before%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20spending%20larger%20GPU%20time.%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20mo.ui.table(method_matrix%2C%20label%3D%22Training%20targets%20compared%20in%20the%20lab%22)%2C%0A%20%20%20%20%20%20%20%20%5D%0A%20%20%20%20)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(mo%2C%20pd%2C%20report)%3A%0A%20%20%20%20scores%20%3D%20pd.DataFrame(%0A%20%20%20%20%20%20%20%20%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20%7B%22system%22%3A%20system%2C%20**metrics%7D%0A%20%20%20%20%20%20%20%20%20%20%20%20for%20system%2C%20metrics%20in%20report%5B%22systems%22%5D.items()%0A%20%20%20%20%20%20%20%20%5D%0A%20%20%20%20).sort_values(%5B%22value_accuracy%22%2C%20%22measure_preservation_rate%22%5D%2C%20ascending%3DFalse)%0A%20%20%20%20mo.vstack(%0A%20%20%20%20%20%20%20%20%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20mo.md(%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%23%23%204.%20Lab%20scorecard%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%60value_accuracy%60%20says%20whether%20the%20returned%20rows%20match.%20The%20other%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20columns%20separate%20why%20a%20system%20got%20there%3A%20context%20carryover%2C%20value%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20grounding%2C%20metric%20preservation%2C%20and%20recovery.%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20mo.ui.table(scores%2C%20label%3D%22Value%20and%20subtask%20scores%22)%2C%0A%20%20%20%20%20%20%20%20%5D%0A%20%20%20%20)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(mo%2C%20pd%2C%20report)%3A%0A%20%20%20%20trace_columns%20%3D%20%5B%0A%20%20%20%20%20%20%20%20%22turn_id%22%2C%0A%20%20%20%20%20%20%20%20%22question%22%2C%0A%20%20%20%20%20%20%20%20%22system%22%2C%0A%20%20%20%20%20%20%20%20%22value_match%22%2C%0A%20%20%20%20%20%20%20%20%22context_carryover%22%2C%0A%20%20%20%20%20%20%20%20%22value_grounded%22%2C%0A%20%20%20%20%20%20%20%20%22measure_preserved%22%2C%0A%20%20%20%20%20%20%20%20%22recovery_success%22%2C%0A%20%20%20%20%20%20%20%20%22failure_type%22%2C%0A%20%20%20%20%20%20%20%20%22actual_rows%22%2C%0A%20%20%20%20%20%20%20%20%22expected_rows%22%2C%0A%20%20%20%20%5D%0A%20%20%20%20trace%20%3D%20pd.DataFrame(report%5B%22rows%22%5D)%0A%20%20%20%20failures%20%3D%20trace.loc%5B~trace%5B%22value_match%22%5D%2C%20trace_columns%5D%0A%20%20%20%20mo.vstack(%0A%20%20%20%20%20%20%20%20%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20mo.md(%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%23%23%205.%20Failure%20trace%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20Multi-turn%20SQL%20failures%20should%20not%20be%20collapsed%20into%20%22bad%20SQL.%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20A%20miss%20can%20come%20from%20forgotten%20context%2C%20an%20ungrounded%20value%2C%20lost%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20metric%20semantics%2C%20or%20a%20missing%20repair%20behavior.%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20mo.ui.table(trace%5Btrace_columns%5D%2C%20label%3D%22All%20turn-level%20outcomes%22)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20mo.ui.table(failures%2C%20label%3D%22Rows%20that%20expose%20trainable%20failures%22)%2C%0A%20%20%20%20%20%20%20%20%5D%0A%20%20%20%20)%0A%20%20%20%20return%20(trace%2C)%0A%0A%0A%40app.cell%0Adef%20_(mo%2C%20trace)%3A%0A%20%20%20%20plans%20%3D%20trace%5B%0A%20%20%20%20%20%20%20%20%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20%22turn_id%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%22question%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%22system%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%22intermediate_plan%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%22sql%22%2C%0A%20%20%20%20%20%20%20%20%5D%0A%20%20%20%20%5D%0A%20%20%20%20mo.vstack(%0A%20%20%20%20%20%20%20%20%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20mo.md(%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%23%23%206.%20Intermediate%20state%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20This%20is%20the%20part%20a%20pure%20SQL%20target%20hides.%20Planner-first%20training%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20makes%20filters%20and%20grain%20explicit.%20Semantic%20grounding%20maps%20display%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20values%20to%20storage%20values.%20A%20DSL%20keeps%20%60MEASURE()%60%20intent%20alive%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20until%20compilation.%20Recovery%20tuning%20uses%20feedback%20from%20the%20last%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20turn%20instead%20of%20blindly%20retrying%20the%20same%20query.%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20mo.ui.table(plans%2C%20label%3D%22Intermediate%20plans%20and%20SQL%22)%2C%0A%20%20%20%20%20%20%20%20%5D%0A%20%20%20%20)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(mo%2C%20pd%2C%20report)%3A%0A%20%20%20%20fixtures%20%3D%20pd.DataFrame(report%5B%22synthetic_fixture_table%22%5D)%0A%20%20%20%20mo.vstack(%0A%20%20%20%20%20%20%20%20%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20mo.md(%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%23%23%207.%20Synthetic%20fixture%20pack%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20The%20tiny%20lab%20shows%20the%20behavior.%20The%20synthetic%20fixture%20pack%20turns%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20those%20behaviors%20into%20versioned%20rows%20the%20repo%20can%20use%20before%20a%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20larger%20endpoint%20run%3A%20value%20normalization%2C%20entity%20resolution%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20grain%2Ffanout%2C%20%60MEASURE()%60%20preservation%2C%20and%20recovery.%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20mo.ui.table(fixtures%2C%20label%3D%22Synthetic%20fixtures%20promoted%20from%20the%20lab%22)%2C%0A%20%20%20%20%20%20%20%20%5D%0A%20%20%20%20)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(mo%2C%20pd)%3A%0A%20%20%20%20next_gates%20%3D%20pd.DataFrame(%0A%20%20%20%20%20%20%20%20%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20%7B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22target%22%3A%20%22planner-first%20SQL%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22next_gate%22%3A%20(%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22Predict%20non-oracle%20plans%2C%20then%20compare%20generated%20SQL%20against%20%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22same-model%20direct%20SQL%20on%20identical%20rows.%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22target%22%3A%20%22semantic%20layer%20concepts%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22next_gate%22%3A%20(%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22Build%20value%2Fentity%20indexes%20and%20score%20value%20normalization%2C%20%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22grain%2C%20joins%2C%20and%20governed%20dimensions%20separately.%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22target%22%3A%20%22DSL%20first%2C%20SQL%20after%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22next_gate%22%3A%20(%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22Train%20DSL-preserving%20outputs%2C%20compile%20them%20to%20SQL%2C%20then%20compare%20%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22compiled%20execution%20against%20direct%20SQL.%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22target%22%3A%20%22MEASURE()%20preservation%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22next_gate%22%3A%20(%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22Track%20whether%20governed%20metrics%20survive%20generation%20before%20SQL%20%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22expansion%2C%20especially%20on%20metric-heavy%20tasks.%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7B%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22target%22%3A%20%22behavior%2Frecovery%22%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22next_gate%22%3A%20(%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22Evaluate%20generated-history%20rollouts%20so%20the%20model%20has%20to%20recover%20%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22from%20its%20own%20previous%20misses.%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20%7D%2C%0A%20%20%20%20%20%20%20%20%5D%0A%20%20%20%20)%0A%20%20%20%20mo.vstack(%0A%20%20%20%20%20%20%20%20%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20mo.md(%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%23%23%208.%20What%20this%20proves%0A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20This%20lab%20is%20not%20a%20benchmark%20result%20and%20it%20is%20not%20a%20hosted-SOTA%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20comparison.%20It%20is%20a%20small%20executable%20argument%20for%20what%20the%20repo%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20should%20measure%20next%3A%20whether%20a%20small%20local%20model%20can%20learn%20the%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20behavior%20and%20semantic%20concepts%20that%20multi-turn%20data%20analysis%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20requires.%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%22%22%22%0A%20%20%20%20%20%20%20%20%20%20%20%20)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20mo.ui.table(next_gates%2C%20label%3D%22Next%20evidence%20gates%22)%2C%0A%20%20%20%20%20%20%20%20%5D%0A%20%20%20%20)%0A%20%20%20%20return%0A%0A%0Aif%20__name__%20%3D%3D%20%22__main__%22%3A%0A%20%20%20%20app.run()%0A