# yanting/report-notebooklm-api/scripts/import_seed_content.py

from __future__ import annotations

import asyncio
import csv
import datetime as dt
import hashlib
import re
import json
import sys
from typing import Any
from pathlib import Path

sys.path.insert(0, str(Path(__file__).resolve().parents[1]))

from sqlalchemy import delete, select
from sqlalchemy.ext.asyncio import AsyncSession

from app.db import Base, SessionLocal, engine
from app.models import (
    AudioAsset,
    DisplayArtifact,
    DisplayModule,
    Favorite,
    Institution,
    OutboundEvent,
    PlaybackProgress,
    RawArtifact,
    ReadingHistory,
    RelatedNews,
    Report,
    SavedListen,
    User,
)


def j(value: Any) -> str:
    """Serialize *value* as compact JSON text.

    Non-ASCII characters are written as-is (no ``\\uXXXX`` escapes) and no
    whitespace is emitted after ``,`` or ``:``.
    """
    compact_separators = (",", ":")
    encoded = json.dumps(value, separators=compact_separators, ensure_ascii=False)
    return encoded


def d(value: str) -> dt.datetime:
    """Parse an ISO-8601 timestamp into a naive ``datetime`` expressed in UTC.

    A ``Z`` suffix is accepted as a spelling of ``+00:00``. Timestamps that
    carry a UTC offset are converted to UTC before the offset is dropped, so
    ``...T08:00:00+08:00`` and ``...T00:00:00Z`` produce the same value.
    Timestamps without an offset are returned unchanged.

    Raises:
        ValueError: if *value* is not a valid ISO-8601 timestamp.
    """
    parsed = dt.datetime.fromisoformat(value.replace("Z", "+00:00"))
    if parsed.tzinfo is not None:
        # Normalise first; stripping tzinfo directly would keep the local wall time.
        parsed = parsed.astimezone(dt.timezone.utc)
    return parsed.replace(tzinfo=None)


def etag(value: Any) -> str:
    """Return a short content tag for *value*.

    The value is serialized with ``j``, UTF-8 encoded and hashed with SHA-256;
    the first 16 hexadecimal characters of the digest are returned.
    """
    canonical_bytes = j(value).encode("utf-8")
    hex_digest = hashlib.sha256(canonical_bytes).hexdigest()
    return hex_digest[:16]


# Report id whose display modules are built from the sample artifact files on disk.
REAL_SAMPLE_REPORT_ID = "rep_bis_notebooklm_sample"
# Directory of the sample run, resolved under the current user's home directory.
REAL_SAMPLE_ROOT = Path.home().joinpath(
    "Projects/team-project/mall-docs/products/type3-orbit/report-notebooklm/docs.jimme.local/report-notebooklm/notebooklm-capability-bis-2026-06-02"
)
# Sub-directory that holds the individual artifact files read by this script.
REAL_SAMPLE_ARTIFACTS = REAL_SAMPLE_ROOT.joinpath("artifacts")


def read_real_sample(name: str) -> str:
    """Return the text of sample artifact *name*, or ``""`` if it does not exist.

    The file is looked up directly under ``REAL_SAMPLE_ARTIFACTS`` and decoded
    as ``utf-8-sig``, so a leading byte-order mark is not part of the result.
    """
    artifact_path = REAL_SAMPLE_ARTIFACTS / name
    if artifact_path.exists():
        return artifact_path.read_text(encoding="utf-8-sig")
    return ""


def clean_markdown_text(value: str) -> str:
    """Flatten a markdown fragment into a single line of plain text.

    In order: bracketed numeric citation markers such as ``[3]``, ``[1-4]`` or
    ``[1, 2]`` are removed, ``**bold**`` markers are dropped while keeping the
    inner text, backticks are removed, and every whitespace run is collapsed
    to one space with the ends trimmed.
    """
    without_citations = re.sub(r"\[\d+(?:[-, ]+\d+)*\]", "", value)
    without_bold = re.sub(r"\*\*(.*?)\*\*", r"\1", without_citations)
    without_backticks = without_bold.replace("`", "")
    single_spaced = re.sub(r"\s+", " ", without_backticks)
    return single_spaced.strip()


def markdown_sections(markdown: str, *, min_level: int = 2, limit: int = 8) -> list[dict[str, str]]:
    """Split markdown into ``{"heading", "body"}`` sections keyed by ``#`` headings.

    A heading is a stripped line made of ``min_level`` to 4 ``#`` characters,
    whitespace, and a title. Lines following a heading form its body, except
    blank lines and lines starting with ``---`` or ``|``. Heading and body are
    passed through ``clean_markdown_text``; sections whose cleaned body is
    empty are dropped. Parsing stops at a line equal to ``## Citations``.

    NOTE(review): heading lines with fewer than ``min_level`` hashes are not
    matched and, once a section is open, are kept as body text - confirm
    this is intended.

    Returns:
        At most *limit* sections, in document order.
    """
    heading_pattern = re.compile(r"^(#{%d,4})\s+(.+)$" % min_level)
    collected: list[dict[str, str]] = []
    heading = ""
    pending: list[str] = []

    def flush() -> None:
        # Keep a section only when it has a heading and a non-empty cleaned body.
        if not (heading and pending):
            return
        text = clean_markdown_text("\n".join(pending))
        if text:
            collected.append({"heading": heading, "body": text})

    for stripped in map(str.strip, markdown.splitlines()):
        if stripped == "## Citations":
            break
        found = heading_pattern.match(stripped)
        if found is not None:
            flush()
            heading = clean_markdown_text(found.group(2))
            pending = []
        elif heading and stripped and not stripped.startswith(("---", "|")):
            pending.append(stripped)

    flush()
    return collected[:limit]


def numbered_sections(markdown: str, *, limit: int = 8) -> list[dict[str, str]]:
    """Split markdown into ``{"heading", "body"}`` sections keyed by numbered items.

    A section starts at a stripped line of the form ``N. Title``, optionally
    prefixed by ``###`` and optionally wrapped in ``*`` characters. Following
    lines form the body, except blank lines and lines starting with ``#`` or
    ``---``. Heading and body are passed through ``clean_markdown_text``;
    sections whose cleaned body is empty are dropped. Parsing stops at a line
    equal to ``## Citations``.

    Returns:
        At most *limit* sections, in document order.
    """
    item_pattern = re.compile(r"^(?:###\s*)?\d+\.\s+\**(.+?)\**$")
    collected: list[dict[str, str]] = []
    heading = ""
    pending: list[str] = []

    def flush() -> None:
        # Keep a section only when it has a heading and a non-empty cleaned body.
        if not (heading and pending):
            return
        text = clean_markdown_text("\n".join(pending))
        if text:
            collected.append({"heading": heading, "body": text})

    for stripped in map(str.strip, markdown.splitlines()):
        if stripped == "## Citations":
            break
        found = item_pattern.match(stripped)
        if found is not None:
            flush()
            heading = clean_markdown_text(found.group(1))
            pending = []
        elif heading and stripped and not stripped.startswith(("#", "---")):
            pending.append(stripped)

    flush()
    return collected[:limit]


def split_heading_body(section: dict[str, str]) -> tuple[str, str]:
    """Split a section body at its first evidence/impact marker.

    The markers are ``研报观点与证据：``, ``证据：`` and ``影响：``. When one is
    found, the cleaned text before it and the cleaned text after it are
    returned. Without a marker the result is ``("", body)`` with the body
    left untouched.
    """
    body = section["body"]
    marker = re.search(r"研报观点与证据：|证据：|影响：", body)
    if marker is None:
        return "", body
    before = body[: marker.start()]
    after = body[marker.end() :]
    return clean_markdown_text(before), clean_markdown_text(after)


def key_data_rows() -> list[dict[str, str]]:
    """Return the key-data table rows for the real sample report.

    Rows are read from the ``data-table.csv`` sample artifact and mapped from
    its Chinese column headers to ``metric`` / ``value`` / ``importance`` /
    ``judgment``; ``unit`` is always empty. A missing column, or a cell
    missing from a short row, becomes ``""`` so every value is a string.
    When the artifact is absent or contains no data rows, two built-in
    fallback rows are returned instead.
    """
    import io  # local import: only this function needs it

    csv_text = read_real_sample("data-table.csv")
    rows: list[dict[str, str]] = []
    if csv_text:
        # Feed the raw text to the csv module (newline="") so line breaks
        # inside quoted cells are preserved; splitting into lines first would
        # silently glue such cells together.
        reader = csv.DictReader(io.StringIO(csv_text, newline=""))
        rows = [
            {
                # DictReader fills cells missing from short rows with None.
                "metric": row.get("数据点/指标名称") or "",
                "value": row.get("定量数值或趋势") or "",
                "unit": "",
                "importance": row.get("风险/修订指示") or "",
                "judgment": row.get("相关行业或资产类别") or "",
            }
            for row in reader
        ]
    if rows:
        return rows
    return [
        {"metric": "M7 市值占比", "value": "近 35%", "unit": "", "importance": "提示指数集中度风险", "judgment": "美国大型科技股"},
        {"metric": "SRT 覆盖贷款", "value": "约 8000 亿欧元", "unit": "", "importance": "提示隐藏信贷风险规模", "judgment": "银行业 / 非银机构"},
    ]


def sample_artifact_types() -> list[str]:
    """Return a new list of the raw artifact type names used for the real sample report."""
    artifact_names = (
        "describe-source",
        "native_briefing_doc",
        "native_blog_post",
        "native_study_guide",
        "data_table",
        "query_dimensions",
        "query_key_data",
        "query_divergence",
        "query_weaknesses",
        "query_timeline",
        "query_related_sources",
        "audio_brief",
    )
    return list(artifact_names)


# Display title for each module type; the insertion order doubles as the display order.
MODULE_TITLES: dict[str, str] = dict(
    basic_info="报告概览",
    executive_overview="报告摘要",
    audio="听研报",
    core_insights="报告要点",
    key_data="报告中的关键数据",
    differentiated_view="观点差异",
    weaknesses="局限与疑问",
    timeline="时间线",
    study_guide="术语与问答",
    structure_graph="结构梳理",
    related_sources="延伸阅读",
    source_compliance="报告来源",
)

# Module type -> zero-based position within MODULE_TITLES.
MODULE_DISPLAY_ORDER: dict[str, int] = dict(zip(MODULE_TITLES, range(len(MODULE_TITLES))))


def real_sample_module_envelope(module_type: str, report_id: str, title: str, institution_name: str) -> dict[str, Any]:
    """Build the payload of one display module for the real sample report.

    Every call reads the nine markdown sample artifacts plus the data table,
    parses them into sections, assembles the payloads of all module types and
    returns only the one requested.

    Args:
        module_type: key of the module to return (e.g. ``"key_data"``).
        report_id: written into the ``basic_info`` payload.
        title: written into the ``basic_info`` payload as ``title_cn``.
        institution_name: accepted but not used by this function.

    Returns:
        A dict holding ``content`` and/or ``preview`` / ``full`` entries,
        depending on the module type.

    Raises:
        KeyError: if *module_type* is not one of the keys built below.
    """
    # Raw artifact text; read_real_sample returns "" for a missing file.
    briefing = read_real_sample("briefing-doc.md")
    blog = read_real_sample("blog-post.md")
    study = read_real_sample("study-guide.md")
    dimensions = read_real_sample("query-dimensions.md")
    key_data = read_real_sample("query-key-data.md")
    divergence = read_real_sample("query-divergence.md")
    weaknesses = read_real_sample("query-weaknesses.md")
    timeline = read_real_sample("query-timeline.md")
    related_sources = read_real_sample("query-related-sources.md")

    # Parsed sections; each list is empty when its artifact is missing or has no matching headings.
    briefing_sections = markdown_sections(briefing, min_level=2, limit=7)
    blog_sections = markdown_sections(blog, min_level=3, limit=7)
    dimension_sections = numbered_sections(dimensions, limit=5)
    key_data_sections = numbered_sections(key_data, limit=12)
    divergence_sections = numbered_sections(divergence, limit=5)
    weakness_sections = numbered_sections(weaknesses, limit=5)
    timeline_sections = numbered_sections(timeline, limit=10)
    related_sections = numbered_sections(related_sources, limit=8)
    study_faq = numbered_sections(study, limit=5)

    key_rows = key_data_rows()
    # Hand-written key points, shared by the core_insights content and full payloads.
    core_points = [
        {"kind": "view", "text": "市场表面平静，但底层已经从美国大型科技股向欧洲、日本、新兴市场、价值股和小盘股重新轮动。"},
        {"kind": "number", "text": "M7 在标普 500 指数中的市值占比接近 35%，单一板块波动正在显著影响指数风险。"},
        {"kind": "risk", "text": "AI 基础设施融资从现金流叙事转向债务和表外融资，私人信贷、保险公司和银行授信之间的关联增强。"},
        {"kind": "risk", "text": "白银在 2026 年 1 月先涨超 50%、后单日跌近 30%，暴露了杠杆 ETF 再平衡和保证金触发平仓的放大效应。"},
    ]

    # NOTE(review): the preview_headline strings below embed fixed counts
    # (8 / 5 / 5 / 10) while the neighbouring *_count fields are computed from
    # the parsed artifacts, so the two can disagree.
    base = {
        "basic_info": {
            "content": {
                "report_id": report_id,
                "title_cn": title,
                "summary_cn": "BIS 2026 年 3 月季度评论，回顾 2025 年 11 月 29 日至 2026 年 3 月 5 日的全球金融市场变化，覆盖市场轮动、AI 融资、私募信贷、贵金属和新兴市场政策反应。",
                "topics": ["宏观金融", "金融稳定", "AI 融资", "非银风险"],
                "interpretation_label": "研报解读",
            }
        },
        "executive_overview": {
            "preview": {
                "preview_summary": "报告认为，本轮变化不是单一资产回调，而是高估值科技股、AI 基础设施融资、贵金属杠杆交易和非银信用链条共同推动的市场重新校准。它的核心价值在于把看似分散的市场波动，放回金融稳定和跨市场传导的框架里理解。",
                "section_count": len(briefing_sections) + len(blog_sections),
                "key_quote_snippet": "全球金融市场在表面的平静下经历了深刻的流向切换与重新校准。",
                "highlights": ["资金从美国大型科技股转向欧洲、日本和新兴市场", "AI 基础设施融资开始暴露信用风险", "贵金属波动显示杠杆交易的放大效应"],
            },
            "full": {
                "intro_cn": "这份报告把 2025 年底到 2026 年初的市场变化概括为一次跨资产重新校准：美国大型科技股降温，资金转向欧洲、日本和新兴市场；AI 融资从高成长叙事进入债务和表外风险阶段；贵金属和非银金融机构的波动说明杠杆与流动性仍是金融稳定的关键变量。",
                # Only the first four blog sections are included, while section_count above counts all of them.
                "sections": briefing_sections + blog_sections[:4],
                "source_artifacts": ["native_briefing_doc", "native_blog_post"],
            },
        },
        "core_insights": {
            "content": {"points": core_points},
            "full": {
                "points": core_points,
                "dimensions": [{"dimension": item["heading"], "summary": item["body"]} for item in dimension_sections],
            },
        },
        "key_data": {
            "preview": {
                "preview_headline": "8 个真实关键数据点",
                "highlights": [f"{row['metric']}：{row['value']}" for row in key_rows[:3]],
                "row_count": len(key_rows),
            },
            "full": {
                "rows": key_rows,
                "source_artifacts": ["data_table", "query_key_data"],
                "supporting_notes": [{"heading": item["heading"], "body": item["body"]} for item in key_data_sections[:6]],
            },
        },
        "source_compliance": {
            "content": {
                "source_url": "https://www.bis.org/publ/qtrpdf/r_qt2603.htm",
                "source_note": "原文为 BIS Quarterly Review, March 2026 的公开研报；本页仅提供中文解读，不提供解读内容下载。",
                "copyright_cn": "原文版权归发布机构所有；本页为基于公开研报整理的中文阅读辅助。",
                "disclaimer": "本内容仅供研报阅读参考，不构成投资建议。",
                "ai_generated_label": "AI 辅助生成",
            }
        },
        "differentiated_view": {
            "preview": {
                "preview_headline": "5 处与常见叙事的分歧",
                "highlights": [item["heading"] for item in divergence_sections[:3]],
                "divergence_count": len(divergence_sections),
            },
            "full": {
                "divergences": [
                    {
                        "topic": item["heading"],
                        # split_heading_body returns (text before marker, text after marker);
                        # the first part is "" when the body has no marker.
                        "consensus_view": split_heading_body(item)[0] or "常规叙事没有充分覆盖该维度。",
                        "report_position": split_heading_body(item)[1],
                    }
                    for item in divergence_sections
                ]
            },
        },
        "weaknesses": {
            "preview": {
                "preview_headline": "5 个论证弱点与反方向证据",
                "highlights": [item["heading"] for item in weakness_sections[:3]],
                "item_count": len(weakness_sections),
                "disclaimer_brief": "只做论证质量分析，不做投资建议。",
            },
            "full": {
                "disclaimer_cn": "以下仅分析研报论证质量，不构成投资建议。",
                "verification_notes": ["以上问题需要结合后续市场数据、原文脚注和反方向证据继续验证。"],
                "items": [
                    {
                        "topic": item["heading"],
                        "weakness": item["body"],
                        "counter_evidence": "需要结合后续数据、原文脚注与反方向证据继续验证。",
                    }
                    for item in weakness_sections
                ],
            },
        },
        "timeline": {
            "preview": {
                "preview_headline": "10 个关键事件节点",
                "date_range": "1990s-2026",
                "highlights": [item["heading"] for item in timeline_sections[:3]],
                "event_count": len(timeline_sections),
            },
            "full": {
                "events": [
                    {
                        # The section heading is used for both the date and the event text.
                        "date": item["heading"],
                        "period_type": "report_timeline",
                        "event": item["heading"],
                        "impact": item["body"],
                    }
                    for item in timeline_sections
                ]
            },
        },
        "study_guide": {
            "preview": {
                "preview_headline": "术语与问答",
                "faq_count": len(study_faq),
                # Matches the eight hand-written glossary entries below.
                "glossary_count": 8,
                "sample_question": study_faq[0]["heading"] if study_faq else "为什么要读这份 BIS 季报？",
                "highlights": ["核心概念摘要", "简答练习题", "重要术语表"],
            },
            "full": {
                "intro_cn": "这一部分整理了阅读本篇研报时容易遇到的概念、问题和术语。",
                "faq_items": [{"question": item["heading"], "answer": item["body"]} for item in study_faq],
                "glossary": [
                    {"term": "M7", "definition": "主导美国股市的七大科技巨头。"},
                    {"term": "SRT", "definition": "合成风险转移，银行通过衍生品或担保转移部分信用风险。"},
                    {"term": "BISTRO", "definition": "BIS Time-series Regression Oracle，宏观时间序列预测工具。"},
                    {"term": "NBFI", "definition": "非银行金融机构。"},
                    {"term": "Shadow Borrowing", "definition": "经济实质类似债务、但主要存在于资产负债表外的融资安排。"},
                    {"term": "BDCs", "definition": "业务发展公司，是私募信贷市场的公开交易窗口之一。"},
                    {"term": "Carry Trade", "definition": "借入低息货币、投资高息资产的套利交易。"},
                    {"term": "Margin-triggered Liquidations", "definition": "保证金要求上升触发的被迫平仓。"},
                ],
            },
        },
        "structure_graph": {
            "preview": {
                "preview_headline": "结构梳理",
                "root": "BIS 季报：分析框架",
                "top_nodes": [item["heading"] for item in dimension_sections[:5]],
                "fallback_derived": True,
            },
            "full": {
                "root": "BIS 季报：分析框架",
                "nodes": [
                    {
                        "label": item["heading"],
                        # Children are up to the first three sentence fragments of the section body.
                        "children": [phrase.strip(" 。") for phrase in re.split(r"[。；;]", item["body"])[:3] if phrase.strip()],
                    }
                    for item in dimension_sections
                ],
                "fallback_derived": True,
                "source_artifacts": ["query_dimensions"],
            },
        },
        "related_sources": {
            "content": {
                "items": [
                    {"title": item["heading"], "source_name": "延伸资料", "summary_cn": item["body"]}
                    for item in related_sections[:3]
                ],
                "review_note": "延伸来源仅作为候选队列，正式展示前需要人工审核。",
            },
            "full": {
                "items": [
                    {"title": item["heading"], "source_name": "延伸资料", "summary_cn": item["body"]}
                    for item in related_sections
                ],
                "review_note": "延伸来源仅作为候选队列，正式展示前需要人工审核。",
            },
        },
        "audio": {
            "content": {
                "audio_id": "aud_bis_notebooklm_sample",
                "title_cn": "BIS 季度评论",
                "duration_sec": 75,
                "chapters": [],
            }
        },
    }
    return base[module_type]


# Seed institutions. Each tuple is unpacked by import_seed as:
# (institution_id, name_cn, name_en, institution_type, source_tier, website_url, covered_topics)
INSTITUTIONS = [
    ("inst_wgc", "世界黄金协会", "World Gold Council", "industry_org", "tier_1", "https://www.gold.org/", ["贵金属", "央行"]),
    ("inst_imf", "国际货币基金组织", "International Monetary Fund", "international_org", "tier_1", "https://www.imf.org/", ["宏观金融", "外汇"]),
    ("inst_world_bank", "世界银行", "World Bank", "international_org", "tier_1", "https://www.worldbank.org/", ["大宗商品", "发展经济"]),
    ("inst_iea", "国际能源署", "International Energy Agency", "international_org", "tier_1", "https://www.iea.org/", ["能源", "原油"]),
    ("inst_eia", "美国能源信息署", "U.S. Energy Information Administration", "official", "tier_1", "https://www.eia.gov/", ["能源", "原油"]),
    ("inst_usgs", "美国地质调查局", "U.S. Geological Survey", "official", "tier_1", "https://www.usgs.gov/", ["矿产", "贵金属"]),
    ("inst_ecb", "欧洲央行", "European Central Bank", "official", "tier_1", "https://www.ecb.europa.eu/", ["货币政策", "欧元区"]),
    ("inst_bis", "国际清算银行", "Bank for International Settlements", "international_org", "tier_1", "https://www.bis.org/", ["宏观金融", "金融稳定"]),
    ("inst_fed", "美联储", "Federal Reserve", "official", "tier_1", "https://www.federalreserve.gov/", ["货币政策", "美元"]),
    ("inst_opec", "欧佩克", "OPEC", "international_org", "tier_1", "https://www.opec.org/", ["能源", "原油"]),
    ("inst_ssga", "道富环球投资管理", "State Street Global Advisors", "asset_manager", "tier_2", "https://www.ssga.com/", ["贵金属", "跨资产"]),
    ("inst_wisdomtree", "WisdomTree", "WisdomTree", "asset_manager", "tier_2", "https://www.wisdomtree.com/", ["大宗商品", "资产配置"]),
    ("inst_ing", "ING 银行研究", "ING Think", "bank_research", "tier_2", "https://think.ing.com/", ["贵金属", "外汇"]),
    ("inst_silver_institute", "白银协会", "The Silver Institute", "industry_org", "tier_2", "https://silverinstitute.org/", ["白银", "矿产"]),
    ("inst_goldman", "高盛研究", "Goldman Sachs Research", "bank_research", "tier_3", "https://www.goldmansachs.com/", ["大宗商品", "宏观"]),
    ("inst_jpm", "摩根大通研究", "J.P. Morgan Research", "bank_research", "tier_3", "https://www.jpmorgan.com/", ["大宗商品", "宏观"]),
    ("inst_invesco", "景顺", "Invesco", "asset_manager", "tier_3", "https://www.invesco.com/", ["ETF", "资产配置"]),
    ("inst_pas", "泛美白银", "Pan American Silver", "partner", "tier_3", "https://www.panamericansilver.com/", ["白银", "矿业"]),
]


# Seed reports. Each tuple is unpacked by import_seed as:
# (report_id, title_cn, institution_id, source_tier, has_audio, topics, released_at ISO timestamp)
#
# BASE_REPORTS: reports that have their own module list in rich_module_types.
BASE_REPORTS = [
    (REAL_SAMPLE_REPORT_ID, "BIS 季度评论：全球金融市场重新校准", "inst_bis", "official_public", True, ["宏观金融", "金融稳定", "AI 融资", "非银风险"], "2026-06-02T00:00:00Z"),
    ("rep_ssga_gold", "黄金月报：金价新高之后，谁在继续买？", "inst_ssga", "authorized_partner", True, ["贵金属", "跨资产"], "2026-05-22T00:00:00Z"),
    ("rep_wb_pinksheet", "世界银行大宗商品价格表：金属分化继续", "inst_world_bank", "official_public", True, ["大宗商品", "金属"], "2026-05-20T00:00:00Z"),
    ("rep_iea_omr", "IEA 原油市场月报：库存与需求再平衡", "inst_iea", "official_public", True, ["能源", "原油"], "2026-05-18T00:00:00Z"),
    ("rep_ing_gold", "ING 黄金观点：实际利率回摆的压力测试", "inst_ing", "authorized_partner", False, ["贵金属", "外汇"], "2026-05-16T00:00:00Z"),
    ("rep_wisdomtree_outlook", "WisdomTree 商品展望：配置窗口与回撤风险", "inst_wisdomtree", "authorized_partner", False, ["大宗商品", "资产配置"], "2026-05-14T00:00:00Z"),
    ("rep_usgs_minerals", "USGS 矿产摘要：关键金属供给约束", "inst_usgs", "official_public", True, ["矿产", "贵金属"], "2026-05-12T00:00:00Z"),
    ("rep_pas_silver", "白银矿业更新：供给扰动与成本曲线", "inst_pas", "broker_public_gray", False, ["白银", "矿业"], "2026-05-10T00:00:00Z"),
    ("rep_eia_steo", "EIA 短期能源展望：油气价格情景", "inst_eia", "official_public", True, ["能源", "原油"], "2026-05-08T00:00:00Z"),
]

# LIGHT_REPORTS: reports that fall back to the default module list of rich_module_types.
LIGHT_REPORTS = [
    ("rep_imf_weo", "IMF 世界经济展望：增长分化与政策空间", "inst_imf", "official_public", True, ["宏观金融"], "2026-05-06T00:00:00Z"),
    ("rep_bis_quarterly", "BIS 季报：市场重新校准", "inst_bis", "official_public", True, ["宏观金融", "金融稳定"], "2026-05-04T00:00:00Z"),
    ("rep_fed_fsr", "美联储金融稳定报告：杠杆与流动性", "inst_fed", "official_public", True, ["金融稳定"], "2026-05-02T00:00:00Z"),
    ("rep_ecb_bulletin", "欧洲央行经济公报：通胀路径更新", "inst_ecb", "official_public", True, ["货币政策"], "2026-04-30T00:00:00Z"),
    ("rep_opec_momr", "OPEC 月报：供需缺口与配额纪律", "inst_opec", "official_public", True, ["能源", "原油"], "2026-04-28T00:00:00Z"),
    ("rep_wgc_trends", "世界黄金协会：黄金需求趋势", "inst_wgc", "official_public", True, ["贵金属", "央行"], "2026-04-26T00:00:00Z"),
    ("rep_silver_survey", "白银协会：白银供需调查", "inst_silver_institute", "official_public", True, ["白银"], "2026-04-24T00:00:00Z"),
    ("rep_gs_commodity", "高盛商品观点：再通胀交易复盘", "inst_goldman", "broker_public_gray", False, ["大宗商品"], "2026-04-22T00:00:00Z"),
    ("rep_jpm_flows", "摩根大通资金流：商品 ETF 与风险偏好", "inst_jpm", "authorized_partner", False, ["跨资产"], "2026-04-20T00:00:00Z"),
    ("rep_invesco_etf", "景顺 ETF 观察：黄金与能源配置", "inst_invesco", "authorized_partner", False, ["ETF", "贵金属"], "2026-04-18T00:00:00Z"),
    ("rep_world_bank_macro", "世界银行宏观更新：贸易与大宗商品", "inst_world_bank", "official_public", True, ["宏观金融", "大宗商品"], "2026-04-16T00:00:00Z"),
    ("rep_iea_gas", "IEA 天然气市场报告：需求弹性", "inst_iea", "official_public", True, ["能源"], "2026-04-14T00:00:00Z"),
    ("rep_eia_inventory", "EIA 库存周报解读：裂解价差与需求", "inst_eia", "official_public", False, ["能源"], "2026-04-12T00:00:00Z"),
    ("rep_usgs_copper", "USGS 铜矿供给：项目延迟与品位下降", "inst_usgs", "official_public", False, ["矿产"], "2026-04-10T00:00:00Z"),
    ("rep_ing_fx", "ING 外汇周报：美元路径与黄金敏感性", "inst_ing", "authorized_partner", False, ["外汇", "贵金属"], "2026-04-08T00:00:00Z"),
    ("rep_wisdomtree_gold", "WisdomTree 黄金配置：避险与实际利率", "inst_wisdomtree", "authorized_partner", False, ["贵金属"], "2026-04-06T00:00:00Z"),
    ("rep_ecb_stability", "欧洲央行稳定评估：非银金融风险", "inst_ecb", "official_public", False, ["金融稳定"], "2026-04-04T00:00:00Z"),
    ("rep_bis_ai_credit", "BIS 专题：AI 融资与信用风险", "inst_bis", "official_public", False, ["金融稳定", "AI"], "2026-04-02T00:00:00Z"),
]


def module_envelope(module_type: str, report_id: str, title: str, institution_name: str, *, fallback: bool = False) -> dict[str, Any]:
    """Build the placeholder payload of one display module for a synthetic report.

    All module payloads are assembled on every call and only the requested
    one is returned. The text is generic filler parameterised by *title*.

    Args:
        module_type: key of the module to return.
        report_id: used in ``basic_info``, to derive the audio id, and to
            special-case ``rep_pas_silver`` (no source URL, gray-source note).
        title: interpolated into several text fields.
        institution_name: accepted but not used by this function.
        fallback: marks ``structure_graph`` as derived from
            ``query_dimensions`` instead of ``mind_map``.

    Returns:
        A dict holding ``content`` and/or ``preview`` / ``full`` entries.

    Raises:
        KeyError: if *module_type* is not built here; there is no
            ``related_sources`` or ``institution`` entry.
    """
    base = {
        "basic_info": {"content": {"report_id": report_id, "title_cn": title, "summary_cn": f"{title} 的基础信息，包含发布机构、发布时间、主题标签和来源层级。", "topics": ["贵金属"], "interpretation_label": "研报解读"}},
        "executive_overview": {
            "preview": {"preview_summary": f"{title} 的结构化摘要，聚焦核心结论、数据线索与风险边界。", "section_count": 3, "key_quote_snippet": "公开研报显示关键变量正在重新定价。"},
            "full": {"intro_cn": f"{title} 的执行摘要。", "sections": [{"heading": "核心结论", "body": "报告把需求、价格和风险拆成可读结构。"}, {"heading": "数据线索", "body": "关键指标用于判断趋势是否可持续。"}, {"heading": "风险边界", "body": "外部冲击和估值回摆仍可能改变短期路径。"}], "source_artifacts": ["native_briefing_doc", "native_blog_post"]},
        },
        "core_insights": {"content": {"points": [{"kind": "view", "text": "核心变量从情绪驱动转向结构驱动。"}, {"kind": "number", "text": "多项关键指标出现同步变化。"}, {"kind": "risk", "text": "若宏观假设反转，短期波动会放大。"}]}, "full": {"dimensions": [{"dimension": "需求结构", "summary": "机构、ETF 与产业需求变化共同影响价格。"}, {"dimension": "风险路径", "summary": "利率、美元和地缘冲击是主要风险因子。"}]}},
        # The preview counts in the modules below are fixed placeholder numbers,
        # not derived from the single sample entry each "full" payload carries.
        "key_data": {"preview": {"preview_headline": "10 个关键数据点", "highlights": ["央行购金保持韧性", "ETF 资金重新流入", "库存周期出现分化"], "row_count": 10}, "full": {"rows": [{"metric": "样本指标", "value": "10", "unit": "项", "importance": "用于验证关键数据模块渲染", "judgment": "方向性信号清晰"}], "source_artifacts": ["data_table", "query_key_data"]}},
        "source_compliance": {"content": {"source_url": None if report_id == "rep_pas_silver" else "https://example.org/public-report", "source_note": "灰度来源仅展示来源说明，不提供原文链接。" if report_id == "rep_pas_silver" else "原文来源于机构公开研究页。", "copyright_cn": "内容基于机构公开研报的中文结构化解读。", "disclaimer": "本内容不构成投资建议。", "ai_generated_label": "AI 辅助生成"}},
        "differentiated_view": {"preview": {"preview_headline": "3 处与共识的关键分歧", "highlights": ["结构性买盘强于短期情绪", "库存周期解释部分价格韧性"], "divergence_count": 3}, "full": {"divergences": [{"topic": "买盘结构", "consensus_view": "价格主要由短期情绪驱动。", "report_position": "报告强调更稳定的结构性买盘。"}]}},
        "weaknesses": {"preview": {"preview_headline": "3 处质疑点与开放问题", "highlights": ["样本窗口偏短", "反方向证据仍需跟踪"], "item_count": 3, "disclaimer_brief": "AI 辅助论证质量分析"}, "full": {"disclaimer_cn": "仅供学习参考，不构成投资建议。", "verification_notes": ["这些开放问题需要结合后续数据、原文脚注和反方向证据继续验证。"], "items": [{"topic": "样本窗口", "weakness": "短周期数据可能放大结论。", "counter_evidence": "后续数据可能修正方向。"}]}},
        "timeline": {"preview": {"preview_headline": "5 个关键事件节点", "date_range": "2025-2026", "highlights": ["2026：价格重新定价", "2025：资金结构切换"], "event_count": 5}, "full": {"events": [{"date": "2026-05", "period_type": "review_period", "event": "报告发布", "impact": "为市场判断提供公开依据。"}]}},
        "study_guide": {"preview": {"preview_headline": "学习指南", "faq_count": 3, "glossary_count": 5, "sample_question": "这份报告适合谁读？"}, "full": {"intro_cn": "学习指南帮助读者理解术语和关键问题。", "faq_items": [{"question": "这份报告适合谁读？", "answer": "适合关注宏观、商品和资产配置的读者。"}], "glossary": [{"term": "source_tier", "definition": "来源可信层级。"}]}},
        "structure_graph": {"preview": {"preview_headline": "研报结构图", "root": f"{title}：分析框架", "top_nodes": ["需求", "价格", "风险"], "fallback_derived": fallback}, "full": {"root": f"{title}：分析框架", "nodes": [{"label": "需求", "children": ["机构", "产业", "投资"]}, {"label": "价格", "children": ["利率", "美元", "库存"]}], "fallback_derived": fallback, "source_artifacts": ["query_dimensions"] if fallback else ["mind_map"]}},
        # audio_id follows the same "aud_<report id without rep_>" scheme used for AudioAsset rows in import_seed.
        "audio": {"content": {"audio_id": f"aud_{report_id.removeprefix('rep_')}", "title_cn": f"{title} 音频摘要", "duration_sec": 180, "chapters": []}},
    }
    return base[module_type]


def rich_module_types(report_id: str) -> list[str]:
    """Return a new list of the module type names configured for *report_id*.

    Most reports share six core modules; the reports listed below append
    extra modules to that core. ``rep_wisdomtree_outlook`` is the one report
    whose list is not built on the core (it has no ``key_data``). Report ids
    not listed here get exactly the core modules.
    """
    if report_id == "rep_wisdomtree_outlook":
        return ["basic_info", "executive_overview", "core_insights", "source_compliance", "institution", "timeline"]

    core_modules = ("basic_info", "executive_overview", "core_insights", "key_data", "source_compliance", "institution")
    extra_modules: dict[str, tuple[str, ...]] = {
        REAL_SAMPLE_REPORT_ID: (
            "differentiated_view",
            "weaknesses",
            "timeline",
            "study_guide",
            "structure_graph",
            "related_sources",
            "audio",
        ),
        "rep_ssga_gold": ("differentiated_view", "weaknesses", "timeline", "study_guide", "structure_graph", "audio"),
        "rep_wb_pinksheet": ("timeline", "study_guide", "audio"),
        "rep_iea_omr": ("study_guide", "structure_graph", "audio"),
        "rep_ing_gold": (),
        "rep_usgs_minerals": ("timeline", "structure_graph", "audio"),
        "rep_pas_silver": (),
        "rep_eia_steo": ("study_guide", "audio"),
    }
    return [*core_modules, *extra_modules.get(report_id, ())]


async def reset(session: AsyncSession) -> None:
    """Delete every row from each seeded table, then commit.

    One unconditional DELETE is issued per model, in the order listed.
    """
    # NOTE(review): the order presumably runs from dependent tables to the
    # tables they reference - confirm against the model foreign keys.
    deletion_order = (
        OutboundEvent,
        PlaybackProgress,
        SavedListen,
        ReadingHistory,
        Favorite,
        User,
        RelatedNews,
        AudioAsset,
        DisplayModule,
        DisplayArtifact,
        RawArtifact,
        Report,
        Institution,
    )
    for table_model in deletion_order:
        statement = delete(table_model)
        await session.execute(statement)
    await session.commit()


async def import_seed(session: AsyncSession) -> None:
    """Wipe the seeded tables and repopulate them with the demo data set.

    Steps, all within *session* and committed once at the end:

    1. ``reset`` clears every seeded table (and commits that deletion).
    2. One ``Institution`` row per entry of ``INSTITUTIONS``.
    3. For every entry of ``BASE_REPORTS + LIGHT_REPORTS``: a ``Report``, its
       ``DisplayArtifact``, ``RawArtifact`` rows, ordered ``DisplayModule``
       rows, an ``AudioAsset`` when the report has audio, and a
       ``RelatedNews`` row for the first 15 reports.
    4. Per-institution published-report statistics.
    5. Three demo users, favorites and playback progress for ``user_alpha``.

    Playback progress rows reuse the duration stored on the matching
    ``AudioAsset`` so the two records always agree.
    """
    await reset(session)

    inst_lookup: dict[str, str] = {}
    for inst_id, name_cn, name_en, inst_type, tier, url, topics in INSTITUTIONS:
        inst_lookup[inst_id] = name_cn
        session.add(
            Institution(
                institution_id=inst_id,
                name_cn=name_cn,
                name_en=name_en,
                institution_type=inst_type,
                source_tier=tier,
                website_url=url,
                covered_topics=j(topics),
                intro_cn=f"{name_cn} 的公开研究和数据用于 Phase 1 seed 展示。",
                credibility_note=f"{name_cn} 是 {tier} 来源。",
                status="active",
            )
        )
    await session.flush()

    all_reports = BASE_REPORTS + LIGHT_REPORTS
    # report_id -> duration of the AudioAsset created for it (reused for playback progress).
    audio_durations: dict[str, int] = {}
    publish_blocking_types = {"native_briefing_doc", "native_blog_post", "data_table", "query_dimensions", "query_key_data"}

    for idx, (report_id, title, inst_id, source_tier, has_audio, topics, released) in enumerate(all_reports, start=1):
        is_real_sample = report_id == REAL_SAMPLE_REPORT_ID
        released_at = d(released)
        report_slug = report_id.removeprefix("rep_")
        institution_name = inst_lookup[inst_id]

        # One report is kept as a draft; its modules and audio are seeded as "review".
        display_status = "draft" if report_id == "rep_wisdomtree_outlook" else "published"
        child_status = "published" if display_status == "published" else "review"

        if is_real_sample:
            source_url = "https://www.bis.org/publ/qtrpdf/r_qt2603.htm"
            source_note = "原文为 BIS Quarterly Review, March 2026 的公开研报。"
            original_title = "BIS Quarterly Review, March 2026"
            one_liner = "2025 年底至 2026 年初，全球金融市场在表面平静下出现资金流向切换，AI 融资、贵金属杠杆和非银风险成为主要线索。"
        else:
            if source_tier == "broker_public_gray":
                # Gray-tier sources keep only a source note, no link.
                source_url = None
                source_note = "灰度公开来源，仅保留来源说明，不做默认音频化。"
            else:
                source_url = "https://example.org/public-report"
                source_note = "原文来源于机构公开研究页。"
            original_title = f"{title} original"
            one_liner = f"{title} 的一分钟结构化摘要。"

        session.add(
            Report(
                report_id=report_id,
                report_type="single",
                title_cn=title,
                subtitle_cn="",
                original_title=original_title,
                one_liner=one_liner,
                institution_id=inst_id,
                source_tier=source_tier,
                source_url=source_url,
                source_note=source_note,
                published_at=released_at,
                interpreted_at=released_at,
                released_at=released_at,
                topics=j(topics),
                language="en",
                has_audio=has_audio,
                display_status=display_status,
                display_version=1,
                cache_version=f"{report_id}:v1",
                risk_disclaimer="本内容为公开研报的结构化解读，不构成投资建议。",
                interpretation_label="研报解读",
            )
        )
        # Flush so the report row exists before rows that reference it are added.
        await session.flush()

        da_id = f"da_{report_slug}_v1"
        session.add(
            DisplayArtifact(
                display_artifact_id=da_id,
                report_id=report_id,
                display_version=1,
                title_cn=title,
                summary_cn=f"{title} seed display artifact",
                source_label=institution_name,
                interpretation_label="研报解读",
                ai_generated_label="AI 辅助生成",
                synthesis_type="mixed" if has_audio else "text",
                source_disclosure_text=source_note,
                review_status="published",
                published_at=released_at,
            )
        )
        await session.flush()

        if is_real_sample:
            artifact_types = sample_artifact_types()
        else:
            artifact_types = ["native_briefing_doc", "native_blog_post", "native_study_guide", "data_table", "query_dimensions", "query_key_data"]
        for artifact_type in artifact_types:
            session.add(
                RawArtifact(
                    raw_artifact_id=f"raw_{report_slug}_{artifact_type}",
                    report_id=report_id,
                    artifact_type=artifact_type,
                    payload_format="markdown" if artifact_type != "data_table" else "csv",
                    status="ok",
                    is_publish_blocking=artifact_type in publish_blocking_types,
                    retention_status="retained",
                    ingested_at=released_at,
                )
            )
        if report_id == "rep_iea_omr":
            # Seed one failed, non-blocking artifact; this report's structure
            # graph is marked as fallback-derived below.
            session.add(
                RawArtifact(
                    raw_artifact_id="raw_iea_omr_mind_map",
                    report_id=report_id,
                    artifact_type="mind_map",
                    payload_format="json",
                    status="failed",
                    error="Download failed for mind_map",
                    is_publish_blocking=False,
                    retention_status="retained",
                    ingested_at=released_at,
                )
            )

        # "institution" is listed per report but has no DisplayModule row;
        # the rest are ordered by MODULE_DISPLAY_ORDER (unknown types last).
        module_types = [
            value
            for value in sorted(
                rich_module_types(report_id),
                key=lambda value: MODULE_DISPLAY_ORDER.get(value, len(MODULE_DISPLAY_ORDER)),
            )
            if value != "institution"
        ]
        for order, module_type in enumerate(module_types):
            if is_real_sample:
                payload = real_sample_module_envelope(module_type, report_id, title, institution_name)
            else:
                payload = module_envelope(
                    module_type,
                    report_id,
                    title,
                    institution_name,
                    fallback=(report_id == "rep_iea_omr" and module_type == "structure_graph"),
                )
            module_id = f"mod_{report_slug}_{module_type}"
            # Only payloads with a "full" part get a content reference.
            content_ref = f"rnb/modules/{module_id}.json" if "full" in payload else None
            session.add(
                DisplayModule(
                    module_id=module_id,
                    report_id=report_id,
                    display_artifact_id=da_id,
                    type=module_type,
                    title_cn=MODULE_TITLES.get(module_type, module_type),
                    content_format="json",
                    content=j(payload),
                    content_ref=content_ref,
                    content_etag=etag(payload),
                    source_raw_artifact_ids=j([]),
                    status=child_status,
                    sort_order=order,
                    version=1,
                )
            )

        if has_audio:
            audio_id = f"aud_{report_slug}"
            duration_sec = 180 + idx
            audio_durations[report_id] = duration_sec
            session.add(
                AudioAsset(
                    audio_id=audio_id,
                    report_id=report_id,
                    title_cn=f"{title} 音频摘要",
                    duration_sec=duration_sec,
                    oss_key=f"rnb/audio/{audio_id}.m4a",
                    chapters=j([]),
                    status=child_status,
                    published_at=released_at,
                )
            )

        if idx <= 15:
            session.add(
                RelatedNews(
                    related_news_id=f"news_{idx:03d}",
                    report_id=report_id,
                    title=f"{title} 延伸阅读",
                    source_name="公开财经资讯",
                    source_url="https://example.org/news",
                    published_at=released_at,
                    language="zh",
                    summary_cn="整理自公开财经资讯的延伸阅读。",
                    match_method="manual_curated",
                    match_keywords=j(topics),
                    match_confidence="medium",
                    status="published",
                )
            )

    await session.flush()

    # Per-institution statistics over published reports, newest first.
    for inst_id in inst_lookup:
        published_query = (
            select(Report)
            .where(Report.institution_id == inst_id, Report.display_status == "published")
            .order_by(Report.released_at.desc())
        )
        reports = (await session.execute(published_query)).scalars().all()
        inst = (await session.execute(select(Institution).where(Institution.institution_id == inst_id))).scalar_one()
        inst.report_count = len(reports)
        if reports:
            inst.latest_report_id = reports[0].report_id
            inst.latest_report_at = reports[0].released_at

    users = [
        User(user_id="user_alpha", phone_hash="hash_alpha", display_name="Alpha", status="active"),
        User(user_id="user_history", phone_hash="hash_history", display_name="History", status="active"),
        User(user_id="user_guest_placeholder", display_name="Guest Placeholder", status="disabled"),
    ]
    session.add_all(users)
    await session.flush()

    favorite_report_ids = ["rep_ssga_gold", "rep_wb_pinksheet", "rep_iea_omr", "rep_usgs_minerals", "rep_eia_steo"]
    for idx, report_id in enumerate(favorite_report_ids, start=1):
        session.add(Favorite(favorite_id=f"fav_{idx:03d}", user_id="user_alpha", report_id=report_id, status="active"))

    for idx, report_id in enumerate(["rep_ssga_gold", "rep_wb_pinksheet", "rep_iea_omr"], start=1):
        audio_id = f"aud_{report_id.removeprefix('rep_')}"
        session.add(
            PlaybackProgress(
                progress_id=f"prog_{idx:03d}",
                user_id="user_alpha",
                audio_id=audio_id,
                report_id=report_id,
                position_sec=idx * 30,
                # Same duration as the AudioAsset seeded above for this report.
                duration_sec=audio_durations[report_id],
                completed=False,
            )
        )
    await session.commit()


async def main() -> None:
    """Run ``Base.metadata.create_all`` on the engine, import the seed data, and print a completion line."""
    async with engine.begin() as connection:
        await connection.run_sync(Base.metadata.create_all)

    async with SessionLocal() as db_session:
        await import_seed(db_session)

    print("seed import complete")


# Run the import only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    asyncio.run(main())