# dune-tools/character-builder/scripts/extract.py

#!/usr/bin/env python3
"""Extract Dune Awakening game data from saved dune.gaming.tools HTML pages.

Outputs JSON files into ../data/:
  - character-xp.json
  - spec-{combat,crafting,exploration,gathering,sabotage}.json
  - faction-{atreides,harkonnen}.json
  - skills-{benegesserit,mentat,planetologist,swordmaster,trooper}.json
  - index.json  (manifest)

Also copies every referenced icon webp from sample-data/*_files/ into
../frontend/public/icons/ so the SPA can serve them as /icons/<name>.webp,
plus the ability/technique slot backgrounds as slot-ability.png and
slot-technique.png.
"""
import json
import re
import shutil
from html.parser import HTMLParser
from pathlib import Path

# Path(__file__).resolve().parents[0] is this script's directory, so
# parents[1] is its parent and parents[2] its grandparent.
# Input: the saved HTML pages and their "*_files" asset folders.
SAMPLE = Path(__file__).resolve().parents[2] / "sample-data"
# Output: the generated JSON files.
OUT = Path(__file__).resolve().parents[1] / "data"
# Output: icon images copied for the frontend.
ICONS_OUT = Path(__file__).resolve().parents[1] / "frontend" / "public" / "icons"
# NOTE: both output directories are created as soon as this module is
# imported, not only when main() runs.
OUT.mkdir(parents=True, exist_ok=True)
ICONS_OUT.mkdir(parents=True, exist_ok=True)


def find_icon_source(name: str) -> Path | None:
    """Return the path of *name* inside a ``sample-data/*_files`` folder.

    Only direct children of ``SAMPLE`` that are directories whose name ends
    in ``_files`` are searched. The first folder (in ``iterdir`` order) that
    contains *name* wins; ``None`` is returned when none of them does.
    """
    candidates = (
        entry / name
        for entry in SAMPLE.iterdir()
        if entry.is_dir() and entry.name.endswith("_files")
    )
    return next((hit for hit in candidates if hit.exists()), None)


def copy_icons(names: set[str]) -> tuple[int, list[str]]:
    """Copy every icon in *names* into ``ICONS_OUT``.

    Each name is located with ``find_icon_source``. A file is only written
    when the destination is absent or its size differs from the source, so
    repeated runs do not rewrite unchanged icons.

    Returns:
        ``(copied, missing)``. ``copied`` counts the icons whose source was
        found, including ones already up to date and therefore not rewritten.
        ``missing`` lists, in sorted order, the names with no source file.
    """
    copied = 0
    missing: list[str] = []
    # A set of strings iterates in a hash-dependent order that can change
    # from run to run; sorting keeps `missing` (which the caller stores in
    # the manifest) and the copy order reproducible.
    for icon_name in sorted(names):
        src = find_icon_source(icon_name)
        if src is None:
            missing.append(icon_name)
            continue
        dst = ICONS_OUT / icon_name
        if not dst.exists() or src.stat().st_size != dst.stat().st_size:
            shutil.copy2(src, dst)
        copied += 1
    return copied, missing


# ---------- generic <table class="datatable"> extractor ----------
class TableExtractor(HTMLParser):
    """Collect the cell text of tables whose class contains ``datatable``.

    After ``feed()``, ``rows`` holds one list per non-empty ``<tr>``; each
    item is the whitespace-stripped text of a ``<td>``/``<th>`` (text of
    nested elements is concatenated). Any ``</table>`` ends collection until
    the next matching ``<table>`` opens.
    """

    def __init__(self):
        super().__init__()
        # Where the parser currently is.
        self.in_table = False
        self.in_row = False
        self.in_cell = False
        # Row / cell being accumulated.
        self.current_row = []
        self.current_cell = ""
        # Completed rows.
        self.rows = []
        # Never filled by this class; kept so the attribute still exists.
        self.header = []

    def handle_starttag(self, tag, attrs):
        if tag == "table":
            css_class = dict(attrs).get("class") or ""
            if "datatable" in css_class:
                self.in_table = True
            return
        if not self.in_table:
            return
        if tag == "tr":
            self.in_row = True
            self.current_row = []
        elif tag in ("td", "th"):
            self.in_cell = True
            self.current_cell = ""

    def handle_endtag(self, tag):
        if tag == "table":
            self.in_table = False
            return
        if tag == "tr":
            if self.in_row:
                # Rows without any cell are dropped.
                if self.current_row:
                    self.rows.append(self.current_row)
                self.in_row = False
            return
        if tag in ("td", "th") and self.in_cell:
            self.current_row.append(self.current_cell.strip())
            self.in_cell = False

    def handle_data(self, data):
        if not self.in_cell:
            return
        self.current_cell = self.current_cell + data


def parse_table(path: Path) -> list[list[str]]:
    """Parse the HTML file at *path* and return its ``datatable`` rows.

    Each row is a list of stripped cell strings as collected by
    ``TableExtractor``; the header row, if the table has one, is simply the
    first entry.
    """
    parser = TableExtractor()
    # Decode explicitly as UTF-8: the default for read_text() follows the
    # platform locale and can mis-decode or fail on non-UTF-8 systems.
    parser.feed(path.read_text(encoding="utf-8"))
    return parser.rows


def to_int(s: str) -> int:
    """Parse an integer out of display text such as ``"1,234"``.

    Every character that is neither a digit nor ``-`` is discarded before
    conversion. Falsy input, and anything that still is not a valid integer
    afterwards (for example ``"-"`` or ``"1-2"``), yields ``0``.
    """
    kept = "".join(re.findall(r"[\d-]", s or ""))
    try:
        return int(kept)
    except ValueError:
        # Covers the empty string and stray hyphens alike.
        return 0


def extract_xp_table(path: Path, value_keys: list[str]) -> dict:
    """Read a table shaped ``[Level | XP Required | Total XP | ...]``.

    Args:
        path: HTML file containing the table.
        value_keys: Names for the columns after "Level", in column order.
            Keys beyond the cells a row actually has are omitted from that
            row's entry.

    Returns:
        ``{"header": [...], "rows": [...]}``. ``header`` is the first table
        row as-is; each item of ``rows`` is ``{"level": int, <key>: int, ...}``
        with every value parsed by ``to_int``. A file without a table yields
        the same shape with both lists empty, so callers can always index
        ``["rows"]``.
    """
    rows = parse_table(path)
    if not rows:
        return {"header": [], "rows": []}
    header, *body = rows
    out = []
    for cells in body:
        # Skip rows whose level cell is missing or blank.
        if not cells or not cells[0].strip():
            continue
        entry = {"level": to_int(cells[0])}
        # zip() stops at the shorter side, so short rows just get fewer keys.
        for key, cell in zip(value_keys, cells[1:]):
            entry[key] = to_int(cell)
        out.append(entry)
    return {"header": header, "rows": out}


# ---------- specialization perks ----------
# Spec tables have a "Rewards" <td> with rich HTML inside. Each row may grant
# one or more perks; each perk has a name, optional cost badge, description,
# optional effect line, optional cosmetic-unlock flag, and an icon.
#
# These patterns run against the raw page text, so they rely on the exact
# tag and attribute layout written out below.

# One table row: the level (digits only), two more centered cells that hold
# no nested tags, then the rewards cell, captured lazily up to the first
# "</td></tr>". DOTALL lets the rewards cell span several lines.
SPEC_ROW_RE = re.compile(
    r'<tr><td class="text-center">(?P<lvl>\d+)</td>'
    r'<td class="text-center">[^<]*</td>'
    r'<td class="text-center">[^<]*</td>'
    r'<td>(?P<rewards>.*?)</td></tr>',
    re.DOTALL,
)
# One perk inside a rewards cell: from the "flex gap-3" div up to the first
# pair of closing </div> tags (optionally separated by whitespace).
PERK_BLOCK_RE = re.compile(
    r'<div class="flex gap-3">(?P<inner>.*?)</div>\s*</div>',
    re.DOTALL,
)
# Perk name: text of the "font-medium" span.
PERK_NAME_RE = re.compile(r'<span class="font-medium">([^<]+)</span>')
# Cost badge: text of the small rounded span.
PERK_COST_RE = re.compile(
    r'<span class="text-xs px-1\.5 py-0\.5 bg-tb-30 rounded">([^<]+)</span>'
)
# Description: text of a div whose class starts with "text-sm text-neutral-400".
PERK_DESC_RE = re.compile(
    r'<div class="text-sm text-neutral-400[^"]*"[^>]*>([^<]+)</div>'
)
# Effect line: same shape, class starting with "text-sm text-green-400".
PERK_EFFECT_RE = re.compile(
    r'<div class="text-sm text-green-400[^"]*"[^>]*>([^<]+)</div>'
)
# Yellow line, stored under the "bonus" key by extract_spec_perks
# (presumably the cosmetic-unlock flag mentioned above -- confirm).
PERK_BONUS_RE = re.compile(
    r'<div class="text-sm text-yellow-400[^"]*"[^>]*>([^<]+)</div>'
)
# Icon: file name (t_ui_*.webp) at the end of a relative "./.../" src path.
PERK_ICON_RE = re.compile(r'src="\./[^"]*/(t_ui_[^"]+\.webp)"')


def extract_spec_perks(path: Path) -> dict[int, list[dict]]:
    """Return ``{level: [perk, ...]}`` for a specialization track HTML file.

    Every perk dict has a ``"name"``; ``"cost"``, ``"description"``,
    ``"effect"``, ``"bonus"`` and ``"icon"`` are present only when the
    matching markup is found inside the perk block. Levels that grant no
    perk are left out of the result.
    """
    # Function-scope import: the file only imports html.parser at the top.
    from html import unescape

    # Decode explicitly as UTF-8 instead of the locale-dependent default.
    page = path.read_text(encoding="utf-8")
    # Optional text fields, in the order they are added to each perk dict.
    optional_text_fields = (
        ("cost", PERK_COST_RE),
        ("description", PERK_DESC_RE),
        ("effect", PERK_EFFECT_RE),
        ("bonus", PERK_BONUS_RE),
    )
    out: dict[int, list[dict]] = {}
    for row_m in SPEC_ROW_RE.finditer(page):
        rewards_html = row_m.group("rewards")
        # rows with no perks render as <span class="text-gray-500">-</span>
        if "text-gray-500" in rewards_html and "icon-container" not in rewards_html:
            continue
        perks: list[dict] = []
        for blk_m in PERK_BLOCK_RE.finditer(rewards_html):
            inner = blk_m.group("inner")
            name = PERK_NAME_RE.search(inner)
            if not name:
                continue
            # The regexes see raw markup, so character references such as
            # "&amp;" must be decoded by hand (HTMLParser does this for the
            # table cells, regex matching does not).
            perk = {"name": unescape(name.group(1)).strip()}
            for key, pattern in optional_text_fields:
                found = pattern.search(inner)
                if found:
                    perk[key] = unescape(found.group(1)).strip()
            icon = PERK_ICON_RE.search(inner)
            if icon:
                perk["icon"] = icon.group(1)
            perks.append(perk)
        if perks:
            out[int(row_m.group("lvl"))] = perks
    return out


# ---------- skill tree extractor ----------
# Whole-node pattern (role="button" div with a "node..." class, a Skills.*
# data-tag and a style attribute, in that order).
# NOTE(review): no function visible in this file references NODE_RE; nodes
# are located via the data-tag search in extract_skill_tree instead.
NODE_RE = re.compile(
    r'<div\s+role="button"[^>]*class="node[^"]*"[^>]*data-tag="(?P<tag>Skills\.[^"]+)"[^>]*'
    r'style="(?P<style>[^"]*)"[^>]*>'
    r'(?P<inner>.*?)</div>\s*</div>',
    re.DOTALL,
)
# Per-node field patterns, each searched inside a node's text chunk:
# image alt text (used as the node name)
ALT_RE = re.compile(r'alt="([^"]+)"')
# absolute link to the skill's page
HREF_RE = re.compile(r'href="(https://dune\.gaming\.tools/skills/[^"]+)"')
# icon file name (t_ui_icon*.webp) at the end of a relative "./.../" src path
ICON_RE = re.compile(r'src="\./[^"]*/(t_ui_icon[^"]+\.webp)"')
# first two numbers of "grid-area: A / B"; read as (row, col) by _extract_node
GRID_RE = re.compile(r"grid-area:\s*(\d+)\s*/\s*(\d+)")
# point counter text such as ">0/3<"; captures the number after "0/"
MAX_PTS_RE = re.compile(r">0/(\d+)<")

# connector lines: parse SVG <line> within the tree container
# (attributes must appear in the order x1, y1, x2, y2 and hold plain
# non-negative integers for a line to match)
LINE_RE = re.compile(
    r'<line[^>]*\sx1="(?P<x1>\d+)"[^>]*\sy1="(?P<y1>\d+)"[^>]*\sx2="(?P<x2>\d+)"[^>]*\sy2="(?P<y2>\d+)"'
)


# Subtree heading: text of an <h3> whose class list contains "text-xl".
SUBTREE_H3_RE = re.compile(
    r'<h3[^>]*class="[^"]*text-xl[^"]*"[^>]*>([^<]+)</h3>'
)
# Column count N from "grid-template-columns: repeat(N, 72px)".
GRID_COLS_RE = re.compile(r"grid-template-columns:\s*repeat\((\d+),\s*72px\)")


def _extract_node(chunk: str, tag: str) -> dict | None:
    """Build one skill-node record from the markup following its data-tag.

    Args:
        chunk: Page text starting at the node's ``data-tag`` attribute.
        tag: The dotted tag value, e.g. ``Skills.<kind>.<...>.<id>``.

    Returns:
        A dict with ``tag``, ``id``, ``name``, ``kind``, ``row``, ``col``,
        ``maxPoints``, ``icon`` and ``url``, or ``None`` when *chunk* holds
        no ``grid-area`` position. Missing optional parts fall back to: the
        tag's last segment for ``name``, ``1`` for ``maxPoints``, ``None``
        for ``icon`` and ``url``.
    """
    # Function-scope import: the file only imports html.parser at the top.
    from html import unescape

    gm = GRID_RE.search(chunk)
    if not gm:
        return None
    parts = tag.split(".")
    leaf = parts[-1]
    alt = ALT_RE.search(chunk)
    href = HREF_RE.search(chunk)
    icon = ICON_RE.search(chunk)
    max_pts = MAX_PTS_RE.search(chunk)
    return {
        "tag": tag,
        "id": leaf,
        # Attribute values are matched in raw markup, where "&" and quotes
        # appear as character references; decode them for display.
        "name": unescape(alt.group(1)) if alt else leaf,
        "kind": parts[1] if len(parts) > 1 else "Unknown",  # Ability | Attribute | Perk | Spice
        "row": int(gm.group(1)),
        "col": int(gm.group(2)),
        "maxPoints": int(max_pts.group(1)) if max_pts else 1,
        "icon": icon.group(1) if icon else None,
        "url": unescape(href.group(1)) if href else None,
    }


def _map_edges(html_slice: str, nodes: list[dict]) -> list[dict]:
    """Turn connector ``<line>`` elements into node-to-node edges.

    The bounding box of all line endpoints is stretched over the range of
    grid columns/rows used by *nodes*, giving every node an estimated pixel
    centre. Each line endpoint is then assigned to the closest centre.

    Returns:
        ``[{"from": tag, "to": tag}, ...]`` with the two tags in sorted
        order, no self-loops and no duplicates, in first-seen order. Empty
        when there are no lines or no nodes.
    """
    segments = [
        tuple(int(m.group(k)) for k in ("x1", "y1", "x2", "y2"))
        for m in LINE_RE.finditer(html_slice)
    ]
    if not segments or not nodes:
        return []

    xs = [v for x1, _, x2, _ in segments for v in (x1, x2)]
    ys = [v for _, y1, _, y2 in segments for v in (y1, y2)]
    left, right = min(xs), max(xs)
    top, bottom = min(ys), max(ys)

    col_values = [n["col"] for n in nodes]
    row_values = [n["row"] for n in nodes]
    first_col, last_col = min(col_values), max(col_values)
    first_row, last_row = min(row_values), max(row_values)

    # Pixels per grid step; max(1, ...) avoids dividing by zero when all
    # nodes share one column or one row.
    step_x = (right - left) / max(1, (last_col - first_col))
    step_y = (bottom - top) / max(1, (last_row - first_row))

    centers = {}
    for n in nodes:
        centers[n["tag"]] = (
            left + (n["col"] - first_col) * step_x,
            top + (n["row"] - first_row) * step_y,
        )

    def squared_distance(tag: str, x: int, y: int) -> float:
        cx, cy = centers[tag]
        return (cx - x) ** 2 + (cy - y) ** 2

    def nearest(x: int, y: int) -> str | None:
        # min() keeps the first of several equally close tags.
        return min(centers, key=lambda t: squared_distance(t, x, y), default=None)

    edges: list[dict] = []
    seen: set[tuple] = set()
    for x1, y1, x2, y2 in segments:
        a = nearest(x1, y1)
        b = nearest(x2, y2)
        if not (a and b) or a == b:
            continue
        pair = tuple(sorted((a, b)))
        if pair in seen:
            continue
        seen.add(pair)
        edges.append({"from": pair[0], "to": pair[1]})
    return edges


def extract_skill_tree(path: Path, class_id: str, class_name: str) -> dict:
    """Parse a class skill tree into its named subtrees.

    Each class is composed of 3 subtrees (e.g. Swordmaster has "The Blade",
    "The Will", "The Way"). Each subtree is its own CSS grid with its own
    column count, node positions, and connectors. Treating them as one big
    grid (the prior behavior) collapsed all 22 nodes on top of each other.

    Args:
        path: Saved skill-builder HTML page for the class.
        class_id: Identifier stored as ``"id"`` in the result.
        class_name: Display name stored as ``"name"`` in the result.

    Returns:
        ``{"id", "name", "subtrees"}`` where each subtree is
        ``{"name", "cols", "nodes", "edges"}``. Sections without a graph
        container are skipped.
    """
    # Function-scope import: the file only imports html.parser at the top.
    from html import unescape

    # Decode explicitly as UTF-8 instead of the locale-dependent default.
    page = path.read_text(encoding="utf-8")
    # Splitting on a pattern with one capture group yields
    # [preamble, name1, body1, name2, body2, ...]; the preamble is not needed.
    pieces = SUBTREE_H3_RE.split(page)[1:]

    subtrees: list[dict] = []
    for raw_name, body in zip(pieces[0::2], pieces[1::2]):
        # Heading text comes from raw markup, so decode character references.
        name = unescape(raw_name).strip()
        # Drop everything before the graph container. The tail is left
        # untouched: it ends where the next subtree heading (or the page)
        # ends, because of the split above.
        graph_start = body.find('<div class="graph svelte-1dvag2h"')
        if graph_start < 0:
            continue
        body = body[graph_start:]
        cols_m = GRID_COLS_RE.search(body)
        cols = int(cols_m.group(1)) if cols_m else 3

        # Parse nodes inside this subtree; a tag is recorded once, at its
        # first occurrence that yields a node.
        nodes: list[dict] = []
        seen_tags: set[str] = set()
        for m in re.finditer(r'data-tag="(Skills\.[^"]+)"', body):
            tag = m.group(1)
            if tag in seen_tags:
                continue
            # Fixed-size window of markup following the data-tag attribute.
            chunk = body[m.start() : m.start() + 2500]
            node = _extract_node(chunk, tag)
            if node:
                nodes.append(node)
                seen_tags.add(tag)

        edges = _map_edges(body, nodes)
        subtrees.append(
            {"name": name, "cols": cols, "nodes": nodes, "edges": edges}
        )

    return {"id": class_id, "name": class_name, "subtrees": subtrees}


# ---------- main ----------
def main() -> None:
    """Run the whole extraction and write every output file.

    Steps, in order: character XP table, the five specialization tracks
    (with their perks), the two faction standing tables, the class skill
    trees, icon copying, the ``index.json`` manifest, and finally a summary
    printed to stdout. The input files are looked up under ``SAMPLE`` by
    their fixed page titles; only missing skill-builder pages are skipped
    with a message, any other missing input raises when it is read.
    """
    # Manifest written to index.json at the end; filled in step by step.
    manifest = {"xp": {}, "factions": {}, "skills": []}

    # Character XP (200 levels, 6 value columns). Character XP has no
    # "Rewards" column, but every level grants skill / intel points — we
    # synthesize a per-level "Level Reward" perk so the UI can show them.
    char_xp = extract_xp_table(
        SAMPLE / "Character XP Table - Dune Awakening.html",
        [
            "xpRequired",
            "totalXp",
            "skillPoints",
            "totalSkillPoints",
            "intelPoints",
            "totalIntelPoints",
        ],
    )
    for row in char_xp["rows"]:
        # A row may lack these keys when the table row had fewer cells.
        sp = row.get("skillPoints", 0) or 0
        ip = row.get("intelPoints", 0) or 0
        if sp == 0 and ip == 0:
            continue
        parts = []
        if sp > 0:
            parts.append(f"+{sp} Skill Point" + ("s" if sp > 1 else ""))
        if ip > 0:
            parts.append(f"+{ip} Intel Point" + ("s" if ip > 1 else ""))
        row["perks"] = [
            {
                "name": "Level Reward",
                "effect": " · ".join(parts),
            }
        ]
    (OUT / "character-xp.json").write_text(json.dumps(char_xp, indent=2))
    manifest["xp"]["character"] = "character-xp.json"

    # Specialization XP (5 tracks) — also extract perks at each level.
    specs = ["Combat", "Crafting", "Exploration", "Gathering", "Sabotage"]
    for spec in specs:
        src = SAMPLE / f"{spec} Track XP Table - Dune Awakening.html"
        data = extract_xp_table(
            src, ["xpRequired", "totalXp", "intelPoints", "totalIntelPoints"]
        )
        # The same file is read a second time, with regexes, for the perks.
        perks_by_level = extract_spec_perks(src)
        for row in data["rows"]:
            perks = perks_by_level.get(row["level"])
            if perks:
                row["perks"] = perks
        slug = spec.lower()
        (OUT / f"spec-{slug}.json").write_text(json.dumps(data, indent=2))
        manifest["xp"][slug] = f"spec-{slug}.json"

    # Faction standing — different shape: Tier# | TierName | Required Rep | Cumulative
    factions = [("Atreides", "atreides"), ("Harkonnen", "harkonnen")]
    for fac_name, fac_id in factions:
        rows = parse_table(
            SAMPLE / f"House {fac_name} Faction Standing Table - Dune Awakening.html"
        )
        header = rows[0] if rows else []
        tiers = []
        for r in rows[1:]:
            # Rows with fewer than the four expected cells are ignored.
            if not r or len(r) < 4:
                continue
            tiers.append(
                {
                    "tier": to_int(r[0]),
                    "name": r[1].strip(),
                    "standingRequired": to_int(r[2]),
                    "totalStanding": to_int(r[3]),
                }
            )
        (OUT / f"faction-{fac_id}.json").write_text(
            json.dumps({"header": header, "tiers": tiers}, indent=2)
        )
        manifest["factions"][fac_id] = f"faction-{fac_id}.json"

    # Skill trees
    classes = [
        ("Bene Gesserit", "benegesserit"),
        ("Mentat", "mentat"),
        ("Planetologist", "planetologist"),
        ("Swordmaster", "swordmaster"),
        ("Trooper", "trooper"),
    ]
    for cls_name, cls_id in classes:
        path = SAMPLE / f"Dune Awakening Skill Builder - {cls_name}.html"
        # A missing class page is reported and skipped, not fatal.
        if not path.exists():
            print(f"!! missing {path.name}")
            continue
        tree = extract_skill_tree(path, cls_id, cls_name)
        (OUT / f"skills-{cls_id}.json").write_text(json.dumps(tree, indent=2))
        total_nodes = sum(len(st["nodes"]) for st in tree["subtrees"])
        total_edges = sum(len(st["edges"]) for st in tree["subtrees"])
        manifest["skills"].append(
            {
                "id": cls_id,
                "name": cls_name,
                "file": f"skills-{cls_id}.json",
                "subtrees": [st["name"] for st in tree["subtrees"]],
                "nodes": total_nodes,
                "edges": total_edges,
            }
        )

    # ---------- copy referenced icon webps ----------
    # Icon names are gathered by re-reading the JSON files on disk. For the
    # skill trees any existing skills-*.json is used, so a file left over
    # from an earlier run also contributes its icons.
    icon_names: set[str] = set()
    for cls_name, cls_id in classes:
        path = OUT / f"skills-{cls_id}.json"
        if path.exists():
            tree = json.loads(path.read_text())
            for st in tree["subtrees"]:
                for n in st["nodes"]:
                    if n.get("icon"):
                        icon_names.add(n["icon"])
    for spec in specs:
        slug = spec.lower()
        spec_data = json.loads((OUT / f"spec-{slug}.json").read_text())
        for r in spec_data["rows"]:
            for p in r.get("perks", []):
                if p.get("icon"):
                    icon_names.add(p["icon"])
    copied, missing = copy_icons(icon_names)

    # Slot background images for the global Abilities + Techniques loadout —
    # the source HTML references them from a CDN, but local copies live in
    # the per-class _files directories.
    for src_name, dst_name in [
        ("ability.png", "slot-ability.png"),
        ("technique.png", "slot-technique.png"),
    ]:
        src = find_icon_source(src_name)
        # Silently skipped when no local copy exists; these two files are
        # not included in the manifest's count / missing entries.
        if src:
            shutil.copy2(src, ICONS_OUT / dst_name)

    manifest["icons"] = {
        "directory": "frontend/public/icons",
        "served_at": "/icons/",
        "count": copied,
        "missing": missing,
    }

    (OUT / "index.json").write_text(json.dumps(manifest, indent=2))

    # Print summary
    # Counts are taken from the files on disk (re-read), except for the
    # skill and icon figures, which come from the in-memory manifest.
    print("\n=== Extraction summary ===")
    cx = json.loads((OUT / "character-xp.json").read_text())
    print(f"character XP rows: {len(cx['rows'])}  cols: {cx['header']}")
    for spec in ["combat", "crafting", "exploration", "gathering", "sabotage"]:
        d = json.loads((OUT / f"spec-{spec}.json").read_text())
        print(f"  spec {spec:11s} rows: {len(d['rows'])}")
    for fac in ["atreides", "harkonnen"]:
        d = json.loads((OUT / f"faction-{fac}.json").read_text())
        print(f"  faction {fac:9s} tiers: {len(d['tiers'])}  cols: {d['header']}")
    for s in manifest["skills"]:
        print(f"  skills {s['id']:14s} nodes: {s['nodes']:3d}  edges: {s['edges']:3d}")
    print(
        f"  icons copied: {manifest['icons']['count']}"
        + (f"  missing: {len(manifest['icons']['missing'])}" if manifest['icons']['missing'] else "")
    )


if __name__ == "__main__":
    main()