dune-tools/character-builder/scripts/extract.py

#!/usr/bin/env python3
"""Extract Dune Awakening game data from saved dune.gaming.tools HTML pages.

Outputs JSON files into ../data/:
  - character-xp.json
  - spec-{combat,crafting,exploration,gathering,sabotage}.json
  - faction-{atreides,harkonnen}.json
  - skills-{benegesserit,mentat,planetologist,swordmaster,trooper}.json
  - index.json  (manifest)

Also copies every referenced icon webp from sample-data/*_files/ into
../frontend/public/icons/ so the SPA can serve them as /icons/<name>.webp.
"""
import json
import re
import shutil
from html.parser import HTMLParser
from pathlib import Path

# Input directory: "sample-data" two levels above the directory containing
# this script. The saved HTML pages and their *_files/ folders are read from here.
SAMPLE = Path(__file__).resolve().parents[2] / "sample-data"
# Output directory for the generated JSON files (../data/ relative to this script).
OUT = Path(__file__).resolve().parents[1] / "data"
# Destination for copied icons (../frontend/public/icons/ relative to this script).
ICONS_OUT = Path(__file__).resolve().parents[1] / "frontend" / "public" / "icons"
# NOTE: both output directories are created as a side effect of importing this
# module, not only when main() runs.
OUT.mkdir(parents=True, exist_ok=True)
ICONS_OUT.mkdir(parents=True, exist_ok=True)


def find_icon_source(name: str) -> Path | None:
    """Return the first existing ``sample-data/*_files/<name>`` path, or ``None``.

    Only direct subdirectories of SAMPLE whose name ends in ``_files`` are
    searched, in the order the filesystem lists them.
    """
    matches = (
        entry / name
        for entry in SAMPLE.iterdir()
        if entry.is_dir()
        and entry.name.endswith("_files")
        and (entry / name).exists()
    )
    # The generator is lazy, so scanning stops at the first hit.
    return next(matches, None)


def copy_icons(names: set[str]) -> tuple[int, list[str]]:
    """Copy referenced icons into ICONS_OUT.

    Args:
        names: Icon file names to locate with find_icon_source().

    Returns:
        ``(copied, missing)``. ``copied`` counts every name whose source file
        was found, including icons that were already in place and therefore
        not rewritten. ``missing`` lists, in sorted order, the names for which
        no source file exists.
    """
    copied = 0
    missing: list[str] = []
    # A set of strings iterates in hash order, which changes from run to run;
    # sorting keeps the ``missing`` list (written to index.json) stable.
    for n in sorted(names):
        src = find_icon_source(n)
        if src is None:
            missing.append(n)
            continue
        dst = ICONS_OUT / n
        # Skip the write when a file of the same size is already present.
        if not dst.exists() or src.stat().st_size != dst.stat().st_size:
            shutil.copy2(src, dst)
        copied += 1
    return copied, missing


# ---------- generic <table class="datatable"> extractor ----------
class TableExtractor(HTMLParser):
    """Collect cell text from tables whose ``class`` attribute contains "datatable".

    After ``feed()``, ``rows`` holds one list of stripped cell strings per
    non-empty ``<tr>``. ``<th>`` and ``<td>`` cells are treated the same way,
    so a header row appears in ``rows`` like any other row.
    """

    def __init__(self):
        super().__init__()
        # Where the parser currently is in the document.
        self.in_table = False
        self.in_row = False
        self.in_cell = False
        # Accumulators for the row and cell being read.
        self.current_row = []
        self.current_cell = ""
        # Finished rows, in document order.
        self.rows = []
        self.header = []

    def handle_starttag(self, tag, attrs):
        if tag == "table":
            css_classes = dict(attrs).get("class") or ""
            if "datatable" in css_classes:
                self.in_table = True
            return
        if not self.in_table:
            return
        if tag == "tr":
            self.in_row = True
            self.current_row = []
        elif tag in ("td", "th"):
            self.in_cell = True
            self.current_cell = ""

    def handle_endtag(self, tag):
        if tag == "table":
            self.in_table = False
        elif tag == "tr":
            if self.in_row:
                # Rows without any cells are dropped.
                if self.current_row:
                    self.rows.append(self.current_row)
                self.in_row = False
        elif tag in ("td", "th"):
            if self.in_cell:
                self.current_row.append(self.current_cell.strip())
                self.in_cell = False

    def handle_data(self, data):
        # Text outside a cell is ignored; text inside nested tags is kept.
        if not self.in_cell:
            return
        self.current_cell += data


def parse_table(path: Path) -> list[list[str]]:
    """Return the rows of every "datatable" table in the HTML file at *path*.

    Each row is a list of stripped cell strings; the header row, if any, is
    the first entry. An empty list means no matching table rows were found.
    """
    extractor = TableExtractor()
    # Decode explicitly: read_text() otherwise uses the locale's encoding,
    # which garbles non-ASCII text on systems whose default is not UTF-8.
    # The saved pages are assumed to be UTF-8.
    extractor.feed(path.read_text(encoding="utf-8"))
    return extractor.rows


def to_int(s: str) -> int:
    """Parse an integer out of a formatted cell such as ``"1,234"``.

    Every character other than digits and ``-`` is discarded first. Anything
    that still is not a valid integer (empty, a lone ``-``, a misplaced sign)
    yields ``0``. ``None`` is accepted and treated as empty.
    """
    digits = re.sub(r"[^\d-]", "", s or "")
    try:
        # int() rejects "" and "-" by itself, so no separate guard is needed.
        return int(digits)
    except ValueError:
        return 0


def extract_xp_table(path: Path, value_keys: list[str]) -> dict:
    """Parse a table shaped ``[Level | XP Required | Total XP | ...]``.

    Args:
        path: Saved HTML page containing the table.
        value_keys: Output key for each column after Level, in column order.

    Returns:
        ``{"header": [...], "rows": [...]}``. ``header`` is the first table
        row as parsed. Each entry of ``rows`` is ``{"level": int, <key>: int,
        ...}``; a key is omitted when its column is absent from that row.
        When the page yields no table rows at all, both lists are empty, so
        callers can always index ``["rows"]`` and ``["header"]``.
    """
    table = parse_table(path)
    if not table:
        # Same shape as the normal result; a bare [] would break callers
        # that index the result by key.
        return {"header": [], "rows": []}
    header, *body = table
    rows = []
    for cells in body:
        # Skip rows whose level cell is blank.
        if not cells or not cells[0].strip():
            continue
        entry = {"level": to_int(cells[0])}
        # zip() stops at the shorter side, so missing trailing columns are
        # simply left out of the entry.
        for key, cell in zip(value_keys, cells[1:]):
            entry[key] = to_int(cell)
        rows.append(entry)
    return {"header": header, "rows": rows}


# ---------- specialization perks ----------
# Spec tables have a "Rewards" <td> with rich HTML inside. Each row may grant
# one or more perks; each perk has a name, optional cost badge, description,
# optional effect line, optional cosmetic-unlock flag, and an icon.
SPEC_ROW_RE = re.compile(
    r'<tr><td class="text-center">(?P<lvl>\d+)</td>'
    r'<td class="text-center">[^<]*</td>'
    r'<td class="text-center">[^<]*</td>'
    r'<td>(?P<rewards>.*?)</td></tr>',
    re.DOTALL,
)
# One perk: everything from the "flex gap-3" wrapper up to the first pair of
# consecutive closing </div> tags.
PERK_BLOCK_RE = re.compile(
    r'<div class="flex gap-3">(?P<inner>.*?)</div>\s*</div>',
    re.DOTALL,
)
PERK_NAME_RE = re.compile(r'<span class="font-medium">([^<]+)</span>')
PERK_COST_RE = re.compile(
    r'<span class="text-xs px-1\.5 py-0\.5 bg-tb-30 rounded">([^<]+)</span>'
)
PERK_DESC_RE = re.compile(
    r'<div class="text-sm text-neutral-400[^"]*"[^>]*>([^<]+)</div>'
)
PERK_EFFECT_RE = re.compile(
    r'<div class="text-sm text-green-400[^"]*"[^>]*>([^<]+)</div>'
)
PERK_BONUS_RE = re.compile(
    r'<div class="text-sm text-yellow-400[^"]*"[^>]*>([^<]+)</div>'
)
PERK_ICON_RE = re.compile(r'src="\./[^"]*/(t_ui_[^"]+\.webp)"')


def extract_spec_perks(path: Path) -> dict[int, list[dict]]:
    """Return ``{level: [perk, ...]}`` for a specialization track HTML page.

    Each perk dict always has ``"name"`` and may have ``"cost"``,
    ``"description"``, ``"effect"``, ``"bonus"`` and ``"icon"`` (the icon's
    file name). Levels that grant no perk are absent from the result.

    Text fields are taken from raw markup, so HTML entities such as ``&amp;``
    are decoded here to match what the table parser produces for plain cells.
    """
    # Local import: the file does not import ``html`` at module level.
    from html import unescape

    # The saved pages are assumed to be UTF-8; do not rely on the locale default.
    page = path.read_text(encoding="utf-8")

    # Optional text fields, in the order they are added to the perk dict.
    optional_text_fields = (
        ("cost", PERK_COST_RE),
        ("description", PERK_DESC_RE),
        ("effect", PERK_EFFECT_RE),
        ("bonus", PERK_BONUS_RE),
    )

    out: dict[int, list[dict]] = {}
    for row_m in SPEC_ROW_RE.finditer(page):
        rewards_html = row_m.group("rewards")
        # rows with no perks render as <span class="text-gray-500">-</span>
        if "text-gray-500" in rewards_html and "icon-container" not in rewards_html:
            continue
        perks: list[dict] = []
        for blk_m in PERK_BLOCK_RE.finditer(rewards_html):
            inner = blk_m.group("inner")
            name = PERK_NAME_RE.search(inner)
            if not name:
                # A block without a name is not a perk entry.
                continue
            perk = {"name": unescape(name.group(1)).strip()}
            for key, pattern in optional_text_fields:
                found = pattern.search(inner)
                if found:
                    perk[key] = unescape(found.group(1)).strip()
            icon = PERK_ICON_RE.search(inner)
            if icon:
                # File name only; kept verbatim so it matches the file on disk.
                perk["icon"] = icon.group(1)
            perks.append(perk)
        if perks:
            out[int(row_m.group("lvl"))] = perks
    return out


# ---------- skill tree extractor ----------
# NOTE: NODE_RE is not used by extract_skill_tree() below, which scans for
# data-tag attributes directly (see SKILL_TAG_RE). It is kept unchanged.
NODE_RE = re.compile(
    r'<div\s+role="button"[^>]*class="node[^"]*"[^>]*data-tag="(?P<tag>Skills\.[^"]+)"[^>]*'
    r'style="(?P<style>[^"]*)"[^>]*>'
    r'(?P<inner>.*?)</div>\s*</div>',
    re.DOTALL,
)
SKILL_TAG_RE = re.compile(r'data-tag="(Skills\.[^"]+)"')
ALT_RE = re.compile(r'alt="([^"]+)"')
HREF_RE = re.compile(r'href="(https://dune\.gaming\.tools/skills/[^"]+)"')
ICON_RE = re.compile(r'src="\./[^"]*/(t_ui_icon[^"]+\.webp)"')
GRID_RE = re.compile(r"grid-area:\s*(\d+)\s*/\s*(\d+)")
MAX_PTS_RE = re.compile(r">0/(\d+)<")

# connector lines: parse SVG <line> within the tree container
LINE_RE = re.compile(
    r'<line[^>]*\sx1="(?P<x1>\d+)"[^>]*\sy1="(?P<y1>\d+)"[^>]*\sx2="(?P<x2>\d+)"[^>]*\sy2="(?P<y2>\d+)"'
)

# Number of characters after a data-tag attribute that are searched for that
# node's grid position, alt text, link, icon and max points.
NODE_WINDOW_CHARS = 2500


def _parse_skill_nodes(page: str) -> list[dict]:
    """Return one node dict per distinct ``data-tag="Skills.…"`` in *page*."""
    # Local import: the file does not import ``html`` at module level.
    from html import unescape

    nodes: dict[str, dict] = {}
    for tag_m in SKILL_TAG_RE.finditer(page):
        tag = tag_m.group(1)
        # The same tag can show up again (e.g. in a tooltip); the first
        # occurrence that carries a grid position wins.
        if tag in nodes:
            continue
        # Rather than balancing <div> tags, look at a fixed-size window after
        # the attribute and take the first match of each detail inside it.
        chunk = page[tag_m.start() : tag_m.start() + NODE_WINDOW_CHARS]
        grid = GRID_RE.search(chunk)
        if not grid:
            continue
        alt = ALT_RE.search(chunk)
        href = HREF_RE.search(chunk)
        icon = ICON_RE.search(chunk)
        max_pts = MAX_PTS_RE.search(chunk)
        # SKILL_TAG_RE guarantees the "Skills." prefix, so there are always
        # at least two dot-separated parts.
        parts = tag.split(".")
        nodes[tag] = {
            "tag": tag,
            "id": parts[-1],
            # Attribute values come from raw markup; decode entities.
            "name": unescape(alt.group(1)) if alt else parts[-1],
            "kind": parts[1],  # Ability | Attribute | Perk | Spice
            "row": int(grid.group(1)),
            "col": int(grid.group(2)),
            "maxPoints": int(max_pts.group(1)) if max_pts else 1,
            "icon": icon.group(1) if icon else None,
            "url": unescape(href.group(1)) if href else None,
        }
    return list(nodes.values())


def _infer_skill_edges(page: str, nodes: list[dict]) -> list[dict]:
    """Map each SVG connector line in *page* to the pair of nodes it joins."""
    segments = [
        tuple(int(ln.group(k)) for k in ("x1", "y1", "x2", "y2"))
        for ln in LINE_RE.finditer(page)
    ]
    if not segments or not nodes:
        return []

    # Calibrate grid -> pixel scale by stretching the node grid's bounding box
    # over the bounding box of all line endpoints.
    # NOTE(review): this assumes the outermost nodes all have connectors;
    # otherwise the scale is off and endpoints may snap to the wrong node.
    xs = [x for x1, _, x2, _ in segments for x in (x1, x2)]
    ys = [y for _, y1, _, y2 in segments for y in (y1, y2)]
    min_x, max_x = min(xs), max(xs)
    min_y, max_y = min(ys), max(ys)
    cols = [n["col"] for n in nodes]
    rows = [n["row"] for n in nodes]
    min_c, max_c = min(cols), max(cols)
    min_r, max_r = min(rows), max(rows)
    # max(1, ...) avoids dividing by zero when all nodes share a row/column.
    sx = (max_x - min_x) / max(1, max_c - min_c)
    sy = (max_y - min_y) / max(1, max_r - min_r)

    centers = {
        n["tag"]: (
            min_x + (n["col"] - min_c) * sx,
            min_y + (n["row"] - min_r) * sy,
        )
        for n in nodes
    }

    def nearest(x, y):
        # Squared Euclidean distance; ties go to the earliest node.
        return min(
            centers,
            key=lambda t: (centers[t][0] - x) ** 2 + (centers[t][1] - y) ** 2,
        )

    edges = []
    seen_edges = set()
    for x1, y1, x2, y2 in segments:
        # Edges are undirected: endpoints are stored in sorted tag order.
        key = tuple(sorted((nearest(x1, y1), nearest(x2, y2))))
        if key[0] == key[1] or key in seen_edges:
            continue
        seen_edges.add(key)
        edges.append({"from": key[0], "to": key[1]})
    return edges


def extract_skill_tree(path: Path, class_id: str, class_name: str) -> dict:
    """Extract the nodes and connector edges of one saved skill-builder page.

    Args:
        path: Saved HTML page of the skill builder for one class.
        class_id: Identifier stored as ``"id"`` in the result.
        class_name: Display name stored as ``"name"`` in the result.

    Returns:
        ``{"id", "name", "nodes", "edges"}``. Each node has ``tag``, ``id``,
        ``name``, ``kind``, ``row``, ``col``, ``maxPoints``, ``icon`` and
        ``url``. Each edge is ``{"from": tag, "to": tag}`` with the two tags
        in sorted order; ``edges`` is empty when the page has no connector
        lines or no nodes.
    """
    # The saved pages are assumed to be UTF-8; do not rely on the locale default.
    page = path.read_text(encoding="utf-8")
    nodes = _parse_skill_nodes(page)
    return {
        "id": class_id,
        "name": class_name,
        "nodes": nodes,
        "edges": _infer_skill_edges(page, nodes),
    }


# ---------- main ----------
def main():
    """Write every JSON data file, copy the referenced icons, print a summary.

    Steps run in this order: character XP, specialization tracks, faction
    standings, skill trees, icon copy, manifest (index.json), console summary.
    """
    manifest = {"xp": {}, "factions": {}, "skills": []}

    # Character XP (200 levels, 6 value columns). Character XP has no
    # "Rewards" column, but every level grants skill / intel points, so a
    # per-level "Level Reward" perk is synthesized for the UI to show.
    character_columns = [
        "xpRequired",
        "totalXp",
        "skillPoints",
        "totalSkillPoints",
        "intelPoints",
        "totalIntelPoints",
    ]
    char_xp = extract_xp_table(
        SAMPLE / "Character XP Table - Dune Awakening.html", character_columns
    )
    for level_row in char_xp["rows"]:
        skill_gain = level_row.get("skillPoints", 0) or 0
        intel_gain = level_row.get("intelPoints", 0) or 0
        if not (skill_gain or intel_gain):
            continue
        effect_parts = []
        if skill_gain > 0:
            effect_parts.append(
                f"+{skill_gain} Skill Point" + ("s" if skill_gain > 1 else "")
            )
        if intel_gain > 0:
            effect_parts.append(
                f"+{intel_gain} Intel Point" + ("s" if intel_gain > 1 else "")
            )
        level_row["perks"] = [
            {"name": "Level Reward", "effect": " · ".join(effect_parts)}
        ]
    (OUT / "character-xp.json").write_text(json.dumps(char_xp, indent=2))
    manifest["xp"]["character"] = "character-xp.json"

    # Specialization XP (5 tracks), with the perks granted at each level.
    specs = ["Combat", "Crafting", "Exploration", "Gathering", "Sabotage"]
    spec_columns = ["xpRequired", "totalXp", "intelPoints", "totalIntelPoints"]
    for spec in specs:
        src = SAMPLE / f"{spec} Track XP Table - Dune Awakening.html"
        data = extract_xp_table(src, spec_columns)
        perks_by_level = extract_spec_perks(src)
        for level_row in data["rows"]:
            level_perks = perks_by_level.get(level_row["level"])
            if level_perks:
                level_row["perks"] = level_perks
        slug = spec.lower()
        spec_file = f"spec-{slug}.json"
        (OUT / spec_file).write_text(json.dumps(data, indent=2))
        manifest["xp"][slug] = spec_file

    # Faction standing has its own shape:
    # Tier# | TierName | Required Rep | Cumulative
    factions = [("Atreides", "atreides"), ("Harkonnen", "harkonnen")]
    for fac_name, fac_id in factions:
        table = parse_table(
            SAMPLE / f"House {fac_name} Faction Standing Table - Dune Awakening.html"
        )
        header = table[0] if table else []
        # Rows with fewer than four cells are skipped.
        tiers = [
            {
                "tier": to_int(cells[0]),
                "name": cells[1].strip(),
                "standingRequired": to_int(cells[2]),
                "totalStanding": to_int(cells[3]),
            }
            for cells in table[1:]
            if cells and len(cells) >= 4
        ]
        faction_file = f"faction-{fac_id}.json"
        (OUT / faction_file).write_text(
            json.dumps({"header": header, "tiers": tiers}, indent=2)
        )
        manifest["factions"][fac_id] = faction_file

    # Skill trees. A missing source page is reported and skipped.
    classes = [
        ("Bene Gesserit", "benegesserit"),
        ("Mentat", "mentat"),
        ("Planetologist", "planetologist"),
        ("Swordmaster", "swordmaster"),
        ("Trooper", "trooper"),
    ]
    for cls_name, cls_id in classes:
        page = SAMPLE / f"Dune Awakening Skill Builder - {cls_name}.html"
        if not page.exists():
            print(f"!! missing {page.name}")
            continue
        tree = extract_skill_tree(page, cls_id, cls_name)
        skills_file = f"skills-{cls_id}.json"
        (OUT / skills_file).write_text(json.dumps(tree, indent=2))
        manifest["skills"].append(
            {
                "id": cls_id,
                "name": cls_name,
                "file": skills_file,
                "nodes": len(tree["nodes"]),
                "edges": len(tree["edges"]),
            }
        )

    # ---------- copy referenced icon webps ----------
    # Icon names are gathered from the JSON files on disk.
    icon_names: set[str] = set()
    for _, cls_id in classes:
        skills_path = OUT / f"skills-{cls_id}.json"
        if not skills_path.exists():
            continue
        tree = json.loads(skills_path.read_text())
        icon_names.update(n["icon"] for n in tree["nodes"] if n.get("icon"))
    for spec in specs:
        spec_data = json.loads((OUT / f"spec-{spec.lower()}.json").read_text())
        icon_names.update(
            perk["icon"]
            for level_row in spec_data["rows"]
            for perk in level_row.get("perks", [])
            if perk.get("icon")
        )
    copied, missing = copy_icons(icon_names)
    manifest["icons"] = {
        "directory": "frontend/public/icons",
        "served_at": "/icons/",
        "count": copied,
        "missing": missing,
    }

    (OUT / "index.json").write_text(json.dumps(manifest, indent=2))

    # Print summary, reading the figures back from the written files.
    print("\n=== Extraction summary ===")
    cx = json.loads((OUT / "character-xp.json").read_text())
    print(f"character XP rows: {len(cx['rows'])}  cols: {cx['header']}")
    for spec in (name.lower() for name in specs):
        d = json.loads((OUT / f"spec-{spec}.json").read_text())
        print(f"  spec {spec:11s} rows: {len(d['rows'])}")
    for fac in (fac_id for _, fac_id in factions):
        d = json.loads((OUT / f"faction-{fac}.json").read_text())
        print(f"  faction {fac:9s} tiers: {len(d['tiers'])}  cols: {d['header']}")
    for s in manifest["skills"]:
        print(f"  skills {s['id']:14s} nodes: {s['nodes']:3d}  edges: {s['edges']:3d}")
    icon_info = manifest["icons"]
    icon_line = f"  icons copied: {icon_info['count']}"
    if icon_info["missing"]:
        icon_line += f"  missing: {len(icon_info['missing'])}"
    print(icon_line)


if __name__ == "__main__":
    main()