#!/usr/bin/env python3
"""Extract Dune Awakening game data from saved dune.gaming.tools HTML pages.
Outputs JSON files into ../data/:
- character-xp.json
- spec-{combat,crafting,exploration,gathering,sabotage}.json
- faction-{atreides,harkonnen}.json
- skills-{benegesserit,mentat,planetologist,swordmaster,trooper}.json
- index.json (manifest)
"""
import json
import re
from html.parser import HTMLParser
from pathlib import Path
# Input directory: "sample-data" located two levels above the directory that
# contains this file (parents[2]). Presumably where the saved HTML pages
# live -- confirm against the callers that build paths from it.
SAMPLE = Path(__file__).resolve().parents[2] / "sample-data"
# Output directory: "data" next to this file's directory (parents[1]), i.e.
# the "../data" location named in the module docstring.
OUT = Path(__file__).resolve().parents[1] / "data"
# Runs at import time: creates the output directory and any missing parents;
# an already-existing directory is not an error.
OUT.mkdir(parents=True, exist_ok=True)
# ---------- generic extractor ----------
class TableExtractor(HTMLParser):
    """Collect the cell text of every ``<table>`` whose class contains "datatable".

    After feeding HTML, ``rows`` holds one list of stripped cell strings per
    non-empty ``<tr>``; header (``<th>``) and data (``<td>``) cells are treated
    alike. Text from tags nested inside a cell is concatenated into that cell.
    """

    def __init__(self):
        super().__init__()
        # Where the parser currently is inside the document.
        self.in_table = False
        self.in_row = False
        self.in_cell = False
        # Accumulators for the row / cell being read.
        self.current_row = []
        self.current_cell = ""
        # Finished rows, in document order.
        self.rows = []
        # Initialised here but never written by the methods of this class.
        self.header = []

    def handle_starttag(self, tag, attrs):
        """Track entry into a matching table, a row, or a cell."""
        if tag == "table":
            css_classes = dict(attrs).get("class") or ""
            if "datatable" in css_classes:
                self.in_table = True
            return
        if not self.in_table:
            return
        if tag == "tr":
            # A new row starts with an empty cell list.
            self.in_row = True
            self.current_row = []
        elif tag in ("td", "th"):
            # A new cell starts with empty text.
            self.in_cell = True
            self.current_cell = ""

    def handle_endtag(self, tag):
        """Commit the finished cell or row, or leave the table."""
        if tag == "table":
            # Any closing </table> ends capture, whichever table it closes.
            self.in_table = False
            return
        if tag == "tr":
            if self.in_row:
                # Rows that collected no cells are dropped.
                if self.current_row:
                    self.rows.append(self.current_row)
                self.in_row = False
            return
        if tag in ("td", "th") and self.in_cell:
            self.current_row.append(self.current_cell.strip())
            self.in_cell = False

    def handle_data(self, data):
        """Append text to the current cell; text outside a cell is ignored."""
        if not self.in_cell:
            return
        self.current_cell = self.current_cell + data
def parse_table(path: Path):
    """Read the HTML file at *path* and return its "datatable" rows.

    The file is decoded with the platform default encoding (no explicit
    encoding is passed to ``read_text``). The result is the ``rows`` list
    built by ``TableExtractor``: one list of stripped cell strings per row.
    """
    extractor = TableExtractor()
    markup = path.read_text()
    extractor.feed(markup)
    return extractor.rows
def to_int(s: str) -> int:
    """Parse an integer out of *s*, ignoring everything but digits and "-".

    Thousands separators, units and whitespace are discarded. Returns 0 when
    *s* is empty or ``None``, or when the remaining characters do not form a
    valid integer (e.g. "", "-", "1-2").
    """
    kept = "".join(re.findall(r"[\d-]", s or ""))
    try:
        # int() rejects "" and a lone "-" with ValueError, so those cases
        # share the same fallback as any other malformed remainder.
        return int(kept)
    except ValueError:
        return 0
def extract_xp_table(path: Path, value_keys: list[str]) -> list[dict]:
    """Parse a level/XP table shaped ``[Level | XP Required | Total XP | ...]``.

    *value_keys* names the columns that follow the Level column, in order;
    columns beyond the keys given (or keys beyond the cells present) are
    ignored. Every value is converted with ``to_int``.

    Returns ``[]`` when the page yields no rows at all. Otherwise returns
    ``{"header": <first row>, "rows": [{"level": int, <key>: int, ...}, ...]}``.

    NOTE(review): the annotated return type (``list[dict]``) does not match
    the dict returned on the non-empty path -- confirm what callers expect.
    """
    table = parse_table(path)
    if not table:
        return []
    header, *body = table
    entries = []
    for cells in body:
        # Skip rows that have no level cell to key on.
        if not cells or not cells[0].strip():
            continue
        # to_int falls back to 0 instead of raising, so no exception
        # handling is needed around the level conversion.
        entry = {"level": to_int(cells[0])}
        # zip stops at the shorter side, so short rows simply get fewer keys.
        for key, cell in zip(value_keys, cells[1:]):
            entry[key] = to_int(cell)
        entries.append(entry)
    return {"header": header, "rows": entries}
# ---------- specialization perks ----------
# Spec tables have a "Rewards" cell with rich HTML inside. Each row may grant
# one or more perks; each perk has a name, optional cost badge, description,
# optional effect line, optional cosmetic-unlock flag, and an icon.
# NOTE(review): the patterns below look damaged -- the named groups have no
# names ("(?P" is followed directly by the sub-pattern), several string
# literals are split across lines, and the HTML tags they presumably matched
# are missing. They need to be restored from the original file before this
# module can run; the notes below describe only what the visible code shows.

# Matches one table row of a specialization page. extract_spec_perks reads
# the named groups "lvl" and "rewards" from each match. DOTALL lets "." span
# line breaks.
SPEC_ROW_RE = re.compile(
r' | | (?P\d+) | '
r'[^<]* | '
r'[^<]* | '
r'(?P.*?) |
',
re.DOTALL,
)
# Matches one perk block inside a rewards cell. extract_spec_perks reads the
# named group "inner" from each match. DOTALL lets "." span line breaks.
PERK_BLOCK_RE = re.compile(
r'(?P.*?)
\s*',
re.DOTALL,
)
# Perk name: group 1 is a run of characters containing no "<".
PERK_NAME_RE = re.compile(r'([^<]+)')
# Perk cost (stored under "cost"): group 1 is a run of non-"<" characters.
PERK_COST_RE = re.compile(
r'([^<]+)'
)
# Perk description (stored under "description"): group 1 is the text after
# the end of an opening tag.
PERK_DESC_RE = re.compile(
r']*>([^<]+)
'
)
# Perk effect line (stored under "effect"): same shape as the description
# pattern as shown here -- presumably the lost markup told them apart.
PERK_EFFECT_RE = re.compile(
r']*>([^<]+)
'
)
# Perk bonus line (stored under "bonus"): same shape as the description
# pattern as shown here -- presumably the lost markup told them apart.
PERK_BONUS_RE = re.compile(
r']*>([^<]+)
'
)
# Icon file name: group 1 is the last path component of a src="./..." value,
# starting with "t_ui_" and ending with ".webp".
PERK_ICON_RE = re.compile(r'src="\./[^"]*/(t_ui_[^"]+\.webp)"')
def extract_spec_perks(path: Path) -> dict[int, list[dict]]:
    """Map each level of a specialization track page to the perks it grants.

    The file at *path* is read with the platform default encoding and scanned
    with ``SPEC_ROW_RE``; each row's rewards HTML is then split into perk
    blocks with ``PERK_BLOCK_RE``. Every perk dict has a "name" and, when the
    corresponding pattern matches, "cost", "description", "effect", "bonus"
    and "icon". Levels with no perks are omitted; if a level appears more
    than once, the last row wins.
    """
    # Optional text fields, in the order they are added to a perk dict.
    optional_fields = (
        ("cost", PERK_COST_RE),
        ("description", PERK_DESC_RE),
        ("effect", PERK_EFFECT_RE),
        ("bonus", PERK_BONUS_RE),
    )
    page = path.read_text()
    perks_by_level: dict[int, list[dict]] = {}
    for row in SPEC_ROW_RE.finditer(page):
        level = int(row.group("lvl"))
        rewards = row.group("rewards")
        # A row with the grey placeholder styling and no icon container
        # grants nothing.
        if "icon-container" not in rewards and 'text-gray-500' in rewards:
            continue
        found: list[dict] = []
        for block in PERK_BLOCK_RE.finditer(rewards):
            body = block.group("inner")
            name_match = PERK_NAME_RE.search(body)
            if name_match is None:
                # A block without a name is not recorded as a perk.
                continue
            perk = {"name": name_match.group(1).strip()}
            for key, pattern in optional_fields:
                hit = pattern.search(body)
                if hit is not None:
                    perk[key] = hit.group(1).strip()
            # The icon file name is stored as captured, without stripping.
            icon_match = PERK_ICON_RE.search(body)
            if icon_match is not None:
                perk["icon"] = icon_match.group(1)
            found.append(perk)
        if found:
            perks_by_level[level] = found
    return perks_by_level
# ---------- skill tree extractor ----------
NODE_RE = re.compile(
r']*class="node[^"]*"[^>]*data-tag="(?PSkills\.[^"]+)"[^>]*'
r'style="(?P