#!/usr/bin/env python3
"""Extract Dune Awakening game data from saved dune.gaming.tools HTML pages.

Outputs JSON files into ../data/:
  - character-xp.json
  - spec-{combat,crafting,exploration,gathering,sabotage}.json
  - faction-{atreides,harkonnen}.json
  - skills-{benegesserit,mentat,planetologist,swordmaster,trooper}.json
  - index.json (manifest)
"""
import json
import re
from html.parser import HTMLParser
from pathlib import Path

# Input and output directories are resolved relative to this script's location.
SAMPLE = Path(__file__).resolve().parents[2] / "sample-data"
OUT = Path(__file__).resolve().parents[1] / "data"
OUT.mkdir(parents=True, exist_ok=True)


# ---------- generic extractor ----------
class TableExtractor(HTMLParser):
    """Collect the cell text of every row inside ``datatable`` tables.

    A table is tracked when its ``class`` attribute contains the substring
    ``"datatable"``. Each completed, non-empty row is appended to ``rows`` as
    a list of stripped cell strings; ``<th>`` and ``<td>`` cells are treated
    alike, so header rows end up in ``rows`` as well.
    """

    def __init__(self):
        super().__init__()
        self.in_table = False
        self.in_row = False
        self.in_cell = False
        self.current_row = []
        self.current_cell = ""
        self.rows = []
        # Initialised empty; nothing in this class fills it in.
        self.header = []

    def handle_starttag(self, tag, attrs):
        """Track entry into a matching table, a row, or a cell."""
        attrs = dict(attrs)
        if tag == "table" and "datatable" in (attrs.get("class") or ""):
            self.in_table = True
        elif self.in_table and tag == "tr":
            self.in_row = True
            self.current_row = []
        elif self.in_table and tag in ("td", "th"):
            self.in_cell = True
            self.current_cell = ""

    def handle_endtag(self, tag):
        """Close the current cell/row and store finished rows."""
        if tag == "table":
            # Any </table> ends collection, whichever table it belongs to.
            self.in_table = False
        elif tag == "tr" and self.in_row:
            if self.current_row:
                self.rows.append(self.current_row)
            self.in_row = False
        elif tag in ("td", "th") and self.in_cell:
            self.current_row.append(self.current_cell.strip())
            self.in_cell = False

    def handle_data(self, data):
        """Accumulate text while inside a cell."""
        if self.in_cell:
            self.current_cell += data


def parse_table(path: Path) -> list[list[str]]:
    """Return the rows of the ``datatable`` tables found in the HTML at *path*.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default encoding.
    """
    extractor = TableExtractor()
    extractor.feed(path.read_text(encoding="utf-8"))
    return extractor.rows


def to_int(s: str) -> int:
    """Parse an integer out of *s*, ignoring everything but digits and ``-``.

    Returns 0 when *s* is empty/None or when the remaining characters do not
    form a valid integer (for example ``"-"`` or ``"1-2"``).
    """
    cleaned = re.sub(r"[^\d-]", "", s or "")
    try:
        return int(cleaned)
    except ValueError:
        # Covers "", "-" and any misplaced/multiple hyphens.
        return 0


def extract_xp_table(path: Path, value_keys: list[str]) -> "dict | list":
    """Extract an XP table shaped ``[Level | XP Required | Total XP | ...]``.

    *value_keys* names the columns after Level, in order. Keys for which a
    row has no corresponding cell are left out of that row's entry.

    Returns ``{"header": <first row>, "rows": [{"level": int, <key>: int, ...}]}``.
    When the page contains no table rows at all, an empty list is returned
    instead (kept for backward compatibility with existing callers).
    """
    rows = parse_table(path)
    if not rows:
        return []
    header, *body = rows
    out = []
    for cells in body:
        # Skip rows whose level cell is missing or blank.
        if not cells or not cells[0].strip():
            continue
        entry = {"level": to_int(cells[0])}
        for idx, key in enumerate(value_keys, start=1):
            if idx < len(cells):
                entry[key] = to_int(cells[idx])
        out.append(entry)
    return {"header": header, "rows": out}


# ---------- specialization perks ----------
# NOTE(review): the remainder of this source line is extraction residue. The
# markup inside the comment and inside what appear to be the SPEC_ROW_RE /
# PERK_BLOCK_RE patterns has been stripped. It is kept verbatim below, as a
# comment, until the patterns are restored from the original script.
# Spec tables have a "Rewards" ' r'' r'' r'', re.DOTALL, ) PERK_BLOCK_RE = re.compile( r'
(?P.*?)
\s*', re.DOTALL, ) PERK_NAME_RE = re.compile(r'([^<]+)') PERK_COST_RE = re.compile( r'([^<]+)' ) PERK_DESC_RE = re.compile( r'
]*>([^<]+)
' ) PERK_EFFECT_RE = re.compile( r'
]*>([^<]+)
' ) PERK_BONUS_RE = re.compile( r'
]*>([^<]+)
# NOTE(review): extraction residue closing the preceding (garbled) pattern,
# kept verbatim: ' )
PERK_ICON_RE = re.compile(r'src="\./[^"]*/(t_ui_[^"]+\.webp)"')


def extract_spec_perks(path: Path) -> dict[int, list[dict]]:
    """Return ``{level: [perk, ...]}`` for a specialization track HTML page.

    Every match of ``SPEC_ROW_RE`` supplies a level and a rewards fragment;
    every match of ``PERK_BLOCK_RE`` inside that fragment becomes one perk
    dict. A perk always has ``"name"`` and may have ``"cost"``,
    ``"description"``, ``"effect"``, ``"bonus"`` and ``"icon"``, each present
    only when its pattern matches. Levels without any perk are omitted.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default encoding.
    """
    html = path.read_text(encoding="utf-8")
    # Optional text fields as (output key, pattern), in output-key order.
    text_fields = (
        ("cost", PERK_COST_RE),
        ("description", PERK_DESC_RE),
        ("effect", PERK_EFFECT_RE),
        ("bonus", PERK_BONUS_RE),
    )
    out: dict[int, list[dict]] = {}
    for row_m in SPEC_ROW_RE.finditer(html):
        lvl = int(row_m.group("lvl"))
        rewards_html = row_m.group("rewards")
        # A rewards cell that carries 'text-gray-500' but no 'icon-container'
        # is treated as a row without perks.
        if "text-gray-500" in rewards_html and "icon-container" not in rewards_html:
            continue
        perks: list[dict] = []
        for blk_m in PERK_BLOCK_RE.finditer(rewards_html):
            inner = blk_m.group("inner")
            name = PERK_NAME_RE.search(inner)
            if not name:
                # A block without a name is not a perk.
                continue
            perk = {"name": name.group(1).strip()}
            for key, pattern in text_fields:
                found = pattern.search(inner)
                if found:
                    perk[key] = found.group(1).strip()
            icon = PERK_ICON_RE.search(inner)
            if icon:
                # The icon file name is stored as captured, without stripping.
                perk["icon"] = icon.group(1)
            perks.append(perk)
        if perks:
            out[lvl] = perks
    return out


# ---------- skill tree extractor ----------
# NOTE(review): the NODE_RE definition below is garbled by extraction and runs
# past the end of this line; kept verbatim as a comment until it is restored.
# NODE_RE = re.compile( r']*class="node[^"]*"[^>]*data-tag="(?PSkills\.[^"]+)"[^>]*' r'style="(?P
with rich HTML inside. Each row may grant # one or more perks; each perk has a name, optional cost badge, description, # optional effect line, optional cosmetic-unlock flag, and an icon. SPEC_ROW_RE = re.compile( r'
(?P\d+)[^<]*[^<]*(?P.*?)