"""The §4 metadata cache and its two writers. Per §4: Gitea is truth. The cache mirrors only what the left pane and the read surfaces need, and it is rebuildable from Gitea at any time. Per §4.1: two writers — the webhook handler and the periodic reconciler — both read from Gitea and write to the cache. User actions never write to the cache directly; they trigger Git operations through the bot (`bot.py`), and the resulting webhook (or the next reconciler sweep) is what updates the cache. This module provides: - `refresh_meta_repo()` — reads rfcs/ on the meta repo and reconciles cached_rfcs against what's there. Used by both the webhook handler (on meta-repo merge events) and the reconciler. - `refresh_meta_pulls()` — reads open meta-repo PRs and reconciles cached_prs for pr_kind='idea' and friends. Backs the §7.3 pending-ideas disclosure. Per §4.2's "single SQLite file colocated with the FastAPI process," the cache writes happen on the same process that serves reads; lock contention is bounded by the small mutation surface (a few hundred rows at most for v1) and SQLite's WAL mode. """ from __future__ import annotations import asyncio import json import logging from . import db, entry as entry_mod from .config import Config from .gitea import Gitea, GiteaError log = logging.getLogger(__name__) async def refresh_meta_repo(config: Config, gitea: Gitea) -> None: """Re-read rfcs/ on the meta repo and reconcile cached_rfcs. Idempotent. Safe to call on every meta-repo webhook and on every reconciler sweep. """ org, repo = config.gitea_org, config.meta_repo try: files = await gitea.list_dir(org, repo, "rfcs", ref="main") except GiteaError as e: log.warning("refresh_meta_repo: cannot list rfcs/: %s", e) return seen_slugs: set[str] = set() for f in files: if f.get("type") != "file" or not f.get("name", "").endswith(".md"): continue result = await gitea.read_file(org, repo, f["path"], ref="main") if not result: continue text, sha = result try: entry = entry_mod.parse(text) except Exception as parse_err: log.warning("refresh_meta_repo: skipping %s: %s", f["path"], parse_err) continue if not entry.slug: log.warning("refresh_meta_repo: skipping %s: missing slug", f["path"]) continue seen_slugs.add(entry.slug) _upsert_cached_rfc(entry, body_sha=sha) # Mark entries removed from the meta repo as withdrawn-without-trace. # In practice the spec keeps withdrawn entries in rfcs/ as historical # record (§3), so this branch fires only for entries deleted out of # band. We leave the row but flag it for reconciler attention. existing = {row["slug"] for row in db.conn().execute("SELECT slug FROM cached_rfcs")} for missing in existing - seen_slugs: log.info("refresh_meta_repo: %s no longer in rfcs/ — leaving cache row in place", missing) def _upsert_cached_rfc(entry: entry_mod.Entry, body_sha: str) -> None: # §6.6: models_json stays NULL when the frontmatter key is absent # (inherit operator universe) and '[]' for the explicit opt-out. models_json = json.dumps(entry.models) if entry.models is not None else None db.conn().execute( """ INSERT INTO cached_rfcs (slug, title, state, rfc_id, repo, proposed_by, proposed_at, graduated_at, graduated_by, owners_json, arbiters_json, tags_json, models_json, body, body_sha, last_entry_commit_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now'), datetime('now')) ON CONFLICT(slug) DO UPDATE SET title = excluded.title, state = excluded.state, rfc_id = excluded.rfc_id, repo = excluded.repo, proposed_by = excluded.proposed_by, proposed_at = excluded.proposed_at, graduated_at = excluded.graduated_at, graduated_by = excluded.graduated_by, owners_json = excluded.owners_json, arbiters_json = excluded.arbiters_json, tags_json = excluded.tags_json, models_json = excluded.models_json, body = excluded.body, body_sha = excluded.body_sha, last_entry_commit_at = datetime('now'), updated_at = datetime('now') """, ( entry.slug, entry.title, entry.state, entry.id, entry.repo, entry.proposed_by, entry.proposed_at, entry.graduated_at, entry.graduated_by, json.dumps(entry.owners), json.dumps(entry.arbiters), json.dumps(entry.tags), models_json, entry.body, body_sha, ), ) async def refresh_rfc_repo(config: Config, gitea: Gitea, slug: str) -> None: """Mirror an active RFC's per-RFC repo into the cache. Reads `RFC.md` on main into `cached_rfcs.body` (per §4 #3), lists branches into `cached_branches`, and lists open PRs into `cached_prs` with `pr_kind='rfc_branch'`. Per §4.1 this runs in two places: a webhook arrival for events on the per-RFC repo, and the reconciler sweep. """ row = db.conn().execute( "SELECT repo, state FROM cached_rfcs WHERE slug = ?", (slug,) ).fetchone() if not row or not row["repo"] or row["state"] != "active": return if "/" not in row["repo"]: log.warning("refresh_rfc_repo: %s has malformed repo %r", slug, row["repo"]) return owner, repo = row["repo"].split("/", 1) # Body on main — populates the discuss-mode default surface per §8.2. try: result = await gitea.read_file(owner, repo, "RFC.md", ref="main") except GiteaError as e: log.warning("refresh_rfc_repo(%s): read_file failed: %s", slug, e) result = None if result is not None: text, sha = result db.conn().execute( """ UPDATE cached_rfcs SET body = ?, body_sha = ?, last_main_commit_at = datetime('now'), updated_at = datetime('now') WHERE slug = ? """, (text, sha, slug), ) # Branches — every branch the bot knows about per §11.5 / §12. try: branches = await gitea.list_branches(owner, repo) except GiteaError as e: log.warning("refresh_rfc_repo(%s): list_branches failed: %s", slug, e) branches = [] seen_branches: set[str] = set() for b in branches: name = b.get("name") or "" if not name: continue seen_branches.add(name) head_sha = (b.get("commit") or {}).get("id") or "" last_commit_at = (b.get("commit") or {}).get("timestamp") db.conn().execute( """ INSERT INTO cached_branches (rfc_slug, branch_name, head_sha, state, last_commit_at) VALUES (?, ?, ?, 'open', ?) ON CONFLICT(rfc_slug, branch_name) DO UPDATE SET head_sha = excluded.head_sha, state = CASE WHEN cached_branches.state = 'closed' THEN 'closed' ELSE 'open' END, last_commit_at = excluded.last_commit_at """, (slug, name, head_sha, last_commit_at), ) # Mark previously-known branches that disappeared as deleted, keeping # the row per §11.5 ("branch removed from Gitea, row remains"). existing = { r["branch_name"] for r in db.conn().execute( "SELECT branch_name FROM cached_branches WHERE rfc_slug = ? AND state != 'deleted'", (slug,), ) } for missing in existing - seen_branches: db.conn().execute( "UPDATE cached_branches SET state = 'deleted' WHERE rfc_slug = ? AND branch_name = ?", (slug, missing), ) # PRs on the per-RFC repo (pr_kind = 'rfc_branch'). Slice 3 owns the # full PR surface; we mirror metadata here so the §8.1 breadcrumb # dropdown's "1 PR" count is honest from Slice 2 onward. repo_full = f"{owner}/{repo}" bot_login = config.gitea_bot_user try: open_pulls = await gitea.list_pulls(owner, repo, state="open") closed_pulls = await gitea.list_pulls(owner, repo, state="closed") except GiteaError as e: log.warning("refresh_rfc_repo(%s): list_pulls failed: %s", slug, e) open_pulls, closed_pulls = [], [] for pull in open_pulls + closed_pulls: head_branch = pull.get("head", {}).get("ref", "") state = _state_from_pull(pull) gitea_opener = (pull.get("user") or {}).get("login") or "" opened_by = _resolve_actor( gitea_opener, bot_login, slug, pull["number"], pull.get("body") or "", ) # §10.8: distinguish "user withdrew" from "Gitea closed for any # other reason." The bot's withdraw action lands in the actions # log; if we see it, surface state='withdrawn'. if state == "closed": withdrew = db.conn().execute( """ SELECT 1 FROM actions WHERE action_kind = 'withdraw_branch_pr' AND rfc_slug = ? AND pr_number = ? LIMIT 1 """, (slug, pull["number"]), ).fetchone() if withdrew: state = "withdrawn" merge_commit_sha = pull.get("merge_commit_sha") db.conn().execute( """ INSERT INTO cached_prs (rfc_slug, pr_kind, repo, pr_number, title, description, state, opened_by, opened_at, merged_at, closed_at, head_branch, base_branch, head_sha, merge_commit_sha) VALUES (?, 'rfc_branch', ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(repo, pr_number) DO UPDATE SET title = excluded.title, description = excluded.description, state = excluded.state, opened_by = excluded.opened_by, merged_at = excluded.merged_at, closed_at = excluded.closed_at, head_sha = excluded.head_sha, merge_commit_sha = COALESCE(excluded.merge_commit_sha, cached_prs.merge_commit_sha) """, ( slug, repo_full, pull["number"], pull.get("title") or "", pull.get("body") or "", state, opened_by, pull.get("created_at"), pull.get("merged_at"), pull.get("closed_at"), head_branch, (pull.get("base") or {}).get("ref") or "main", (pull.get("head") or {}).get("sha"), merge_commit_sha, ), ) # §10.9: an explicit `Supersedes: #N` trailer on a merged PR's # body bumps the predecessor's state to closed and records the # supersession. The cache propagates this whether the merge came # via webhook or reconciler. if state == "merged": superseded = _parse_supersedes(pull.get("body") or "") if superseded: db.conn().execute( """ UPDATE cached_prs SET state = 'closed', superseded_by_pr_number = ?, closed_at = COALESCE(closed_at, datetime('now')) WHERE repo = ? AND pr_number = ? AND state = 'open' """, (pull["number"], repo_full, superseded), ) async def refresh_meta_branches(config: Config, gitea: Gitea) -> None: """Mirror the meta repo's branches into `cached_branches` for super-draft edit branches, plus a per-slug `main` row that records the meta-repo main's tip sha so the §10.1 has-commits-ahead check works uniformly across active and super-draft surfaces. Per the §5 super-draft scoping note, super-draft edits are branches on the meta repo. The naming Slice 4 picked is `edit--<6hex>` — structurally `edit//` per §9.5, with dashes in place of slashes per the §19.2 path-routing candidate. """ org, repo = config.gitea_org, config.meta_repo try: branches = await gitea.list_branches(org, repo) except GiteaError as e: log.warning("refresh_meta_branches: %s", e) return meta_main_sha = "" meta_main_ts = None edit_keys_seen: set[tuple[str, str]] = set() for b in branches: name = b.get("name") or "" head_sha = (b.get("commit") or {}).get("id") or "" last_commit_at = (b.get("commit") or {}).get("timestamp") if name == "main": meta_main_sha = head_sha meta_main_ts = last_commit_at continue slug = _slug_from_branch_name(name) if not slug: continue rfc = db.conn().execute( "SELECT state FROM cached_rfcs WHERE slug = ?", (slug,) ).fetchone() if not rfc or rfc["state"] != "super-draft": continue edit_keys_seen.add((slug, name)) db.conn().execute( """ INSERT INTO cached_branches (rfc_slug, branch_name, head_sha, state, last_commit_at) VALUES (?, ?, ?, 'open', ?) ON CONFLICT(rfc_slug, branch_name) DO UPDATE SET head_sha = excluded.head_sha, state = CASE WHEN cached_branches.state = 'closed' THEN 'closed' ELSE 'open' END, last_commit_at = excluded.last_commit_at """, (slug, name, head_sha, last_commit_at), ) # Synthesize a per-slug `main` row for every super-draft entry, so the # §10.1 has-commits-ahead check in api_prs.py works uniformly. The # head_sha is the meta-repo main's tip — every super-draft edit branch # diverges from this single point. if meta_main_sha: super_drafts = db.conn().execute( "SELECT slug FROM cached_rfcs WHERE state = 'super-draft'" ).fetchall() for r in super_drafts: db.conn().execute( """ INSERT INTO cached_branches (rfc_slug, branch_name, head_sha, state, last_commit_at) VALUES (?, 'main', ?, 'open', ?) ON CONFLICT(rfc_slug, branch_name) DO UPDATE SET head_sha = excluded.head_sha, last_commit_at = excluded.last_commit_at """, (r["slug"], meta_main_sha, meta_main_ts), ) # Mark previously-known edit branches that disappeared as deleted per # §11.5 / §12. Keep the row so chat history survives the branch's # deletion in Gitea. known = db.conn().execute( """ SELECT b.rfc_slug, b.branch_name FROM cached_branches b JOIN cached_rfcs r ON r.slug = b.rfc_slug WHERE r.state = 'super-draft' AND b.state != 'deleted' AND b.branch_name != 'main' """ ).fetchall() for k in known: if (k["rfc_slug"], k["branch_name"]) not in edit_keys_seen: db.conn().execute( "UPDATE cached_branches SET state = 'deleted' WHERE rfc_slug = ? AND branch_name = ?", (k["rfc_slug"], k["branch_name"]), ) def _slug_from_branch_name(name: str) -> str | None: """Mirror of `_slug_from_head_branch` for branch-only inputs (no PR body to consult).""" if name.startswith("edit-"): body = name[len("edit-") :] if "-" in body: slug, _hex = body.rsplit("-", 1) return slug or None if name.startswith("edit/"): parts = name.split("/", 2) if len(parts) >= 2: return parts[1] return None async def refresh_meta_pulls(config: Config, gitea: Gitea) -> None: """Reconcile open meta-repo PRs into cached_prs. For Slice 1 we care about pr_kind='idea' (proposing a new entry). Other meta-repo PR kinds (body edits, metadata edits, claims) will be wired in their respective slices. `opened_by` is the **underlying actor**, not the bot login Gitea reports — per §15.9's framing for notifications and per §6.5's On-behalf-of accountability shape. We recover the actor by joining against the `actions` audit log; if no row matches (cache rebuilt from scratch on a deployment that pre-dates the actions log, or a pull we did not author), we fall back to parsing the `On-behalf-of:` trailer from the PR body, then to the raw Gitea login as last resort. """ org, repo = config.gitea_org, config.meta_repo repo_full = f"{org}/{repo}" try: open_pulls = await gitea.list_pulls(org, repo, state="open") closed_pulls = await gitea.list_pulls(org, repo, state="closed") except GiteaError as e: log.warning("refresh_meta_pulls: %s", e) return bot_login = config.gitea_bot_user for pull in open_pulls + closed_pulls: head_branch = pull.get("head", {}).get("ref", "") slug = _slug_from_head_branch(head_branch) if slug is None: continue pr_kind = _kind_from_branch(head_branch) state = _state_from_pull(pull) gitea_opener = (pull.get("user") or {}).get("login") or "" opened_by = _resolve_actor( gitea_opener, bot_login, slug, pull["number"], pull.get("body") or "", ) # §10.8 / Slice 4: a closed body-edit PR may have been withdrawn # by the contributor. Distinguish from a generic Gitea close via # the audit log — same shape api_prs.py uses for rfc_branch PRs. if state == "closed" and pr_kind == "meta_body_edit": withdrew = db.conn().execute( """ SELECT 1 FROM actions WHERE action_kind = 'withdraw_branch_pr' AND rfc_slug = ? AND pr_number = ? LIMIT 1 """, (slug, pull["number"]), ).fetchone() if withdrew: state = "withdrawn" merge_commit_sha = pull.get("merge_commit_sha") db.conn().execute( """ INSERT INTO cached_prs (rfc_slug, pr_kind, repo, pr_number, title, description, state, opened_by, opened_at, merged_at, closed_at, head_branch, base_branch, head_sha, merge_commit_sha) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(repo, pr_number) DO UPDATE SET title = excluded.title, description = excluded.description, state = excluded.state, opened_by = excluded.opened_by, merged_at = excluded.merged_at, closed_at = excluded.closed_at, head_sha = excluded.head_sha, merge_commit_sha = COALESCE(excluded.merge_commit_sha, cached_prs.merge_commit_sha) """, ( slug, pr_kind, repo_full, pull["number"], pull.get("title") or "", pull.get("body") or "", state, opened_by, pull.get("created_at"), pull.get("merged_at"), pull.get("closed_at"), head_branch, (pull.get("base") or {}).get("ref") or "main", (pull.get("head") or {}).get("sha"), merge_commit_sha, ), ) _TRAILER_RE = None def _resolve_actor(gitea_opener: str, bot_login: str, slug: str, pr_number: int, body: str) -> str: """Best effort: collapse the bot's authorship to the underlying actor.""" if gitea_opener and gitea_opener != bot_login: return gitea_opener # Prefer the audit log. row = db.conn().execute( """ SELECT on_behalf_of FROM actions WHERE action_kind IN ('propose_rfc', 'open_body_edit_pr', 'open_branch_pr', 'open_claim_pr', 'open_metadata_pr') AND rfc_slug = ? AND pr_number = ? ORDER BY id LIMIT 1 """, (slug, pr_number), ).fetchone() if row and row["on_behalf_of"]: return row["on_behalf_of"] # Fall back to parsing the On-behalf-of trailer. import re as _re global _TRAILER_RE if _TRAILER_RE is None: _TRAILER_RE = _re.compile(r"On-behalf-of:\s+.*?<([^>]+)>", _re.MULTILINE) m = _TRAILER_RE.search(body) if m: return m.group(1) return gitea_opener or bot_login def _slug_from_head_branch(head_branch: str) -> str | None: if head_branch.startswith("propose/"): return head_branch[len("propose/") :] if head_branch.startswith("edit/"): parts = head_branch.split("/", 2) if len(parts) >= 2: return parts[1] if head_branch.startswith("edit-"): # §9.5 names the structural shape `edit//`, but # FastAPI's default {branch} path-segment matcher refuses slashes # (the §19.2 routing candidate). Slice 4 picks the same dash- # separated workaround Slice 2 used for promote-to-branch: # `edit--<6hex>`. The slug is the middle; the final # dash-segment is a 6-hex suffix. body = head_branch[len("edit-") :] if "-" in body: slug, _hex = body.rsplit("-", 1) return slug or None if head_branch.startswith("claim/"): return head_branch[len("claim/") :] if head_branch.startswith("metadata/"): return head_branch[len("metadata/") :] if head_branch.startswith("metadata-"): # §9.5 metadata-pane PRs use the same dash-separated branch shape # as edit branches, for the same routing reason. body = head_branch[len("metadata-") :] if "-" in body: slug, _hex = body.rsplit("-", 1) return slug or None return None def _kind_from_branch(head_branch: str) -> str: if head_branch.startswith("propose/"): return "idea" if head_branch.startswith("edit/") or head_branch.startswith("edit-"): return "meta_body_edit" if head_branch.startswith("claim/"): return "meta_claim" if head_branch.startswith("metadata/") or head_branch.startswith("metadata-"): return "meta_metadata" return "idea" # fallback _SUPERSEDES_RE = None def _parse_supersedes(body: str) -> int | None: """Parse a `Supersedes: #N` trailer from a PR body per §10.9.""" import re as _re global _SUPERSEDES_RE if _SUPERSEDES_RE is None: _SUPERSEDES_RE = _re.compile(r"^Supersedes:\s*#(\d+)", _re.MULTILINE) m = _SUPERSEDES_RE.search(body or "") return int(m.group(1)) if m else None def _state_from_pull(pull: dict) -> str: if pull.get("merged"): return "merged" if pull.get("state") == "closed": return "closed" return "open" # ----- Reconciler ----- class Reconciler: """Per §4.1: periodic safety-net sweep. Runs in the background, every five minutes by default. Catches up on any webhook the bot missed (downtime, network failure, Gitea flake). If the cache is corrupted, the reconciler rebuilds from scratch — that's the contract. """ def __init__(self, config: Config, gitea: Gitea, interval_seconds: int = 300): self._config = config self._gitea = gitea self._interval = interval_seconds self._task: asyncio.Task | None = None self._stop = asyncio.Event() async def _loop(self) -> None: # One sweep at startup, then on the interval. The startup sweep # is what brings a fresh cache to life on first boot. await self.sweep() while not self._stop.is_set(): try: await asyncio.wait_for(self._stop.wait(), timeout=self._interval) except asyncio.TimeoutError: pass if self._stop.is_set(): break await self.sweep() async def sweep(self) -> None: log.info("reconciler: starting sweep") try: await refresh_meta_repo(self._config, self._gitea) await refresh_meta_branches(self._config, self._gitea) await refresh_meta_pulls(self._config, self._gitea) # Per-RFC repos: refresh each active entry. Meta-repo refresh # must come first so newly-graduated entries land in # cached_rfcs before we try to reach their per-RFC repos. active = [ r["slug"] for r in db.conn().execute( "SELECT slug FROM cached_rfcs WHERE state = 'active' AND repo IS NOT NULL" ) ] for slug in active: await refresh_rfc_repo(self._config, self._gitea, slug) except Exception: log.exception("reconciler: sweep failed") else: log.info("reconciler: sweep complete") def start(self) -> None: if self._task is None: self._task = asyncio.create_task(self._loop()) async def stop(self) -> None: self._stop.set() if self._task is not None: await self._task self._task = None