feat(wiki): docs-lookup plugin against Quartz contentIndex

New maubot plugin that points at any Quartz-rendered docs site and answers chat queries by full-text searching its emitted /static/contentIndex.json. Default config targets docs.ariege.io (castle-docs). Commands: !ask <query> search corpus; top-N hits with snippet + link !doc <slug-or-title> open a specific page (fuzzy title match) !wiki / !wiki refresh status / force re-index Architecture: - Periodic fetch (default 10 min) of /static/contentIndex.json - In-memory inverted-ish scoring: title hit 5pt, content hit 1pt + freq - No LLM — pure deterministic keyword search; RAG is future Phase 2b - No DB — index is upstream-derived cache, repopulates on bot restart Deployment posture: docs.ariege.io is served from cfaun alongside maubot, so the bot hits it over the host's internal network — works during WAN outages. base-config.yaml exposes docs_url + index_path for adopters pointing at their own site. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-24 16:40:11 +02:00 · 2026-05-24 16:40:11 +02:00 · 8f83d8df5e
commit 8f83d8df5e
parent b7a096a77a
5 changed files with 382 additions and 0 deletions
--- a/wiki/wiki.py
+++ b/wiki/wiki.py
@ -0,0 +1,234 @@
+import asyncio
+import difflib
+import re
+import time
+from typing import Optional
+
+from maubot import MessageEvent, Plugin
+from maubot.handlers import command
+from mautrix.util.config import BaseProxyConfig, ConfigUpdateHelper
+
+
+class Config(BaseProxyConfig):
+    """Plugin configuration listing the settings this plugin understands."""
+
+    def do_update(self, helper: ConfigUpdateHelper) -> None:
+        """Call ``helper.copy`` once for every supported setting key."""
+        for key in (
+            "docs_url",
+            "index_path",
+            "refresh_minutes",
+            "max_results",
+            "snippet_chars",
+            "site_name",
+        ):
+            helper.copy(key)
+
+
+# Matches a whole message consisting of "!ask", "!doc" or "!wiki", optionally
+# followed by whitespace and an argument. Group 1 is the verb, group 2 the
+# argument; DOTALL lets the argument span several lines.
+_CMD_RE = re.compile(r"^!(ask|doc|wiki)(?:[ \t\r\n]+(.*))?$", re.DOTALL)
+# Runs of ASCII letters/digits; _tokens() uses it to split text into words.
+_TOKEN_RE = re.compile(r"[a-z0-9]+", re.IGNORECASE)
+# Lowercase words that _tokens() discards so they never count as search terms.
+_STOPWORDS = frozenset({
+    "a", "an", "and", "as", "at", "be", "by", "for", "from", "how", "i",
+    "in", "is", "it", "of", "on", "or", "than", "that", "the", "this",
+    "to", "was", "what", "when", "where", "which", "who", "why", "with",
+    "do", "does", "did", "are", "we", "you", "our", "my", "me",
+})
+
+
+def _tokens(s: str) -> list[str]:
+    """Split *s* into lowercased alphanumeric words, leaving out stopwords.
+
+    Order and duplicates are preserved.
+    """
+    words = []
+    for raw in _TOKEN_RE.findall(s):
+        word = raw.lower()
+        if word in _STOPWORDS:
+            continue
+        words.append(word)
+    return words
+
+
+def _make_url(base: str, slug: str) -> str:
+    """Join *base* and *slug* with a single slash between them.
+
+    Trailing slashes on *base* and leading slashes on *slug* are dropped
+    before joining.
+    """
+    root = base.rstrip("/")
+    path = slug.lstrip("/")
+    return root + "/" + path
+
+
+def _snippet(content: str, terms: list[str], width: int) -> str:
+    """Return a single-line excerpt of *content*, roughly *width* chars long.
+
+    The excerpt is centred on the earliest case-insensitive substring
+    occurrence of any of *terms*. When no term occurs, the head of the
+    content is returned instead. An ellipsis marks each side on which
+    content was cut off.
+
+    Args:
+        content: Full page text; an empty value yields ``""``.
+        terms: Lowercased search terms.
+        width: Maximum number of content characters in the excerpt.
+
+    Returns:
+        The excerpt with newlines replaced by spaces. The earliest-matching
+        term is wrapped in ``**`` wherever it appears as a whole word inside
+        the excerpt.
+    """
+    if not content:
+        return ""
+    lc = content.lower()
+    # Find the term whose first occurrence comes earliest in the content.
+    first = -1
+    matched_term = None
+    for t in terms:
+        idx = lc.find(t)
+        if idx != -1 and (first == -1 or idx < first):
+            first = idx
+            matched_term = t
+    if first == -1:
+        # Flatten newlines here too: callers embed the excerpt in a one-line
+        # Markdown list item, which a raw line break would split.
+        head = content[:width].strip().replace("\n", " ")
+        return head + ("…" if len(content) > width else "")
+    # Start half a window before the hit so the term sits near the middle.
+    start = max(0, first - width // 2)
+    end = min(len(content), start + width)
+    chunk = content[start:end].strip().replace("\n", " ")
+    if matched_term:
+        # Whole-word highlight only; a hit inside a longer word stays plain.
+        chunk = re.sub(
+            rf"(?i)\b({re.escape(matched_term)})\b",
+            r"**\1**",
+            chunk,
+        )
+    prefix = "…" if start > 0 else ""
+    suffix = "…" if end < len(content) else ""
+    return f"{prefix}{chunk}{suffix}"
+
+
+class WikiBot(Plugin):
+    """Chat front-end for a docs site's content index.
+
+    A background task periodically downloads the JSON index located at
+    ``docs_url`` + ``index_path`` and keeps it in memory. ``!ask`` runs a
+    keyword search over it, ``!doc`` opens one page by slug or fuzzy title,
+    and ``!wiki`` reports status or forces a re-index.
+    """
+
+    # Sleep used when ``refresh_minutes`` is missing, non-numeric or <= 0.
+    _FALLBACK_REFRESH_SECONDS = 600.0
+
+    config: Config
+    # slug -> {"title", "content", "tags"}, rebuilt by _refresh().
+    _index: dict
+    # (slug, lowercased title) pairs used for fuzzy title lookup.
+    _slug_titles: list[tuple[str, str]]
+    # time.time() of the last successful refresh; 0.0 means "never".
+    _last_refresh: float
+    # Background task running _refresh_loop(). Defaults to None so stop() is
+    # safe even if start() never got as far as creating it.
+    _refresh_task: Optional[asyncio.Task] = None
+
+    @classmethod
+    def get_config_class(cls):
+        """Return the configuration class used by this plugin."""
+        return Config
+
+    async def start(self) -> None:
+        """Load the config, reset the cache and launch the refresh task."""
+        await super().start()
+        self.config.load_and_update()
+        self._index = {}
+        self._slug_titles = []
+        self._last_refresh = 0.0
+        self._refresh_task = asyncio.create_task(self._refresh_loop())
+
+    async def stop(self) -> None:
+        """Cancel the background refresh task, then run the base-class stop."""
+        task = self._refresh_task
+        if task is not None:
+            task.cancel()
+            self._refresh_task = None
+        await super().stop()
+
+    async def on_external_config_update(self) -> None:
+        """Reload the configuration after it was changed externally."""
+        self.config.load_and_update()
+
+    def _refresh_interval(self) -> float:
+        """Return the pause between refreshes in seconds.
+
+        A missing, non-numeric or non-positive ``refresh_minutes`` falls back
+        to ``_FALLBACK_REFRESH_SECONDS``. Without this, such a value would
+        either crash the loop or make it re-fetch without pause.
+        """
+        raw = self.config["refresh_minutes"]
+        try:
+            seconds = float(raw) * 60
+        except (TypeError, ValueError):
+            seconds = 0.0
+        if not seconds > 0:  # also rejects NaN
+            self.log.warning(
+                "invalid refresh_minutes %r; using %d seconds",
+                raw,
+                self._FALLBACK_REFRESH_SECONDS,
+            )
+            return self._FALLBACK_REFRESH_SECONDS
+        return seconds
+
+    async def _refresh_loop(self) -> None:
+        """Refresh the index forever, logging failures and retrying later.
+
+        Cancellation is not caught here: ``asyncio.CancelledError`` does not
+        derive from ``Exception``, so it ends the loop when stop() cancels
+        the task.
+        """
+        while True:
+            try:
+                await self._refresh()
+            except Exception:
+                self.log.exception("wiki refresh failed; will retry")
+            await asyncio.sleep(self._refresh_interval())
+
+    async def _refresh(self) -> None:
+        """Download the content index and swap it in as the live cache.
+
+        The new index is built completely before being assigned, so a
+        failure part-way through leaves the previous index in place.
+
+        Raises:
+            ValueError: If the payload is not a JSON object.
+            Exception: Whatever the HTTP request or JSON decoding raises.
+        """
+        url = _make_url(self.config["docs_url"], self.config["index_path"])
+        async with self.http.get(url) as resp:
+            resp.raise_for_status()
+            data = await resp.json(content_type=None)
+        if not isinstance(data, dict):
+            raise ValueError(
+                f"unexpected index payload from {url}: {type(data).__name__}"
+            )
+        new_index = {}
+        slug_titles = []
+        for slug, entry in data.items():
+            if not isinstance(entry, dict):
+                # One malformed entry should not discard the whole index.
+                continue
+            title = (entry.get("title") or slug).strip()
+            content = entry.get("content") or ""
+            new_index[slug] = {
+                "title": title,
+                "content": content,
+                "tags": entry.get("tags") or [],
+            }
+            slug_titles.append((slug, title.lower()))
+        self._index = new_index
+        self._slug_titles = slug_titles
+        self._last_refresh = time.time()
+        self.log.info("wiki refresh: %d docs from %s", len(new_index), url)
+
+    def _search(self, query: str, limit: int) -> list[tuple[float, str, dict]]:
+        """Return up to *limit* ``(score, slug, doc)`` hits, best first.
+
+        Matching is case-insensitive and substring-based. Each query term
+        adds 5 points when it occurs in the title, and 1 point plus 0.1 per
+        occurrence when it occurs in the content. Pages scoring 0 are
+        omitted; a query with no usable terms yields an empty list.
+        """
+        terms = _tokens(query)
+        if not terms:
+            return []
+        hits = []
+        for slug, doc in self._index.items():
+            title_lc = doc["title"].lower()
+            content_lc = doc["content"].lower()
+            score = 0.0
+            for t in terms:
+                if t in title_lc:
+                    score += 5.0
+                if t in content_lc:
+                    score += 1.0 + 0.1 * content_lc.count(t)
+            if score > 0:
+                hits.append((score, slug, doc))
+        # The sort is stable, so equal scores keep index order.
+        hits.sort(key=lambda x: x[0], reverse=True)
+        return hits[:limit]
+
+    def _lookup(self, query: str) -> tuple[Optional[str], Optional[dict]]:
+        """Resolve *query* to a single page.
+
+        Tried in order: the lowercased query as an exact slug key; a slug
+        that equals the query or ends with ``/<query>`` ignoring case; the
+        closest title according to ``difflib`` with a 0.6 cutoff.
+
+        Returns:
+            ``(slug, doc)`` for the first match, or ``(None, None)``.
+        """
+        q = query.strip().lower()
+        if not q:
+            return None, None
+        if q in self._index:
+            return q, self._index[q]
+        for slug in self._index:
+            slug_lc = slug.lower()
+            if slug_lc.endswith("/" + q) or slug_lc == q:
+                return slug, self._index[slug]
+        candidates = [t for _, t in self._slug_titles]
+        match = difflib.get_close_matches(q, candidates, n=1, cutoff=0.6)
+        if match:
+            # Map the winning title back to the first slug that carries it.
+            for slug, title in self._slug_titles:
+                if title == match[0]:
+                    return slug, self._index[slug]
+        return None, None
+
+    @command.passive(regex=_CMD_RE)
+    async def dispatch(self, evt: MessageEvent, match) -> None:
+        """Route a matched ``!ask`` / ``!doc`` / ``!wiki`` message to its handler."""
+        verb = match[1].lower()
+        body = (match[2] or "").strip()
+        if verb == "ask":
+            await self._handle_ask(evt, body)
+        elif verb == "doc":
+            await self._handle_doc(evt, body)
+        elif verb == "wiki":
+            await self._handle_wiki(evt, body)
+
+    async def _handle_ask(self, evt: MessageEvent, body: str) -> None:
+        """Reply with the best search hits for *body*, one bullet per page."""
+        if not body:
+            await evt.reply(
+                f"Usage: `!ask <question>` — search {self.config['site_name']}."
+            )
+            return
+        if not self._index:
+            await evt.reply("Wiki index isn't ready yet; try again in a moment.")
+            return
+        hits = self._search(body, self.config["max_results"])
+        if not hits:
+            await evt.reply(f"No matches in {self.config['site_name']} for that.")
+            return
+        terms = _tokens(body)
+        snippet_chars = self.config["snippet_chars"]
+        lines = [f"**{self.config['site_name']} — {len(hits)} match(es):**"]
+        for _score, slug, doc in hits:
+            url = _make_url(self.config["docs_url"], slug)
+            snip = _snippet(doc["content"], terms, snippet_chars)
+            lines.append(f"- **[{doc['title']}]({url})** — {snip}")
+        await evt.reply("\n".join(lines))
+
+    async def _handle_doc(self, evt: MessageEvent, body: str) -> None:
+        """Reply with a link to, and preview of, the page named by *body*."""
+        if not body:
+            await evt.reply("Usage: `!doc <slug-or-title>` — open a specific page.")
+            return
+        if not self._index:
+            await evt.reply("Wiki index isn't ready yet; try again in a moment.")
+            return
+        slug, doc = self._lookup(body)
+        if slug is None or doc is None:
+            await evt.reply(
+                f"No page matches `{body}`. Try `!ask {body}` for a fuzzy search."
+            )
+            return
+        url = _make_url(self.config["docs_url"], slug)
+        # A single-page preview gets twice the room of a search snippet.
+        snippet_chars = self.config["snippet_chars"] * 2
+        preview = (doc["content"] or "").strip().replace("\n", " ")
+        if len(preview) > snippet_chars:
+            preview = preview[:snippet_chars].rstrip() + "…"
+        await evt.reply(f"**[{doc['title']}]({url})**\n{preview}")
+
+    async def _handle_wiki(self, evt: MessageEvent, body: str) -> None:
+        """Handle ``!wiki`` (status) and ``!wiki refresh`` (forced re-index)."""
+        body = body.strip().lower()
+        if body == "refresh":
+            try:
+                await self._refresh()
+            except Exception as e:
+                self.log.exception("manual wiki refresh failed")
+                await evt.reply(f"Refresh failed: {e}")
+            else:
+                # Sent outside the try so a failed send is not misreported
+                # as a failed refresh.
+                await evt.reply(f"🔄 Refreshed {len(self._index)} docs.")
+            return
+        if body in ("", "status"):
+            age = int(time.time() - self._last_refresh) if self._last_refresh else None
+            age_str = f"{age}s ago" if age is not None else "never"
+            await evt.reply(
+                f"**{self.config['site_name']}** — {len(self._index)} docs, "
+                f"last refresh: {age_str}\n"
+                f"Source: {_make_url(self.config['docs_url'], self.config['index_path'])}\n"
+                f"Use `!ask <query>` or `!doc <slug-or-title>`."
+            )
+            return
+        await evt.reply("Usage: `!wiki` (status) or `!wiki refresh` (re-index).")