From 8f83d8df5e43dbf72b56fd7f374bc7a97819b10f Mon Sep 17 00:00:00 2001 From: Padreug Date: Sun, 24 May 2026 16:40:11 +0200 Subject: [PATCH] feat(wiki): docs-lookup plugin against Quartz contentIndex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New maubot plugin that points at any Quartz-rendered docs site and answers chat queries by full-text searching its emitted /static/contentIndex.json. Default config targets docs.ariege.io (castle-docs). Commands: !ask search corpus; top-N hits with snippet + link !doc open a specific page (fuzzy title match) !wiki / !wiki refresh status / force re-index Architecture: - Periodic fetch (default 10 min) of /static/contentIndex.json - In-memory inverted-ish scoring: title hit 5pt, content hit 1pt + freq - No LLM — pure deterministic keyword search; RAG is future Phase 2b - No DB — index is upstream-derived cache, repopulates on bot restart Deployment posture: docs.ariege.io is served from cfaun alongside maubot, so the bot hits it over the host's internal network — works during WAN outages. base-config.yaml exposes docs_url + index_path for adopters pointing at their own site. Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 1 + wiki/README.md | 117 +++++++++++++++++++++ wiki/base-config.yaml | 21 ++++ wiki/maubot.yaml | 9 ++ wiki/wiki.py | 234 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 382 insertions(+) create mode 100644 wiki/README.md create mode 100644 wiki/base-config.yaml create mode 100644 wiki/maubot.yaml create mode 100644 wiki/wiki.py diff --git a/README.md b/README.md index 35e778f..551177b 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ castle hosts; the actual plugin code lives here. |---|---| | [`journal/`](./journal/) | Farm-journal bot. `!journal ` records what you did, scoped per-user/room/timestamp. `!journal show [@user]` and `!journal today` query back. | | [`tracker/`](./tracker/) | Community-organizer bot. 
`!add` / `!task` / `!sidequest` / `!remind` / `!done` / `!list` / `!setup`. Implements the [Community Organizer spec](./docs/community-organizer-spec.md) — per-room shortcuts, 5-level priority, rules-based inbox classifier. |
+| [`wiki/`](./wiki/) | Docs-lookup bot. `!ask <query>` / `!doc <slug or title>` / `!wiki [refresh\|status]`. Points at any Quartz-rendered docs site (default: `docs.ariege.io`), full-text searches the corpus, replies with snippets + links. Internal-network deployment posture — works during WAN outages. |
 
 ## Community Organizer protocol
 
diff --git a/wiki/README.md b/wiki/README.md
new file mode 100644
index 0000000..31fea58
--- /dev/null
+++ b/wiki/README.md
@@ -0,0 +1,117 @@
+# wiki
+
+Documentation-lookup Matrix bot. Points at any
+[Quartz](https://quartz.jzhao.xyz/)-rendered docs site, periodically
+fetches its `contentIndex.json`, and answers queries in chat.
+
+Designed to be community-portable — works against any Quartz site you
+configure it for, not just `docs.ariege.io`. Adjust `docs_url` per
+instance.
+
+## Commands
+
+```
+!ask <query>          # full-text search the docs, top 3 with snippets
+!doc <slug or title>  # open a specific page (exact slug or fuzzy title)
+!wiki                 # status: doc count, last refresh, source URL
+!wiki refresh         # force re-index now (admin nicety)
+```
+
+## Examples
+
+```
+!ask how do I shut the water off
+!ask alpaca feeding winter
+!ask power outage
+!doc emergency/water-emergency
+!doc water emergency            # fuzzy title match works too
+!wiki                           # are we up to date?
+```
+
+The bot replies with markdown links to the doc pages, so clicking
+through opens the full doc in a browser.
+
+## How it works
+
+Quartz emits `/static/contentIndex.json` as part of its standard build
+— a flat `{slug: {title, content, tags}}` map of every published page.
+The plugin fetches that file on a timer (default every 10 minutes),
+keeps the pages in memory, and scores each one against the query by:
+
+- Title hits: 5 points each
+- Content hits: 1 point + 0.1 × frequency
+
+Top N (default 3) results come back with a short snippet around the
+first match. **No LLM is involved** in v1 — pure deterministic keyword
+search. Phase 2b / future work may add an LLM synthesis step (RAG)
+once the inference layer is up.
+
+## Config
+
+`base-config.yaml` (override per maubot instance from the UI):
+
+```yaml
+docs_url: https://docs.ariege.io        # Quartz site base URL
+index_path: /static/contentIndex.json   # standard Quartz path
+refresh_minutes: 10                     # re-fetch cadence
+max_results: 3                          # !ask hit limit
+snippet_chars: 160                      # snippet window
+site_name: Castle Docs                  # human-readable label in output
+```
+
+For internal-network deployments (the recommended posture — see below),
+set `docs_url: http://<internal-host>` instead of the public URL.
+
+## Deployment posture (Château du Faune)
+
+Both `docs.ariege.io` and the maubot daemon run on **cfaun**. The bot
+hits the docs site over the host's loopback / internal network, so:
+
+- No WAN dependency — the bot works during internet outages
+- The fetch is fast (no TLS handshake to the public internet)
+- If `docs.ariege.io` is down externally, the bot is unaffected
+- Same applies if a future inference node (e.g. a ZeroClaw box) lives
+  on the internal network: it can hit the same internal URL
+
+If you're deploying elsewhere, point `docs_url` at whichever URL the
+bot's host can actually reach.
+
+## Build + iterate
+
+```sh
+cd ~/dev/maubot-plugins/wiki
+zip -j ../wiki.mbp maubot.yaml base-config.yaml *.py
+```
+
+Upload via maubot UI → Plugins → click existing → upload new `.mbp`.
+**Hit Save on the instance** after upload (the standard maubot
+facepalm). For a new instance, edit the config to point at your docs
+site and save.
+
+## Known limitations (v1)
+
+- **No LLM synthesis.** Returns matched passages, not a synthesized
+  answer. RAG (`!ask` → cited synthesized answer) is the natural Phase
+  2b enhancement when the inference node is live.
+- **Stopwords are minimal.** A query made only of stopwords ("how do I")
+  returns nothing — phrase queries with the actual content words
+  ("water shutoff", "winter feeding").
+- **No spell correction on content terms.** Title fuzzy match works
+  for `!doc`; for `!ask` you need to spell the keywords correctly.
+- **No personalization.** Everyone in the room sees the same hits.
+- **No multi-site support per plugin instance.** One Quartz site per
+  maubot instance — to serve a second docs source, install a second
+  instance with a different config.
+
+## Adopting for a different docs site
+
+This plugin is intentionally protocol-agnostic at the content layer —
+anything that emits a `{slug: {title, content}}` JSON map will work.
+For non-Quartz docs sites, you can either:
+
+1. Adapt the upstream build to emit a compatible `contentIndex.json`
+2. Fork this plugin's `_refresh()` to parse your site's index shape
+
+Common alternates worth considering for adopters: MkDocs (with the
+mkdocs-material search plugin), Docusaurus, mdBook, or a custom
+generator.
diff --git a/wiki/base-config.yaml b/wiki/base-config.yaml
new file mode 100644
index 0000000..60d6903
--- /dev/null
+++ b/wiki/base-config.yaml
@@ -0,0 +1,21 @@
+# Wiki lookup config. Point at any Quartz-emitted site:
+# `docs_url` + `index_path` together resolve to the contentIndex.json
+# the bot uses for search. Page links are constructed from docs_url + slug.
+
+docs_url: https://docs.ariege.io
+index_path: /static/contentIndex.json
+
+# How often to re-fetch the content index, in minutes. Lower = fresher
+# but more network chatter. Site refreshes typically happen on git push,
+# so a few minutes lag is normal.
+refresh_minutes: 10
+
+# Max results returned per `!ask` query.
+max_results: 3
+
+# Snippet window around the first match in `!ask` output, in characters.
+snippet_chars: 160
+
+# Human-readable label for the docs site, used in bot output.
+# E.g. "Castle Docs", "Co-op Wiki", "Operations Manual".
+site_name: Castle Docs
diff --git a/wiki/maubot.yaml b/wiki/maubot.yaml
new file mode 100644
index 0000000..aa1eebe
--- /dev/null
+++ b/wiki/maubot.yaml
@@ -0,0 +1,9 @@
+maubot: 0.1.0
+id: dev.aiolabs.wiki
+version: 0.1.0
+license: AGPL-3.0-or-later
+modules:
+  - wiki
+main_class: WikiBot
+database: false
+config: true
diff --git a/wiki/wiki.py b/wiki/wiki.py
new file mode 100644
index 0000000..b8e7948
--- /dev/null
+++ b/wiki/wiki.py
@@ -0,0 +1,323 @@
+"""Docs-lookup maubot plugin backed by a Quartz ``contentIndex.json``.
+
+The index is fetched on a timer and kept in memory; ``!ask``, ``!doc`` and
+``!wiki`` are answered from that cached copy with plain keyword matching.
+"""
+
+import asyncio
+import difflib
+import re
+import time
+from typing import Optional
+
+from maubot import MessageEvent, Plugin
+from maubot.handlers import command
+from mautrix.util.config import BaseProxyConfig, ConfigUpdateHelper
+
+
+class Config(BaseProxyConfig):
+    """Instance configuration; the keys mirror ``base-config.yaml``."""
+
+    def do_update(self, helper: ConfigUpdateHelper) -> None:
+        """Pass every supported key to the config update helper."""
+        for key in (
+            "docs_url",
+            "index_path",
+            "refresh_minutes",
+            "max_results",
+            "snippet_chars",
+            "site_name",
+        ):
+            helper.copy(key)
+
+
+# Group 1 is the command verb, group 2 the optional argument text.
+_CMD_RE = re.compile(r"^!(ask|doc|wiki)(?:[ \t\r\n]+(.*))?$", re.DOTALL)
+# Letters and digits in any script (``\w`` minus "_"), so accented words such
+# as "château" stay one token instead of being split at the non-ASCII letter.
+_TOKEN_RE = re.compile(r"[^\W_]+")
+_STOPWORDS = frozenset({
+    "a", "an", "and", "as", "at", "be", "by", "for", "from", "how", "i",
+    "in", "is", "it", "of", "on", "or", "than", "that", "the", "this",
+    "to", "was", "what", "when", "where", "which", "who", "why", "with",
+    "do", "does", "did", "are", "we", "you", "our", "my", "me",
+})
+# Used when ``refresh_minutes`` is missing or not a number.
+_DEFAULT_REFRESH_MINUTES = 10.0
+# Floor for the refresh delay so a zero/negative setting cannot busy-loop.
+_MIN_REFRESH_SECONDS = 5.0
+
+
+def _tokens(s: str) -> list[str]:
+    """Return the lowercased search terms in *s* with stopwords removed."""
+    words = (w.lower() for w in _TOKEN_RE.findall(s))
+    return [w for w in words if w not in _STOPWORDS]
+
+
+def _make_url(base: str, slug: str) -> str:
+    """Join *base* and *slug* with exactly one ``/`` between them."""
+    return f"{base.rstrip('/')}/{slug.lstrip('/')}"
+
+
+def _snippet(content: str, terms: list[str], width: int) -> str:
+    """Return a one-line excerpt of *content* near the earliest matching term.
+
+    Args:
+        content: Page text from the index.
+        terms: Lowercased search terms, as produced by ``_tokens``.
+        width: Maximum excerpt length in characters, before highlighting.
+
+    Returns:
+        The excerpt with matched words wrapped in ``**``, and ``…`` marking
+        any truncated side. Falls back to the start of the page when no
+        term occurs in it; returns ``""`` for empty content.
+    """
+    # Collapse all whitespace up front: a raw newline in the excerpt would
+    # break the Markdown list item it is embedded in.
+    text = " ".join(content.split()) if content else ""
+    if not text:
+        return ""
+    lowered = text.lower()
+    found = [pos for pos in (lowered.find(t) for t in terms if t) if pos != -1]
+    if not found:
+        return text[:width].rstrip() + ("…" if len(text) > width else "")
+    start = max(0, min(found) - width // 2)
+    end = min(len(text), start + width)
+    chunk = text[start:end].strip()
+    # Search matches substrings, so bold the whole word around each hit; a
+    # strict \bterm\b would skip matches such as "water" in "waterproof".
+    alternation = "|".join(re.escape(t) for t in terms if t)
+    chunk = re.sub(rf"(?i)(\w*(?:{alternation})\w*)", r"**\1**", chunk)
+    prefix = "…" if start > 0 else ""
+    suffix = "…" if end < len(text) else ""
+    return f"{prefix}{chunk}{suffix}"
+
+
+class WikiBot(Plugin):
+    """Answer ``!ask`` / ``!doc`` / ``!wiki`` from a cached Quartz content index."""
+
+    config: Config
+    # slug -> {"title", "content", "tags"}; replaced wholesale by _refresh().
+    _index: dict
+    # (slug, lowercased title) pairs used for fuzzy title lookup.
+    _slug_titles: list[tuple[str, str]]
+    # time.time() of the last successful refresh; 0.0 means "never".
+    _last_refresh: float
+    # Class-level None keeps stop() safe if start() failed before the task existed.
+    _refresh_task: Optional[asyncio.Task] = None
+
+    @classmethod
+    def get_config_class(cls) -> type[BaseProxyConfig]:
+        """Return the config class used by this plugin."""
+        return Config
+
+    async def start(self) -> None:
+        """Load the config, reset the cache and start the background refresher."""
+        await super().start()
+        self.config.load_and_update()
+        self._index = {}
+        self._slug_titles = []
+        self._last_refresh = 0.0
+        self._refresh_task = asyncio.create_task(self._refresh_loop())
+
+    async def stop(self) -> None:
+        """Cancel the background refresher and wait for it to finish."""
+        task, self._refresh_task = self._refresh_task, None
+        if task is not None:
+            task.cancel()
+            # return_exceptions=True absorbs the task's CancelledError so
+            # shutdown continues normally.
+            await asyncio.gather(task, return_exceptions=True)
+        await super().stop()
+
+    async def on_external_config_update(self) -> None:
+        """Re-read the config when this hook is invoked."""
+        self.config.load_and_update()
+
+    def _refresh_delay(self) -> float:
+        """Return the seconds to wait between refreshes, clamped to a floor."""
+        try:
+            minutes = float(self.config["refresh_minutes"])
+        except (KeyError, TypeError, ValueError):
+            minutes = _DEFAULT_REFRESH_MINUTES
+        return max(minutes * 60, _MIN_REFRESH_SECONDS)
+
+    async def _refresh_loop(self) -> None:
+        """Refresh the index forever, logging failures and retrying next cycle."""
+        while True:
+            try:
+                await self._refresh()
+            except Exception:
+                # CancelledError is a BaseException, so cancellation still
+                # propagates; everything else is logged and retried.
+                self.log.exception("wiki refresh failed; will retry")
+            # A bad refresh_minutes value must not raise here: that would end
+            # the loop and silently freeze the index until the next restart.
+            await asyncio.sleep(self._refresh_delay())
+
+    async def _refresh(self) -> None:
+        """Fetch the content index and replace the in-memory copy.
+
+        Raises:
+            ValueError: If the response body is not a JSON object.
+            Errors from the HTTP request or JSON decoding propagate unchanged.
+        """
+        url = _make_url(self.config["docs_url"], self.config["index_path"])
+        async with self.http.get(url) as resp:
+            resp.raise_for_status()
+            # content_type=None skips the response MIME-type check.
+            data = await resp.json(content_type=None)
+        if not isinstance(data, dict):
+            raise ValueError(
+                f"index at {url} is {type(data).__name__}, expected a JSON object"
+            )
+        new_index = {}
+        slug_titles = []
+        for slug, entry in data.items():
+            if not isinstance(entry, dict):
+                # One malformed page should not discard the whole index.
+                self.log.warning("wiki refresh: skipping malformed entry %r", slug)
+                continue
+            title = str(entry.get("title") or slug).strip()
+            new_index[slug] = {
+                "title": title,
+                "content": str(entry.get("content") or ""),
+                "tags": entry.get("tags") or [],
+            }
+            slug_titles.append((slug, title.lower()))
+        # Publish only after parsing succeeded, so a failed refresh keeps
+        # serving the previous index.
+        self._index = new_index
+        self._slug_titles = slug_titles
+        self._last_refresh = time.time()
+        self.log.info("wiki refresh: %d docs from %s", len(new_index), url)
+
+    def _search(self, query: str, limit: int) -> list[tuple[float, str, dict]]:
+        """Return up to *limit* ``(score, slug, doc)`` hits, best first.
+
+        Each term scores 5 for a title substring match and 1 + 0.1 × count
+        for content matches; documents scoring 0 are omitted.
+        """
+        terms = _tokens(query)
+        if not terms:
+            return []
+        hits = []
+        for slug, doc in self._index.items():
+            title_lc = doc["title"].lower()
+            content_lc = doc["content"].lower()
+            score = 0.0
+            for term in terms:
+                if term in title_lc:
+                    score += 5.0
+                occurrences = content_lc.count(term)
+                if occurrences:
+                    score += 1.0 + 0.1 * occurrences
+            if score > 0:
+                hits.append((score, slug, doc))
+        # list.sort is stable, so equal scores keep index order.
+        hits.sort(key=lambda hit: hit[0], reverse=True)
+        return hits[:limit]
+
+    def _lookup(self, query: str) -> tuple[Optional[str], Optional[dict]]:
+        """Resolve *query* to ``(slug, doc)``, or ``(None, None)`` if nothing fits.
+
+        Tries, in order: exact slug, case-insensitive slug or trailing slug
+        path, then the closest title by ``difflib`` (cutoff 0.6).
+        """
+        q = query.strip().strip("/").lower()
+        if not q:
+            return None, None
+        if q in self._index:
+            return q, self._index[q]
+        suffix = "/" + q
+        for slug, doc in self._index.items():
+            slug_lc = slug.lower()
+            if slug_lc == q or slug_lc.endswith(suffix):
+                return slug, doc
+        titles = [title for _, title in self._slug_titles]
+        close = difflib.get_close_matches(q, titles, n=1, cutoff=0.6)
+        if close:
+            for slug, title in self._slug_titles:
+                if title == close[0] and slug in self._index:
+                    return slug, self._index[slug]
+        return None, None
+
+    @command.passive(regex=_CMD_RE)
+    async def dispatch(self, evt: MessageEvent, match) -> None:
+        """Route a matched ``!ask`` / ``!doc`` / ``!wiki`` message to its handler."""
+        verb = match[1].lower()
+        body = (match[2] or "").strip()
+        if verb == "ask":
+            await self._handle_ask(evt, body)
+        elif verb == "doc":
+            await self._handle_doc(evt, body)
+        elif verb == "wiki":
+            await self._handle_wiki(evt, body)
+
+    async def _handle_ask(self, evt: MessageEvent, body: str) -> None:
+        """Reply with the top search hits for *body*, each with snippet and link."""
+        site = self.config["site_name"]
+        if not body:
+            await evt.reply(f"Usage: `!ask <query>` — search {site}.")
+            return
+        if not self._index:
+            await evt.reply("Wiki index isn't ready yet; try again in a moment.")
+            return
+        hits = self._search(body, self.config["max_results"])
+        if not hits:
+            await evt.reply(f"No matches in {site} for that.")
+            return
+        terms = _tokens(body)
+        snippet_chars = self.config["snippet_chars"]
+        lines = [f"**{site} — {len(hits)} match(es):**"]
+        for _score, slug, doc in hits:
+            url = _make_url(self.config["docs_url"], slug)
+            snip = _snippet(doc["content"], terms, snippet_chars)
+            lines.append(f"- **[{doc['title']}]({url})** — {snip}")
+        await evt.reply("\n".join(lines))
+
+    async def _handle_doc(self, evt: MessageEvent, body: str) -> None:
+        """Reply with a link to, and preview of, the page that *body* names."""
+        if not body:
+            await evt.reply("Usage: `!doc <slug or title>` — open a specific page.")
+            return
+        if not self._index:
+            await evt.reply("Wiki index isn't ready yet; try again in a moment.")
+            return
+        slug, doc = self._lookup(body)
+        if slug is None or doc is None:
+            await evt.reply(
+                f"No page matches `{body}`. Try `!ask {body}` for a fuzzy search."
+            )
+            return
+        url = _make_url(self.config["docs_url"], slug)
+        snippet_chars = self.config["snippet_chars"] * 2
+        preview = " ".join((doc["content"] or "").split())
+        if len(preview) > snippet_chars:
+            preview = preview[:snippet_chars].rstrip() + "…"
+        await evt.reply(f"**[{doc['title']}]({url})**\n{preview}")
+
+    async def _handle_wiki(self, evt: MessageEvent, body: str) -> None:
+        """Handle ``!wiki`` (status) and ``!wiki refresh`` (forced re-index)."""
+        body = body.strip().lower()
+        if body == "refresh":
+            try:
+                await self._refresh()
+            except Exception as e:
+                # Log the traceback; the chat reply only carries the message.
+                self.log.exception("manual wiki refresh failed")
+                await evt.reply(f"Refresh failed: {e}")
+            else:
+                await evt.reply(f"🔄 Refreshed {len(self._index)} docs.")
+            return
+        if body in ("", "status"):
+            age = int(time.time() - self._last_refresh) if self._last_refresh else None
+            age_str = f"{age}s ago" if age is not None else "never"
+            await evt.reply(
+                f"**{self.config['site_name']}** — {len(self._index)} docs, "
+                f"last refresh: {age_str}\n"
+                f"Source: {_make_url(self.config['docs_url'], self.config['index_path'])}\n"
+                f"Use `!ask <query>` or `!doc <slug or title>`."
+            )
+            return
+        await evt.reply("Usage: `!wiki` (status) or `!wiki refresh` (re-index).")