maubot-plugins/wiki/wiki.py
Padreug 8f83d8df5e feat(wiki): docs-lookup plugin against Quartz contentIndex
New maubot plugin that points at any Quartz-rendered docs site and
answers chat queries by full-text searching its emitted
/static/contentIndex.json. Default config targets docs.ariege.io
(castle-docs).

Commands:
  !ask <query>            search corpus; top-N hits with snippet + link
  !doc <slug-or-title>    open a specific page (fuzzy title match)
  !wiki / !wiki refresh   status / force re-index

Architecture:
- Periodic fetch (default 10 min) of /static/contentIndex.json
- In-memory linear-scan scoring per query term: title hit 5 pt, content hit 1 pt + 0.1 per occurrence
- No LLM — pure deterministic keyword search; RAG is future Phase 2b
- No DB — index is upstream-derived cache, repopulates on bot restart

Deployment posture: docs.ariege.io is served from cfaun alongside
maubot, so the bot hits it over the host's internal network — works
during WAN outages. base-config.yaml exposes docs_url + index_path
for adopters pointing at their own site.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-24 16:40:11 +02:00

234 lines
8.4 KiB
Python

import asyncio
import difflib
import re
import time
from typing import Optional
from maubot import MessageEvent, Plugin
from maubot.handlers import command
from mautrix.util.config import BaseProxyConfig, ConfigUpdateHelper
class Config(BaseProxyConfig):
    """Plugin configuration: names the settings carried over on update."""

    def do_update(self, helper: ConfigUpdateHelper) -> None:
        """Call ``helper.copy`` once for every supported setting, in order."""
        for key in (
            "docs_url",
            "index_path",
            "refresh_minutes",
            "max_results",
            "snippet_chars",
            "site_name",
        ):
            helper.copy(key)
# Matches an entire message of the form "!ask …", "!doc …" or "!wiki …".
# Group 1 is the verb; optional group 2 is everything after the first run of
# whitespace. DOTALL lets that argument span several lines.
_CMD_RE = re.compile(
    r"^!(ask|doc|wiki)(?:[ \t\r\n]+(.*))?$",
    re.DOTALL,
)

# A token is a maximal run of letters a-z (case-insensitive) and digits 0-9.
_TOKEN_RE = re.compile(r"[a-z0-9]+", re.IGNORECASE)

# Lowercase words that are dropped from queries before scoring.
_STOPWORDS = frozenset((
    # articles, conjunctions, prepositions
    "a", "an", "and", "as", "at", "by", "for", "from", "in", "of", "on",
    "or", "than", "to", "with",
    # question words and determiners
    "how", "that", "the", "this", "what", "when", "where", "which", "who",
    "why",
    # auxiliary verbs
    "are", "be", "did", "do", "does", "is", "was",
    # pronouns
    "i", "it", "me", "my", "our", "we", "you",
))
def _tokens(s: str) -> list[str]:
    """Split *s* into lowercase search terms, leaving out stopwords."""
    lowered = (raw.lower() for raw in _TOKEN_RE.findall(s))
    return [word for word in lowered if word not in _STOPWORDS]
def _make_url(base: str, slug: str) -> str:
    """Join *base* and *slug* with exactly one slash between them."""
    root = base.rstrip("/")
    path = slug.lstrip("/")
    return root + "/" + path
def _snippet(content: str, terms: list[str], width: int) -> str:
    """Return a one-line excerpt of *content*, at most about *width* chars.

    The excerpt is centred on the earliest position at which any of *terms*
    occurs (case-insensitive substring search). Whole-word occurrences of
    that term inside the excerpt are wrapped in ``**…**``. If no term occurs,
    the head of *content* is returned instead.

    An ellipsis ("…") is added on each side where text was cut off, and
    newlines are flattened to spaces on every path so the result can be
    embedded in a single markdown list item.

    Args:
        content: Full page text; may be empty.
        terms: Lowercase search terms.
        width: Maximum number of content characters in the excerpt.

    Returns:
        The excerpt, or ``""`` when *content* is empty.
    """
    if not content:
        return ""

    lc = content.lower()
    first = -1
    matched_term = None
    for t in terms:
        idx = lc.find(t)
        if idx != -1 and (first == -1 or idx < first):
            first, matched_term = idx, t

    if first == -1:
        # No term in the body (e.g. a title-only hit): show the opening text.
        head = content[:width].strip().replace("\n", " ")
        return head + ("…" if len(content) > width else "")

    start = max(0, first - width // 2)
    end = min(len(content), start + width)
    chunk = content[start:end].strip().replace("\n", " ")
    # Guard against an empty term, which would match at every word boundary.
    if matched_term:
        chunk = re.sub(
            rf"(?i)\b({re.escape(matched_term)})\b",
            r"**\1**",
            chunk,
        )
    prefix = "…" if start > 0 else ""
    suffix = "…" if end < len(content) else ""
    return f"{prefix}{chunk}{suffix}"
class WikiBot(Plugin):
    """Maubot plugin that answers ``!ask`` / ``!doc`` / ``!wiki`` from a docs index.

    The index is a JSON document fetched over HTTP from ``docs_url`` +
    ``index_path``. It is kept in memory only and re-fetched by a background
    task every ``refresh_minutes``.
    """

    # Fallback period when ``refresh_minutes`` is missing or not numeric.
    _DEFAULT_REFRESH_MINUTES = 10.0
    # Lower bound on the sleep between refreshes, so a zero or negative
    # setting cannot turn the loop into a busy loop against the docs server.
    _MIN_REFRESH_SECONDS = 5.0

    config: Config
    # slug -> {"title": str, "content": str, "tags": list}
    _index: dict
    # (slug, lowercased title) pairs used for fuzzy title lookup
    _slug_titles: list[tuple[str, str]]
    # time.time() of the last successful refresh; 0.0 means "never"
    _last_refresh: float
    _refresh_task: Optional[asyncio.Task]

    @classmethod
    def get_config_class(cls):
        """Return the configuration class used by this plugin."""
        return Config

    async def start(self) -> None:
        """Load config, reset the in-memory index and start the refresh task."""
        await super().start()
        self.config.load_and_update()
        self._index = {}
        self._slug_titles = []
        self._last_refresh = 0.0
        self._refresh_task = asyncio.create_task(self._refresh_loop())

    async def stop(self) -> None:
        """Cancel the background refresh task, if one was started."""
        # getattr: stop() must not fail if start() never got far enough to
        # create the task.
        task = getattr(self, "_refresh_task", None)
        if task:
            task.cancel()
            self._refresh_task = None
        await super().stop()

    async def on_external_config_update(self) -> None:
        """Reload the configuration."""
        self.config.load_and_update()

    def _refresh_interval_seconds(self) -> float:
        """Return the sleep time between refreshes, sanitised from config."""
        try:
            minutes = float(self.config["refresh_minutes"])
        except (TypeError, ValueError):
            minutes = self._DEFAULT_REFRESH_MINUTES
        return max(minutes * 60, self._MIN_REFRESH_SECONDS)

    async def _refresh_loop(self) -> None:
        """Refresh the index forever, logging failures and retrying.

        Cancellation propagates naturally: ``except Exception`` does not
        catch ``asyncio.CancelledError``.
        """
        while True:
            try:
                await self._refresh()
            except Exception:
                self.log.exception("wiki refresh failed; will retry")
            await asyncio.sleep(self._refresh_interval_seconds())

    async def _refresh(self) -> None:
        """Fetch the index and atomically replace the in-memory copy.

        Raises:
            ValueError: If the payload is not a JSON object.
            Exception: Whatever the HTTP client raises on transport or
                status errors.
        """
        url = _make_url(self.config["docs_url"], self.config["index_path"])
        async with self.http.get(url) as resp:
            resp.raise_for_status()
            # content_type=None: accept the body whatever MIME type is sent.
            data = await resp.json(content_type=None)

        if not isinstance(data, dict):
            raise ValueError(
                f"unexpected index payload from {url}: {type(data).__name__}"
            )

        new_index = {}
        slug_titles = []
        for slug, entry in data.items():
            if not isinstance(entry, dict):
                # One malformed entry should not abort the whole refresh.
                continue
            title = str(entry.get("title") or "").strip() or slug
            content = entry.get("content") or ""
            if not isinstance(content, str):
                content = str(content)
            new_index[slug] = {
                "title": title,
                "content": content,
                "tags": entry.get("tags") or [],
            }
            slug_titles.append((slug, title.lower()))

        # Swap both structures together (no await in between) so readers
        # never see an index and a title list from different refreshes.
        self._index = new_index
        self._slug_titles = slug_titles
        self._last_refresh = time.time()
        self.log.info("wiki refresh: %d docs from %s", len(new_index), url)

    def _search(self, query: str, limit: int) -> list[tuple[float, str, dict]]:
        """Score every document against *query* and return the best *limit*.

        Per query term: +5 if it occurs in the title, and +1 plus 0.1 per
        occurrence if it occurs in the content (substring matching).

        Returns:
            ``(score, slug, doc)`` tuples, highest score first; empty when
            the query has no usable terms or nothing matches.
        """
        terms = _tokens(query)
        if not terms:
            return []
        hits = []
        for slug, doc in self._index.items():
            title_lc = doc["title"].lower()
            content_lc = doc["content"].lower()
            score = 0.0
            for t in terms:
                if t in title_lc:
                    score += 5.0
                # A single count() doubles as the membership test.
                occurrences = content_lc.count(t)
                if occurrences:
                    score += 1.0 + 0.1 * occurrences
            if score > 0:
                hits.append((score, slug, doc))
        hits.sort(key=lambda x: x[0], reverse=True)
        return hits[:limit]

    def _lookup(self, query: str) -> tuple[Optional[str], Optional[dict]]:
        """Resolve *query* to a single page.

        Tries, in order: exact slug, case-insensitive slug or last path
        segment(s) of a slug, then the closest title by ``difflib``
        (cutoff 0.6).

        Returns:
            ``(slug, doc)`` on success, ``(None, None)`` otherwise.
        """
        q = query.strip().lower()
        if not q:
            return None, None
        if q in self._index:
            return q, self._index[q]

        tail = "/" + q
        for slug in self._index:
            slug_lc = slug.lower()
            if slug_lc == q or slug_lc.endswith(tail):
                return slug, self._index[slug]

        candidates = [t for _, t in self._slug_titles]
        match = difflib.get_close_matches(q, candidates, n=1, cutoff=0.6)
        if match:
            for slug, title in self._slug_titles:
                if title == match[0]:
                    return slug, self._index[slug]
        return None, None

    @command.passive(regex=_CMD_RE)
    async def dispatch(self, evt: MessageEvent, match) -> None:
        """Route a matched ``!ask`` / ``!doc`` / ``!wiki`` message to its handler."""
        verb = match[1].lower()
        body = (match[2] or "").strip()
        if verb == "ask":
            await self._handle_ask(evt, body)
        elif verb == "doc":
            await self._handle_doc(evt, body)
        elif verb == "wiki":
            await self._handle_wiki(evt, body)

    async def _handle_ask(self, evt: MessageEvent, body: str) -> None:
        """Reply with the top search hits for *body*, one bullet per page."""
        if not body:
            await evt.reply(
                f"Usage: `!ask <question>` — search {self.config['site_name']}."
            )
            return
        if not self._index:
            await evt.reply("Wiki index isn't ready yet; try again in a moment.")
            return
        hits = self._search(body, self.config["max_results"])
        if not hits:
            await evt.reply(f"No matches in {self.config['site_name']} for that.")
            return
        terms = _tokens(body)
        snippet_chars = self.config["snippet_chars"]
        # Separator matches the one used in the `!wiki` status line.
        lines = [f"**{self.config['site_name']} — {len(hits)} match(es):**"]
        for _score, slug, doc in hits:
            url = _make_url(self.config["docs_url"], slug)
            snip = _snippet(doc["content"], terms, snippet_chars)
            lines.append(f"- **[{doc['title']}]({url})** — {snip}")
        await evt.reply("\n".join(lines))

    async def _handle_doc(self, evt: MessageEvent, body: str) -> None:
        """Reply with a link to, and preview of, the page matching *body*."""
        if not body:
            await evt.reply("Usage: `!doc <slug-or-title>` — open a specific page.")
            return
        if not self._index:
            await evt.reply("Wiki index isn't ready yet; try again in a moment.")
            return
        slug, doc = self._lookup(body)
        if slug is None or doc is None:
            await evt.reply(
                f"No page matches `{body}`. Try `!ask {body}` for a fuzzy search."
            )
            return
        url = _make_url(self.config["docs_url"], slug)
        # A single-page preview gets twice the room of a search snippet.
        snippet_chars = self.config["snippet_chars"] * 2
        preview = (doc["content"] or "").strip().replace("\n", " ")
        if len(preview) > snippet_chars:
            # Mark the cut so readers can tell the preview is truncated.
            preview = preview[:snippet_chars].rstrip() + "…"
        await evt.reply(f"**[{doc['title']}]({url})**\n{preview}")

    async def _handle_wiki(self, evt: MessageEvent, body: str) -> None:
        """Handle ``!wiki`` (status) and ``!wiki refresh`` (forced re-index)."""
        body = body.strip().lower()
        if body == "refresh":
            try:
                await self._refresh()
            except Exception as e:
                # Keep the traceback in the log; chat only gets the summary.
                self.log.exception("manual wiki refresh failed")
                await evt.reply(f"Refresh failed: {e}")
            else:
                await evt.reply(f"🔄 Refreshed {len(self._index)} docs.")
            return
        if body in ("", "status"):
            age = int(time.time() - self._last_refresh) if self._last_refresh else None
            age_str = f"{age}s ago" if age is not None else "never"
            await evt.reply(
                f"**{self.config['site_name']}** — {len(self._index)} docs, "
                f"last refresh: {age_str}\n"
                f"Source: {_make_url(self.config['docs_url'], self.config['index_path'])}\n"
                f"Use `!ask <query>` or `!doc <slug-or-title>`."
            )
            return
        await evt.reply("Usage: `!wiki` (status) or `!wiki refresh` (re-index).")