feat(wiki): docs-lookup plugin against Quartz contentIndex
New maubot plugin that points at any Quartz-rendered docs site and answers
chat queries by full-text searching its emitted /static/contentIndex.json.
Default config targets docs.ariege.io (castle-docs).

Commands:
  !ask <query>            search corpus; top-N hits with snippet + link
  !doc <slug-or-title>    open a specific page (fuzzy title match)
  !wiki / !wiki refresh   status / force re-index

Architecture:
- Periodic fetch (default 10 min) of /static/contentIndex.json
- In-memory inverted-ish scoring: title hit 5pt, content hit 1pt + 0.1 per occurrence
- No LLM — pure deterministic keyword search; RAG is future Phase 2b
- No DB — index is upstream-derived cache, repopulates on bot restart

Deployment posture: docs.ariege.io is served from cfaun alongside maubot, so
the bot hits it over the host's internal network — works during WAN outages.
base-config.yaml exposes docs_url + index_path for adopters pointing at
their own site.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
b7a096a77a
commit
8f83d8df5e
5 changed files with 382 additions and 0 deletions
234
wiki/wiki.py
Normal file
234
wiki/wiki.py
Normal file
|
|
@ -0,0 +1,234 @@
|
|||
import asyncio
|
||||
import difflib
|
||||
import re
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
from maubot import MessageEvent, Plugin
|
||||
from maubot.handlers import command
|
||||
from mautrix.util.config import BaseProxyConfig, ConfigUpdateHelper
|
||||
|
||||
|
||||
class Config(BaseProxyConfig):
    """Configuration proxy for the wiki plugin."""

    def do_update(self, helper: ConfigUpdateHelper) -> None:
        """Copy every key this plugin reads, in a fixed order, via *helper*."""
        # docs_url + index_path are joined to form the index URL;
        # refresh_minutes drives the background refresh sleep;
        # max_results / snippet_chars / site_name shape the chat replies.
        copied_keys = (
            "docs_url",
            "index_path",
            "refresh_minutes",
            "max_results",
            "snippet_chars",
            "site_name",
        )
        for key in copied_keys:
            helper.copy(key)
|
||||
|
||||
|
||||
# Chat command matcher: group 1 is the verb (ask/doc/wiki), group 2 is the
# optional remainder after at least one whitespace character.  DOTALL lets the
# remainder span multiple lines.
_CMD_RE = re.compile(r"^!(ask|doc|wiki)(?:[ \t\r\n]+(.*))?$", re.DOTALL)

# A token is a maximal run of ASCII letters and digits, matched in any case.
_TOKEN_RE = re.compile(r"[a-z0-9]+", re.IGNORECASE)

# Words dropped from queries before scoring.
_STOPWORDS = frozenset(
    (
        "a", "an", "and", "as", "at", "be", "by",
        "for", "from", "how", "i", "in", "is", "it",
        "of", "on", "or", "than", "that", "the", "this",
        "to", "was", "what", "when", "where", "which", "who",
        "why", "with", "do", "does", "did", "are", "we",
        "you", "our", "my", "me",
    )
)
|
||||
|
||||
|
||||
def _tokens(s: str) -> list[str]:
    """Return the lower-cased tokens of *s*, in order, with stopwords removed."""
    lowered = (word.lower() for word in _TOKEN_RE.findall(s))
    return [word for word in lowered if word not in _STOPWORDS]
|
||||
|
||||
|
||||
def _make_url(base: str, slug: str) -> str:
    """Join *base* and *slug* with exactly one ``/`` at the seam.

    Trailing slashes on *base* and leading slashes on *slug* are dropped
    before joining; nothing else is altered.
    """
    left = base.rstrip("/")
    right = slug.lstrip("/")
    return "/".join((left, right))
|
||||
|
||||
|
||||
def _snippet(content: str, terms: list[str], width: int) -> str:
    """Return a single-line excerpt of *content*, about *width* characters long.

    The excerpt is centred on the earliest occurrence of any of *terms*.
    Terms are searched in the lower-cased content, so they should already be
    lower case. That earliest term is wrapped in ``**`` wherever it appears
    as a whole word inside the excerpt. If no term occurs, the head of the
    content is returned instead.

    Newlines are flattened to spaces in both cases, because callers embed the
    result in a one-line markdown bullet. An ellipsis marks each side where
    text was cut off.

    Args:
        content: Full text to excerpt; empty content yields ``""``.
        terms: Search terms to locate and highlight.
        width: Maximum number of content characters in the excerpt.

    Returns:
        The excerpt, possibly with ``…`` at either end.
    """
    if not content:
        return ""

    lc = content.lower()
    first = -1
    matched_term = None
    for t in terms:
        idx = lc.find(t)
        if idx != -1 and (first == -1 or idx < first):
            first, matched_term = idx, t

    if first == -1:
        # No term found: fall back to the head of the document, flattened the
        # same way as a matched excerpt so it cannot break the list item.
        head = content[:width].strip().replace("\n", " ")
        return head + ("…" if len(content) > width else "")

    # Put the match roughly in the middle of the window.
    start = max(0, first - width // 2)
    end = min(len(content), start + width)
    chunk = content[start:end].strip().replace("\n", " ")
    if matched_term:
        # Whole-word, case-insensitive highlight; a hit that is only part of a
        # longer word locates the window but is not bolded.
        chunk = re.sub(
            rf"(?i)\b({re.escape(matched_term)})\b",
            r"**\1**",
            chunk,
        )
    prefix = "…" if start > 0 else ""
    suffix = "…" if end < len(content) else ""
    return f"{prefix}{chunk}{suffix}"
|
||||
|
||||
|
||||
class WikiBot(Plugin):
    """Chat bot that answers ``!ask``, ``!doc`` and ``!wiki`` from a docs index.

    The index is downloaded from ``docs_url`` + ``index_path`` as JSON, held
    only in memory, and re-fetched by a background task every
    ``refresh_minutes`` minutes. Searching is plain substring scoring over
    the page titles and contents.
    """

    # Refresh period (minutes) used when `refresh_minutes` is missing,
    # non-numeric, or not strictly positive.
    _DEFAULT_REFRESH_MINUTES = 10.0

    config: Config
    # slug -> {"title": str, "content": str, "tags": ...}
    _index: dict
    # (slug, lower-cased title) pairs, used for fuzzy title lookup.
    _slug_titles: list[tuple[str, str]]
    # time.time() of the last successful refresh; 0.0 means "never".
    _last_refresh: float = 0.0
    # Background refresh task; None until start() has created it, so stop()
    # is safe even when start() failed part-way.
    _refresh_task: Optional[asyncio.Task] = None

    @classmethod
    def get_config_class(cls):
        """Return the config class for this plugin."""
        return Config

    async def start(self) -> None:
        """Load the config, reset the in-memory index and start refreshing."""
        await super().start()
        self.config.load_and_update()
        self._index = {}
        self._slug_titles = []
        self._last_refresh = 0.0
        self._refresh_task = asyncio.create_task(self._refresh_loop())

    async def stop(self) -> None:
        """Cancel the background refresh task, if one exists."""
        task, self._refresh_task = self._refresh_task, None
        if task is not None:
            task.cancel()
        await super().stop()

    async def on_external_config_update(self) -> None:
        """Reload the config after it was changed externally."""
        self.config.load_and_update()

    def _refresh_interval_seconds(self) -> float:
        """Return the sleep between refreshes, in seconds.

        Falls back to ``_DEFAULT_REFRESH_MINUTES`` when the configured value
        is unusable, so a bad config can neither kill the refresh loop nor
        turn it into a busy loop against the docs server.
        """
        try:
            minutes = float(self.config["refresh_minutes"])
        except (KeyError, TypeError, ValueError):
            minutes = 0.0
        if not minutes > 0:  # also rejects NaN
            self.log.warning(
                "invalid refresh_minutes; falling back to %s minutes",
                self._DEFAULT_REFRESH_MINUTES,
            )
            minutes = self._DEFAULT_REFRESH_MINUTES
        return minutes * 60

    async def _refresh_loop(self) -> None:
        """Refresh the index forever, logging failures and retrying later.

        Cancellation is not caught here: ``asyncio.CancelledError`` is not an
        ``Exception`` subclass, so it propagates and ends the task.
        """
        while True:
            try:
                await self._refresh()
            except Exception:
                self.log.exception("wiki refresh failed; will retry")
            await asyncio.sleep(self._refresh_interval_seconds())

    async def _refresh(self) -> None:
        """Download the index and atomically swap it in.

        The previous index stays in place if the download or parsing fails.
        Entries that are not JSON objects are skipped rather than aborting
        the whole refresh.

        Raises:
            ValueError: If the response body is not a JSON object.
            Exception: Whatever the HTTP request or JSON decoding raises.
        """
        url = _make_url(self.config["docs_url"], self.config["index_path"])
        async with self.http.get(url) as resp:
            resp.raise_for_status()
            # content_type=None: accept the body whatever Content-Type is sent.
            data = await resp.json(content_type=None)
        if not isinstance(data, dict):
            raise ValueError(
                f"expected a JSON object at {url}, got {type(data).__name__}"
            )

        new_index = {}
        slug_titles = []
        skipped = 0
        for slug, entry in data.items():
            if not isinstance(entry, dict):
                skipped += 1
                continue
            # A missing, empty or whitespace-only title falls back to the slug.
            title = str(entry.get("title") or "").strip() or slug
            content = entry.get("content") or ""
            if not isinstance(content, str):
                content = str(content)
            new_index[slug] = {
                "title": title,
                "content": content,
                "tags": entry.get("tags") or [],
            }
            slug_titles.append((slug, title.lower()))

        if skipped:
            self.log.warning("wiki refresh: skipped %d malformed entries", skipped)
        self._index = new_index
        self._slug_titles = slug_titles
        self._last_refresh = time.time()
        self.log.info("wiki refresh: %d docs from %s", len(new_index), url)

    def _search(self, query: str, limit: int) -> list[tuple[float, str, dict]]:
        """Return up to *limit* ``(score, slug, doc)`` hits, best first.

        Each query token adds 5 points when it is a substring of the title,
        and 1 point plus 0.1 per occurrence when it is a substring of the
        content. Documents scoring zero are omitted. A query with no usable
        tokens returns an empty list.
        """
        terms = _tokens(query)
        if not terms:
            return []
        hits = []
        for slug, doc in self._index.items():
            title_lc = doc["title"].lower()
            content_lc = doc["content"].lower()
            score = 0.0
            for t in terms:
                if t in title_lc:
                    score += 5.0
                if t in content_lc:
                    score += 1.0 + 0.1 * content_lc.count(t)
            if score > 0:
                hits.append((score, slug, doc))
        # Stable sort: equal scores keep index order.
        hits.sort(key=lambda hit: hit[0], reverse=True)
        return hits[:limit]

    def _lookup(self, query: str) -> tuple[Optional[str], Optional[dict]]:
        """Resolve *query* to a single page, returning ``(slug, doc)``.

        Tried in order: exact slug, case-insensitive slug or trailing path
        segment, then the closest title by ``difflib`` (cutoff 0.6).
        Returns ``(None, None)`` when nothing matches or the query is blank.
        """
        q = query.strip().lower()
        if not q:
            return None, None
        if q in self._index:
            return q, self._index[q]
        suffix = "/" + q
        for slug in self._index:
            slug_lc = slug.lower()
            if slug_lc == q or slug_lc.endswith(suffix):
                return slug, self._index[slug]
        candidates = [title for _, title in self._slug_titles]
        match = difflib.get_close_matches(q, candidates, n=1, cutoff=0.6)
        if match:
            for slug, title in self._slug_titles:
                if title == match[0]:
                    return slug, self._index[slug]
        return None, None

    @command.passive(regex=_CMD_RE)
    async def dispatch(self, evt: MessageEvent, match) -> None:
        """Route a matched chat command to its handler.

        ``match[1]`` is the verb and ``match[2]`` the optional remainder of
        the message, as captured by ``_CMD_RE``.
        """
        verb = match[1].lower()
        body = (match[2] or "").strip()
        handlers = {
            "ask": self._handle_ask,
            "doc": self._handle_doc,
            "wiki": self._handle_wiki,
        }
        handler = handlers.get(verb)
        if handler is not None:
            await handler(evt, body)

    async def _handle_ask(self, evt: MessageEvent, body: str) -> None:
        """Reply to ``!ask`` with the top search hits, each with a snippet."""
        if not body:
            await evt.reply(
                f"Usage: `!ask <question>` — search {self.config['site_name']}."
            )
            return
        if not self._index:
            await evt.reply("Wiki index isn't ready yet; try again in a moment.")
            return
        hits = self._search(body, self.config["max_results"])
        if not hits:
            await evt.reply(f"No matches in {self.config['site_name']} for that.")
            return
        terms = _tokens(body)
        snippet_chars = self.config["snippet_chars"]
        lines = [f"**{self.config['site_name']} — {len(hits)} match(es):**"]
        for _score, slug, doc in hits:
            url = _make_url(self.config["docs_url"], slug)
            snip = _snippet(doc["content"], terms, snippet_chars)
            lines.append(f"- **[{doc['title']}]({url})** — {snip}")
        await evt.reply("\n".join(lines))

    async def _handle_doc(self, evt: MessageEvent, body: str) -> None:
        """Reply to ``!doc`` with a link to one page and a preview of it.

        The preview is the start of the page, up to twice ``snippet_chars``.
        """
        if not body:
            await evt.reply("Usage: `!doc <slug-or-title>` — open a specific page.")
            return
        if not self._index:
            await evt.reply("Wiki index isn't ready yet; try again in a moment.")
            return
        slug, doc = self._lookup(body)
        if slug is None or doc is None:
            await evt.reply(
                f"No page matches `{body}`. Try `!ask {body}` for a fuzzy search."
            )
            return
        url = _make_url(self.config["docs_url"], slug)
        snippet_chars = self.config["snippet_chars"] * 2
        preview = (doc["content"] or "").strip().replace("\n", " ")
        if len(preview) > snippet_chars:
            preview = preview[:snippet_chars].rstrip() + "…"
        await evt.reply(f"**[{doc['title']}]({url})**\n{preview}")

    async def _handle_wiki(self, evt: MessageEvent, body: str) -> None:
        """Reply to ``!wiki``: show status, or re-index on ``!wiki refresh``."""
        body = body.strip().lower()
        if body == "refresh":
            # Only the refresh itself is guarded, so a failure to deliver the
            # success message is not misreported as a refresh failure.
            try:
                await self._refresh()
            except Exception as e:
                self.log.exception("manual wiki refresh failed")
                await evt.reply(f"Refresh failed: {e}")
            else:
                await evt.reply(f"🔄 Refreshed {len(self._index)} docs.")
            return
        if body in ("", "status"):
            age = int(time.time() - self._last_refresh) if self._last_refresh else None
            age_str = f"{age}s ago" if age is not None else "never"
            await evt.reply(
                f"**{self.config['site_name']}** — {len(self._index)} docs, "
                f"last refresh: {age_str}\n"
                f"Source: {_make_url(self.config['docs_url'], self.config['index_path'])}\n"
                f"Use `!ask <query>` or `!doc <slug-or-title>`."
            )
            return
        await evt.reply("Usage: `!wiki` (status) or `!wiki refresh` (re-index).")
|
||||
Loading…
Add table
Add a link
Reference in a new issue