Complete project rework into mirage client/server without web

This commit is contained in:
2025-12-02 05:00:12 -05:00
parent 435f91539b
commit eeeae4740b
29 changed files with 1172 additions and 2059 deletions

5
.gitignore vendored
View File

@@ -1,8 +1,3 @@
index.html*
/mirrors
/node_modules
/data
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[codz]

View File

@@ -1,29 +0,0 @@
#!/usr/bin/env bash
# Register a URL in the mirror list (unless it is already there) and then
# run the mirror update for its slug.
#
# Usage: <script> URL [slug]
#   URL   source URL to mirror
#   slug  optional local name; derived from the URL when omitted
set -euo pipefail

BASE="/srv/www"
URL_LIST="$BASE/mirrors.txt"

if [ $# -lt 1 ]; then
    # Usage errors belong on stderr so they never pollute captured output.
    echo "Usage: $0 URL [slug]" >&2
    exit 1
fi

url="$1"
if [ $# -ge 2 ]; then
    slug="$2"
else
    # crude slugify: strip scheme, replace non alnum with underscores
    slug="$(echo "$url" | sed 's#https\?://##; s#[^a-zA-Z0-9._-]#_#g')"
fi

# Return 0 when some line of the list ends with " <url>".
# The comparison is literal: URLs routinely contain regex metacharacters
# (".", "?", "[", ...) which must not be interpreted as a pattern.
url_already_listed() {
    local line
    [ -f "$URL_LIST" ] || return 1
    # "|| [ -n ... ]" keeps a final line that lacks a trailing newline.
    while IFS= read -r line || [ -n "$line" ]; do
        if [[ "$line" == *" $url" ]]; then
            return 0
        fi
    done < "$URL_LIST"
    return 1
}

# Check if URL already exists
if url_already_listed; then
    echo "URL already in list. Not adding again."
else
    echo "$slug $url" >> "$URL_LIST"
    echo "Added: $slug $url"
fi

# Run update for just this slug
"$BASE/update_mirrors.sh" "$slug"

639
app.py
View File

@@ -1,639 +0,0 @@
#!/usr/bin/env python3
from mirror_manager import (
load_mirrors,
add_mirror,
update_mirror,
MIRROR_ROOT,
LOG_ROOT,
)
import re
import html
import subprocess
import threading
from pathlib import Path
from flask import (
Flask,
request,
redirect,
url_for,
jsonify,
send_from_directory,
render_template_string
)
BASE = Path("/srv/www")
STATIC_DIR = BASE / "static"
STATIC_DIR.mkdir(exist_ok=True)
app = Flask(__name__)
def _run_update_in_background(slug: str):
    """Start ``update_mirror(slug)`` on a daemon thread and return immediately."""
    threading.Thread(
        target=update_mirror,
        args=(slug,),
        daemon=True,
    ).start()
# -------------------- TEMPLATES --------------------
INDEX_TEMPLATE = r"""
<!doctype html>
<html class="h-full">
<head>
<meta charset="utf-8">
<title>Mirror Manager</title>
<link rel="stylesheet" href="{{ url_for('static_file', filename='tailwind.css') }}">
</head>
<body class="h-full bg-slate-950 text-slate-100">
<div class="min-h-full">
<header class="border-b border-slate-800 bg-slate-950/80 backdrop-blur">
<div class="max-w-5xl mx-auto px-4 py-4 flex flex-col sm:flex-row sm:items-center sm:justify-between gap-2">
<div>
<h1 class="text-xl font-semibold tracking-tight">Mirror Manager</h1>
<p class="text-xs text-slate-400">Local offline mirrors of external sites, grouped by category.</p>
</div>
<div class="flex items-center gap-2 text-xs text-slate-400">
<span class="inline-flex items-center gap-1 px-2 py-1 rounded-full border border-slate-700 bg-slate-900/70">
<span class="w-2 h-2 rounded-full bg-emerald-400"></span>
Running locally
</span>
</div>
</div>
</header>
<main class="max-w-5xl mx-auto px-4 py-4 space-y-4">
<!-- Mirrors list -->
<section class="bg-slate-950/80 border border-slate-800 rounded-2xl p-4 shadow-xl shadow-black/40">
<div class="flex flex-col md:flex-row md:items-center md:justify-between gap-3 mb-3">
<div class="flex flex-wrap items-center gap-2">
<span class="text-xs text-slate-400">Categories:</span>
<button class="px-2.5 py-1 rounded-full text-xs border bg-slate-900 border-slate-700 text-slate-100 hover:border-sky-500 cat-pill cat-pill-active" data-category="all">
All ({{ mirrors|length }})
</button>
{% for cat in categories %}
<button class="px-2.5 py-1 rounded-full text-xs border bg-slate-900 border-slate-800 text-slate-400 hover:border-sky-500 hover:text-slate-100 cat-pill" data-category="{{ cat }}">
{{ cat }}
</button>
{% endfor %}
</div>
<div class="flex gap-2">
<input
id="search"
class="w-full md:w-64 rounded-full bg-slate-900 border border-slate-700 px-3 py-1.5 text-sm text-slate-100 placeholder:text-slate-500 focus:outline-none focus:ring-2 focus:ring-sky-500 focus:border-sky-500"
placeholder="Filter by slug / URL / category…"
/>
</div>
</div>
<div class="overflow-x-auto border border-slate-800 rounded-xl">
<table class="min-w-full text-sm">
<thead class="bg-slate-900/70 text-xs uppercase text-slate-400">
<tr>
<th class="px-3 py-2 text-left whitespace-nowrap">Slug</th>
<th class="px-3 py-2 text-left whitespace-nowrap">Categories</th>
<th class="px-3 py-2 text-left whitespace-nowrap">URL</th>
<th class="px-3 py-2 text-left whitespace-nowrap">Last updated</th>
<th class="px-3 py-2 text-left whitespace-nowrap">Status</th>
<th class="px-3 py-2 text-left"></th>
</tr>
</thead>
<tbody id="mirror-table" class="divide-y divide-slate-900/80">
{% for m in mirrors %}
<tr class="hover:bg-slate-900/80 transition" data-slug="{{ m.slug }}" data-categories="{{ m.categories_joined }}" data-search="{{ (m.slug ~ ' ' ~ m.categories_joined ~ ' ' ~ m.url)|lower }}">
<td class="px-3 py-2 align-top">
<div class="flex flex-col gap-1">
<a href="/mirrors/{{ m.slug }}/" target="_blank" class="font-mono text-xs text-sky-400 hover:text-sky-300 break-all">
{{ m.slug }}
</a>
<a href="{{ url_for('log_view', slug=m.slug) }}" target="_blank" class="text-[0.65rem] text-slate-400 hover:text-slate-200">
View live log
</a>
</div>
</td>
<td class="px-3 py-2 align-top">
<div class="flex flex-wrap gap-1">
{% for c in m.categories %}
<span class="px-1.5 py-0.5 rounded-full text-[0.65rem] bg-slate-800/80 text-slate-300 border border-slate-700">{{ c }}</span>
{% endfor %}
</div>
</td>
<td class="px-3 py-2 align-top max-w-xs">
<code class="font-mono text-[0.7rem] text-slate-300 break-all">{{ m.url }}</code>
</td>
<td class="px-3 py-2 align-top text-xs text-slate-300">
{% if m.last_updated %}
<span title="{{ m.last_updated_raw }}">{{ m.last_updated }}</span>
{% else %}
<span class="text-slate-600">never</span>
{% endif %}
</td>
<td class="px-3 py-2 align-top text-xs">
{% set st = m.status or 'idle' %}
<div class="inline-flex items-center gap-1.5 px-2 py-0.5 rounded-full bg-slate-900 border border-slate-800">
<span class="w-2 h-2 rounded-full
{% if st == 'idle' %}bg-emerald-400{% elif st == 'updating' %}bg-amber-400 animate-pulse{% elif st == 'warning' %}bg-yellow-400{% else %}bg-rose-400{% endif %}"></span>
<span class="capitalize">{{ st }}</span>
</div>
</td>
<td class="px-3 py-2 align-top text-right text-[0.7rem]">
<form method="post" action="{{ url_for('trigger_update', slug=m.slug) }}" class="inline">
<button class="inline-flex items-center gap-1 px-2 py-1 rounded-full border border-slate-700 text-slate-200 hover:border-sky-500 hover:text-sky-100">
<span>Update</span>
</button>
</form>
</td>
</tr>
{% endfor %}
{% if mirrors|length == 0 %}
<tr>
<td colspan="6" class="px-3 py-6 text-center text-sm text-slate-500">
No mirrors yet. Add one below.
</td>
</tr>
{% endif %}
</tbody>
</table>
</div>
</section>
<!-- Add mirror -->
<section class="bg-slate-950/80 border border-slate-800 rounded-2xl p-4 shadow-xl shadow-black/40 space-y-3">
<h2 class="text-sm font-semibold">Add mirror</h2>
<form method="post" action="{{ url_for('add_mirror_route') }}" class="space-y-3">
<div>
<label for="slug" class="block text-xs font-medium text-slate-300 mb-1">Slug</label>
<input id="slug" name="slug" required class="w-full rounded-lg bg-slate-900 border border-slate-700 px-2.5 py-1.5 text-sm text-slate-100 focus:outline-none focus:ring-2 focus:ring-sky-500 focus:border-sky-500 font-mono" placeholder="e.g. wgpu-tutorial" />
</div>
<div>
<label for="categories" class="block text-xs font-medium text-slate-300 mb-1">Categories</label>
<input id="categories" name="categories" required class="w-full rounded-lg bg-slate-900 border border-slate-700 px-2.5 py-1.5 text-sm text-slate-100 focus:outline-none focus:ring-2 focus:ring-sky-500 focus:border-sky-500" placeholder="e.g. tutorials, graphics, rust" />
</div>
<div>
<label for="url" class="block text-xs font-medium text-slate-300 mb-1">URL</label>
<input id="url" name="url" required class="w-full rounded-lg bg-slate-900 border border-slate-700 px-2.5 py-1.5 text-sm text-slate-100 focus:outline-none focus:ring-2 focus:ring-sky-500 focus:border-sky-500" placeholder="https://example.com/some/path/" />
</div>
<div class="flex items-start gap-2">
<input id="ignore_robots" name="ignore_robots" value="1" type="checkbox" class="mt-0.5 rounded border-slate-600 bg-slate-900 text-sky-500 focus:ring-sky-500" />
<label for="ignore_robots" class="text-xs text-slate-400">
Ignore robots.txt (only if you explicitly want to archive disallowed paths).
</label>
</div>
{% if error %}
<p class="text-xs text-rose-300 bg-rose-950/60 border border-rose-900 rounded-lg px-2 py-1">{{ error }}</p>
{% endif %}
<button type="submit" class="w-full inline-flex items-center justify-center gap-1.5 rounded-full bg-gradient-to-r from-sky-500 to-indigo-500 px-3 py-2 text-xs font-medium text-white hover:from-sky-400 hover:to-indigo-400">
Add &amp; mirror
</button>
<p class="text-[0.7rem] text-slate-500">
New mirrors are cloned in the background. Status will show as <span class="text-amber-300">updating</span> until done.
</p>
</form>
</section>
<!-- Content search -->
<section class="bg-slate-950/80 border border-slate-800 rounded-2xl p-4 shadow-xl shadow-black/40">
<h2 class="text-sm font-semibold mb-2">Content search</h2>
<form id="search-form" class="space-y-2">
<input id="content-query" class="w-full rounded-lg bg-slate-900 border border-slate-700 px-2.5 py-1.5 text-sm text-slate-100 focus:outline-none focus:ring-2 focus:ring-sky-500 focus:border-sky-500" placeholder="Search text across all mirrors (using rg)…" />
<button type="submit" class="w-full inline-flex items-center justify-center gap-1.5 rounded-full border border-slate-700 bg-slate-900 px-3 py-2 text-xs font-medium text-slate-100 hover:border-sky-500 hover:text-sky-100">
Run ripgrep search
</button>
</form>
<div id="search-results" class="mt-2 max-h-64 overflow-y-auto text-[0.7rem] space-y-1 text-slate-300"></div>
</section>
</main>
</div>
<script>
// Category + name filter
const pills = Array.from(document.querySelectorAll('.cat-pill'));
const rows = Array.from(document.querySelectorAll('#mirror-table tr[data-slug]'));
const searchInput = document.getElementById('search');
function applyFilters() {
  // Show only the rows that match both the active category pill and the
  // free-text filter; everything else is hidden via display:none.
  const activePill = pills.find(p => p.classList.contains('cat-pill-active'));
  const wantedCategory = activePill ? activePill.dataset.category : 'all';
  const needle = (searchInput.value || '').toLowerCase();
  for (const row of rows) {
    const rowCategories = row.dataset.categories.split(',').map(s => s.trim());
    const categoryOk = wantedCategory === 'all' || rowCategories.includes(wantedCategory);
    const textOk = !needle || row.dataset.search.includes(needle);
    row.style.display = (categoryOk && textOk) ? '' : 'none';
  }
}
pills.forEach(p => {
p.addEventListener('click', () => {
pills.forEach(x => x.classList.remove('cat-pill-active', 'border-sky-500', 'text-slate-100'));
p.classList.add('cat-pill-active', 'border-sky-500', 'text-slate-100');
applyFilters();
});
});
searchInput.addEventListener('input', applyFilters);
// Live status polling
async function pollStatus() {
  // Refresh the "Last updated" and "Status" cells of every row from the JSON
  // status endpoint. Cells are rebuilt with DOM nodes and textContent so that
  // strings coming from the server are never parsed as HTML.
  try {
    const resp = await fetch("{{ url_for('status') }}");
    if (!resp.ok) return;
    const data = await resp.json();
    // A Map avoids collisions between slugs and Object.prototype keys.
    const bySlug = new Map();
    data.mirrors.forEach(m => bySlug.set(m.slug, m));
    rows.forEach(row => {
      const m = bySlug.get(row.dataset.slug);
      if (!m) return;
      const tds = row.querySelectorAll('td');
      const lastCell = tds[3];
      const statusCell = tds[4];

      if (m.last_updated_display) {
        lastCell.textContent = m.last_updated_display;
      } else {
        const never = document.createElement('span');
        never.className = 'text-slate-600';
        never.textContent = 'never';
        lastCell.replaceChildren(never);
      }

      const st = m.status || 'idle';
      const badge = document.createElement('div');
      badge.className = 'inline-flex items-center gap-1.5 px-2 py-0.5 rounded-full bg-slate-900 border border-slate-800';
      const dot = document.createElement('span');
      dot.className = 'w-2 h-2 rounded-full ' +
        (st === "idle" ? "bg-emerald-400" :
         st === "updating" ? "bg-amber-400 animate-pulse" :
         st === "warning" ? "bg-yellow-400" : "bg-rose-400");
      const label = document.createElement('span');
      label.className = 'capitalize';
      label.textContent = st;
      badge.appendChild(dot);
      badge.appendChild(label);
      statusCell.replaceChildren(badge);
    });
  } catch (e) {
    // Best effort: a failed poll is simply retried on the next interval tick.
  }
}
setInterval(pollStatus, 5000);
// Content search via rg
const searchForm = document.getElementById('search-form');
const contentQuery = document.getElementById('content-query');
const searchResults = document.getElementById('search-results');
searchForm.addEventListener('submit', async (e) => {
e.preventDefault();
const q = contentQuery.value.trim();
if (!q) return;
searchResults.textContent = 'Searching…';
try {
const resp = await fetch("{{ url_for('content_search') }}?q=" + encodeURIComponent(q));
if (!resp.ok) {
searchResults.textContent = 'Search failed.';
return;
}
const data = await resp.json();
if (data.results.length === 0) {
searchResults.textContent = 'No matches.';
return;
}
searchResults.innerHTML = '';
data.results.forEach(r => {
const wrapper = document.createElement('div');
wrapper.className = "border border-slate-800 rounded-lg px-2 py-1 bg-slate-900/70";
const pathLine = document.createElement('div');
pathLine.className = "font-mono text-[0.65rem] text-sky-300 break-all";
if (r.url) {
const link = document.createElement('a');
link.href = r.url;
link.target = "_blank";
link.rel = "noopener noreferrer";
link.textContent = r.path + (r.line ? `:${r.line}` : "");
pathLine.appendChild(link);
} else {
pathLine.textContent = r.path + (r.line ? `:${r.line}` : "");
}
const snippetLine = document.createElement('div');
snippetLine.className = "text-[0.7rem] text-slate-200 whitespace-pre-wrap";
snippetLine.textContent = r.snippet || "";
wrapper.appendChild(pathLine);
wrapper.appendChild(snippetLine);
searchResults.appendChild(wrapper);
});
} catch (e) {
searchResults.textContent = 'Search failed.';
}
});
</script>
</body>
</html>
"""
LOG_TEMPLATE = r"""
<!doctype html>
<html class="h-full">
<head>
<meta charset="utf-8">
<title>Log: {{ slug }}</title>
<link rel="stylesheet" href="{{ url_for('static_file', filename='tailwind.css') }}">
</head>
<body class="h-full bg-slate-950 text-slate-100">
<div class="max-w-5xl mx-auto px-4 py-4 space-y-2">
<div class="flex items-center justify-between mb-2">
<div>
<h1 class="text-sm font-semibold">Log for <span class="font-mono text-sky-400">{{ slug }}</span></h1>
<p class="text-[0.65rem] text-slate-400">Live tail of wget output (auto-refreshing).</p>
</div>
<a href="/mirrors/{{ slug }}/" target="_blank" class="text-xs text-sky-400 hover:text-sky-200">Open mirror</a>
</div>
<div class="border border-slate-800 rounded-xl bg-slate-950/90 max-h-[75vh] overflow-y-auto">
<pre id="log" class="text-[0.65rem] p-3 font-mono whitespace-pre-wrap"></pre>
</div>
</div>
<script>
const logEl = document.getElementById('log');
async function pollLog() {
try {
const resp = await fetch("{{ url_for('log_tail', slug=slug) }}");
if (!resp.ok) return;
const text = await resp.text();
logEl.textContent = text;
logEl.parentElement.scrollTop = logEl.parentElement.scrollHeight;
} catch (e) {}
}
setInterval(pollLog, 1500);
pollLog();
</script>
</body>
</html>
"""
# -------------------- ROUTES --------------------
@app.route("/static/<path:filename>")
def static_file(filename):
    """Serve *filename* from ``STATIC_DIR`` via Flask's ``send_from_directory``."""
    response = send_from_directory(STATIC_DIR, filename)
    return response
@app.route("/", methods=["GET"])
def index():
    """Render the dashboard: one table row per mirror plus the category pills."""
    known_categories = set()
    table_rows = []
    for mirror in load_mirrors():
        tags = mirror.get("categories") or []
        known_categories.update(tags)

        stamp = mirror.get("last_updated")
        if stamp:
            # Turn the stored timestamp into a friendlier display form.
            pretty = stamp.replace("T", " ").replace("Z", " UTC")
        else:
            pretty = None

        table_rows.append({
            "slug": mirror["slug"],
            "categories": tags,
            "categories_joined": ", ".join(tags),
            "url": mirror["url"],
            "status": mirror.get("status") or "idle",
            "last_updated_raw": stamp,
            "last_updated": pretty,
        })

    return render_template_string(
        INDEX_TEMPLATE,
        mirrors=table_rows,
        categories=sorted(known_categories),
        error=None,
    )
def _render_index_with_error(message):
    """Re-render the dashboard with *message* in the error banner (HTTP 400)."""
    known_categories = set()
    rows = []
    for m in load_mirrors():
        cs = m.get("categories") or []
        known_categories.update(cs)
        raw = m.get("last_updated")
        disp = raw.replace("T", " ").replace("Z", " UTC") if raw else None
        rows.append({
            "slug": m["slug"],
            "categories": cs,
            "categories_joined": ", ".join(cs),
            "url": m["url"],
            "status": m.get("status") or "idle",
            "last_updated_raw": raw,
            "last_updated": disp,
        })
    page = render_template_string(
        INDEX_TEMPLATE,
        mirrors=rows,
        categories=sorted(known_categories),
        error=message,
    )
    return page, 400


@app.route("/add", methods=["POST"])
def add_mirror_route():
    """Handle the "Add mirror" form.

    Validates the submitted slug, categories and URL, registers the mirror
    through ``add_mirror`` and starts the first update in the background.
    On a validation or registration failure the dashboard is re-rendered
    with the error message and status 400; on success the client is
    redirected to the index page.
    """
    slug = (request.form.get("slug") or "").strip()
    categories = (request.form.get("categories") or "").strip()
    url = (request.form.get("url") or "").strip()
    ignore_robots = bool(request.form.get("ignore_robots"))

    if not slug or not categories or not url:
        return _render_index_with_error("Slug, categories, and URL are required.")
    if " " in slug:
        return _render_index_with_error("Slug cannot contain spaces.")

    try:
        add_mirror(slug, categories, url, ignore_robots=ignore_robots)
    except Exception as e:
        # Request boundary: whatever add_mirror raised is shown to the user.
        return _render_index_with_error(str(e))

    _run_update_in_background(slug)
    return redirect(url_for("index"))
@app.route("/update/<slug>", methods=["POST"])
def trigger_update(slug):
    """Start a background update for *slug*, then send the client back to the index."""
    _run_update_in_background(slug)
    destination = url_for("index")
    return redirect(destination)
@app.route("/status", methods=["GET"])
def status():
    """Return the state of every mirror as JSON (consumed by the dashboard poller)."""

    def describe(mirror):
        stamp = mirror.get("last_updated")
        shown = stamp.replace("T", " ").replace("Z", " UTC") if stamp else ""
        return {
            "slug": mirror["slug"],
            "categories": mirror.get("categories") or [],
            "url": mirror["url"],
            "status": mirror.get("status") or "idle",
            "last_updated": stamp,
            "last_updated_display": shown,
        }

    return jsonify({"mirrors": [describe(m) for m in load_mirrors()]})
@app.route("/logs/<slug>")
def log_view(slug):
    """Render the live-log page for *slug*, creating an empty log file if none exists."""
    log_file = LOG_ROOT / f"{slug}.log"
    already_there = log_file.exists()
    if not already_there:
        log_file.touch()
    return render_template_string(LOG_TEMPLATE, slug=slug)
@app.route("/logs/<slug>/tail")
def log_tail(slug):
    """Return at most the last 64 KiB of the log for *slug* as text.

    A missing or unreadable log yields an empty 200 response.
    """
    log_file = LOG_ROOT / f"{slug}.log"
    if not log_file.exists():
        return "", 200

    tail_bytes = 65536
    try:
        with log_file.open("rb") as handle:
            total = handle.seek(0, 2)  # seek() returns the new absolute offset
            handle.seek(max(0, total - tail_bytes))
            payload = handle.read()
    except OSError:
        return "", 200
    # The cut may land inside a multi-byte character; replace rather than fail.
    return payload.decode("utf-8", errors="replace")
def strip_html(text: str) -> str:
    """Reduce an HTML fragment to plain text.

    ``<script>`` and ``<style>`` elements are dropped together with their
    contents, every remaining tag is replaced by a space, HTML entities are
    decoded, and whitespace runs are collapsed to single spaces.
    """
    # Elements whose *content* must disappear, not just the tags.
    element_patterns = (
        r"<script\b[^<]*(?:(?!</script>)<[^<]*)*</script>",
        r"<style\b[^<]*(?:(?!</style>)<[^<]*)*</style>",
    )
    for pattern in element_patterns:
        text = re.sub(pattern, " ", text, flags=re.IGNORECASE)

    without_tags = re.sub(r"<[^>]+>", " ", text)
    decoded = html.unescape(without_tags)
    return re.sub(r"\s+", " ", decoded).strip()
def make_snippet(text: str,
                 query: str,
                 radius: int = 80,
                 max_len: int = 240) -> str:
    """Return a short excerpt of *text* centred on the first match of *query*.

    The match is a case-insensitive literal substring search. When *query*
    occurs, the excerpt spans *radius* characters on either side of it; when
    it does not, the first *max_len* characters of *text* are returned.
    An ellipsis marks every side on which text was cut off (previously the
    markers were empty strings, so truncation was invisible).

    Returns an empty string for empty *text*.
    """
    if not text:
        return ""

    ellipsis = "\u2026"  # "…"
    idx = text.lower().find(query.lower())
    if idx == -1:
        snippet = text[:max_len]
        if len(text) > max_len:
            snippet += ellipsis
        return snippet

    start = max(0, idx - radius)
    end = min(len(text), idx + len(query) + radius)
    snippet = text[start:end]
    if start > 0:
        snippet = ellipsis + snippet
    if end < len(text):
        snippet += ellipsis
    return snippet
@app.route("/search", methods=["GET"])
def content_search():
    """Search the mirrored pages with ripgrep and return matches as JSON.

    The ``q`` query parameter is handed to ``rg`` as its pattern. Each match
    becomes ``{"path", "line", "url", "snippet"}`` with markup stripped from
    the snippet; at most 50 results are returned. Failures to run ``rg`` are
    reported as a single pseudo-result whose path is ``"(error)"``.
    """

    def _error_payload(message):
        return jsonify({
            "results": [{
                "path": "(error)",
                "line": 0,
                "url": "",
                "snippet": message,
            }]
        })

    q = (request.args.get("q") or "").strip()
    if not q:
        return jsonify({"results": []})

    try:
        proc = subprocess.run(
            [
                "rg",
                "--line-number",
                "--no-heading",
                "--color", "never",
                "--max-count", "5",  # per file
                "--type-add", "page:*.{html,htm,md,markdown,txt}",
                "-tpage",
                # End of options: a query starting with "-" must be treated
                # as the pattern, never as an rg flag chosen by the client.
                "--",
                q,
                str(MIRROR_ROOT),
            ],
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            text=True,
            timeout=10,
        )
    except FileNotFoundError:
        return _error_payload("ripgrep (rg) is not installed.")
    except subprocess.TimeoutExpired:
        return _error_payload("rg timed out.")

    # rg exits with 2 on errors (e.g. an invalid regex). Only report it when
    # nothing was printed, since partial failures can still yield matches.
    if proc.returncode > 1 and not proc.stdout:
        return _error_payload("rg failed (is the query a valid regular expression?).")

    results = []
    for line in proc.stdout.splitlines():
        parts = line.split(":", 2)
        if len(parts) != 3:
            continue
        path, lineno, raw_content = parts
        try:
            line_number = int(lineno)
        except ValueError:
            # Not "path:line:content" (e.g. a path containing ':'); skip it.
            continue

        # Strip HTML/JS/CSS markup from this line before making a snippet
        text_content = strip_html(raw_content)
        if not text_content:
            continue
        snippet = make_snippet(text_content, q)

        try:
            rel_path = str(Path(path).relative_to(MIRROR_ROOT))
        except ValueError:
            rel_path = path
        url = "/mirrors/" + rel_path.replace("\\", "/")

        results.append({
            "path": rel_path,
            "line": line_number,
            "url": url,
            "snippet": snippet,
        })
        if len(results) >= 50:
            break

    return jsonify({"results": results})
if __name__ == "__main__":
app.run(host="127.0.0.1", port=5000, debug=False)

View File

@@ -1,61 +0,0 @@
#!/usr/bin/env python3
"""Regenerate the static index page from the ``mirrors.txt`` URL list."""
import pathlib
import html

BASE = pathlib.Path("/srv/www")
URL_LIST = BASE / "mirrors.txt"
OUTDIR = BASE / "mirrors"
INDEX = BASE / "index.html"

# Collect (slug, url, status) for every "slug url" line in the list.
entries = []
if URL_LIST.exists():
    for raw_line in URL_LIST.read_text(encoding="utf-8").splitlines():
        record = raw_line.strip()
        if not record or record.startswith("#"):
            continue
        fields = record.split(None, 1)
        if len(fields) != 2:
            continue
        slug, url = fields
        # not mirrored yet, but still list it
        status = "" if (OUTDIR / slug).exists() else " (not downloaded yet)"
        entries.append((slug, url, status))

# Link goes to the directory; nginx autoindex or an index file will handle it
items_html = [
    f'<li><a href="mirrors/{html.escape(slug)}/">{html.escape(slug)}</a>'
    f' <code>{html.escape(url)}</code>{html.escape(status)}</li>'
    for slug, url, status in entries
]

html_doc = f"""<!doctype html>
<html>
<head>
<meta charset="utf-8">
<title>My Tutorial Mirrors</title>
<style>
body {{ font-family: sans-serif; max-width: 800px; margin: 2rem auto; }}
h1 {{ margin-bottom: 0.5rem; }}
code {{ font-size: 0.9em; }}
</style>
</head>
<body>
<h1>Nytegear Mirrors</h1>
<p>This page is generated automatically from <code>mirrors.txt</code>.</p>
<ul>
{''.join(items_html)}
</ul>
</body>
</html>
"""
INDEX.write_text(html_doc, encoding="utf-8")

3
mirage/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
"""Mirage core package."""
__all__ = []

44
mirage/cli.py Normal file
View File

@@ -0,0 +1,44 @@
from __future__ import annotations
import typer
from .commands import mirrors_app, misc_app
from .daemon import run_daemon
app = typer.Typer(
help=(
"Mirage - mirror management that's too good to be true.\n\n"
"Manage local mirrors of websites for offline use.\n"
"Use `mirage mirrors ...` to add/list/update/search.\n"
"Run `mirage daemon` (e.g. under systemd) to process update jobs."
)
)
app.add_typer(mirrors_app, name="mirrors")
app.add_typer(misc_app, name="misc")
@app.command("daemon")
def daemon_cmd(
    # Forwarded unchanged to run_daemon() below.
    poll_interval: float = typer.Option(
        1.0,
        "--poll-interval",
        help="Seconds between job queue polls.",
    ),
):
    """
    Run the Mirage mirror daemon.
    This is intended to be managed by systemd:
    - `ExecStart=/usr/bin/mirage daemon`
    """
    # NOTE(review): Typer presumably shows the docstring above as the
    # command's --help text, so treat its wording as user-facing.
    run_daemon(poll_interval=poll_interval)
def app_main():
app()
if __name__ == "__main__":
app_main()

View File

@@ -0,0 +1,4 @@
from .mirrors import mirrors_app
from .misc import misc_app
__all__ = ["mirrors_app", "misc_app"]

300
mirage/commands/mirrors.py Normal file
View File

@@ -0,0 +1,300 @@
from __future__ import annotations
import os
import time
from typing import List, Optional
import typer
from .. import storage, jobs
from ..models import Mirror
from ..updater import log_path_for
mirrors_app = typer.Typer(
help="Manage mirrors (add, list, update, search, status, watch).")
@mirrors_app.command("list")
def list_mirrors_cmd():
    """
    List all configured mirrors.
    """
    configured = storage.list_mirrors()
    if not configured:
        typer.echo("No mirrors configured.")
        raise typer.Exit(0)

    # Two lines per mirror: a summary line, then the source URL.
    for entry in configured:
        tags = ", ".join(entry.categories) if entry.categories else "-"
        state = entry.status or "idle"
        if entry.last_updated:
            stamp = entry.last_updated.isoformat(sep=" ", timespec="seconds")
        else:
            stamp = "never"
        typer.echo(f"{entry.slug:20} [{state:8}] {tags:25} {stamp}")
        typer.echo(f" {entry.url}")
@mirrors_app.command("add")
def add_mirror_cmd(
    slug: str = typer.Argument(...,
                               help="Local slug for the mirror (unique)."),
    url: str = typer.Argument(..., help="Source URL to mirror."),
    category: List[str] = typer.Option(
        None,
        "--category",
        "-c",
        help="Category tag(s) to apply. Can be passed multiple times.",
    ),
    ignore_robots: bool = typer.Option(
        False,
        "--ignore-robots",
        help="Ignore robots.txt when mirroring (wget robots=off).",
    ),
    no_update: bool = typer.Option(
        False,
        "--no-update",
        help="Do not enqueue an initial update job.",
    ),
):
    """
    Add a new mirror definition.
    By default, this queues an initial update job and returns immediately.
    The actual mirroring is handled by the mirage daemon.
    """
    # Slugs are unique: refuse to overwrite an existing definition.
    existing = storage.get_mirror(slug)
    if existing:
        # Keep the replacement field on one line: a newline inside the braces
        # of a single-quoted f-string is only valid syntax from Python 3.12.
        typer.echo(f"Error: mirror with slug {slug!r} already exists.", err=True)
        raise typer.Exit(1)

    m = Mirror(
        slug=slug,
        url=url,
        categories=category or [],  # the option is None when not passed
        ignore_robots=ignore_robots,
    )
    storage.upsert_mirror(m)
    typer.echo(f"Added mirror {slug!r} -> {url}")

    if no_update:
        typer.echo("Initial update NOT queued (per --no-update).")
        return

    jobs.enqueue_update(slug)
    typer.echo("Initial update job queued.")
    typer.echo("Run `mirage mirrors status` or `mirage mirrors watch` to monitor.")
@mirrors_app.command("edit")
def edit_mirror_cmd(
    slug: str = typer.Argument(..., help="Mirror slug to edit."),
    new_slug: Optional[str] = typer.Option(
        None,
        "--slug",
        help="Rename the mirror to this slug.",
    ),
    url: Optional[str] = typer.Option(
        None,
        "--url",
        help="Update the source URL.",
    ),
    category: List[str] = typer.Option(
        None,
        "--category",
        "-c",
        help="Replace categories with these (can be passed multiple times).",
    ),
    add_category: List[str] = typer.Option(
        None,
        "--add-category",
        help="Add category tag(s) without removing existing ones.",
    ),
    remove_category: List[str] = typer.Option(
        None,
        "--remove-category",
        help="Remove these category tag(s) from the mirror.",
    ),
    ignore_robots: Optional[bool] = typer.Option(
        None,
        "--ignore-robots/--respect-robots",
        help="Toggle ignoring robots.txt.",
    ),
):
    """
    Modify properties of an existing mirror (URL, categories, ignore_robots, slug).
    """
    m = storage.get_mirror(slug)
    if not m:
        typer.echo(f"No such mirror: {slug!r}", err=True)
        raise typer.Exit(1)

    original_slug = m.slug
    renaming = new_slug is not None and new_slug != original_slug

    # A rename is "upsert under the new slug, delete the old one"; without this
    # guard it would silently replace another mirror that already owns the slug.
    if renaming and storage.get_mirror(new_slug):
        typer.echo(f"Error: mirror with slug {new_slug!r} already exists.", err=True)
        raise typer.Exit(1)

    if url is not None:
        m.url = url

    # Category edits apply in order: replace, then add, then remove.
    if category:
        m.categories = list(category)
    if add_category:
        for c in add_category:
            if c not in m.categories:
                m.categories.append(c)
    if remove_category:
        m.categories = [c for c in m.categories if c not in remove_category]

    if ignore_robots is not None:
        m.ignore_robots = ignore_robots

    if renaming:
        # Simple rename: save under the new slug, then drop the old entry.
        m.slug = new_slug
        storage.upsert_mirror(m)
        storage.delete_mirror(original_slug)
        typer.echo(f"Mirror {original_slug!r} renamed to {new_slug!r}.")
    else:
        storage.upsert_mirror(m)
        typer.echo(f"Mirror {slug!r} updated.")
@mirrors_app.command("remove")
def remove_mirror_cmd(
    slug: str = typer.Argument(..., help="Mirror slug to remove."),
):
    """
    Remove a mirror definition (does not delete files on disk).
    """
    # delete_mirror() reports through its return value whether an entry existed.
    if not storage.delete_mirror(slug):
        typer.echo(f"No such mirror: {slug!r}", err=True)
        raise typer.Exit(1)

    for message in (
        f"Removed mirror {slug!r} from metadata.",
        "NOTE: this does not delete the mirrored files on disk.",
    ):
        typer.echo(message)
@mirrors_app.command("update")
def update_mirror_cmd(
    slug: str = typer.Argument(..., help="Mirror slug to update."),
):
    """
    Enqueue an update job for a single mirror (non-blocking).
    """
    # Only queue work for mirrors that are actually configured.
    known = storage.get_mirror(slug)
    if known:
        jobs.enqueue_update(slug)
        typer.echo(f"Update job queued for {slug!r}.")
        return

    typer.echo(f"No such mirror: {slug!r}", err=True)
    raise typer.Exit(1)
@mirrors_app.command("update-all")
def update_all_cmd():
    """
    Enqueue update jobs for all mirrors (non-blocking).
    """
    configured = storage.list_mirrors()
    if not configured:
        typer.echo("No mirrors configured.")
        raise typer.Exit(0)

    queued = 0
    for mirror in configured:
        # Avoid spamming duplicates if already queued/updating
        if mirror.status not in ("queued", "updating"):
            jobs.enqueue_update(mirror.slug)
            queued += 1

    typer.echo(f"Queued update jobs for {queued} mirror(s).")
    typer.echo("Daemon will process them in the background.")
@mirrors_app.command("status")
def status_cmd(
    slug: Optional[str] = typer.Argument(
        None,
        help="Optional mirror slug. If omitted, show status for all mirrors.",
    ),
):
    """
    Show current status for mirrors.
    """
    if slug is None:
        # Overview: one summary line per mirror, plus its last error if any.
        mirrors = storage.list_mirrors()
        if not mirrors:
            typer.echo("No mirrors configured.")
            raise typer.Exit(0)
        for m in mirrors:
            cats = ", ".join(m.categories) if m.categories else "-"
            status = m.status or "idle"
            lu = m.last_updated.isoformat(
                sep=" ", timespec="seconds") if m.last_updated else "never"
            typer.echo(f"{m.slug:20} [{status:8}] {cats:25} {lu}")
            if m.last_error:
                typer.echo(f" last_error: {m.last_error}")
        return

    # Detail view for a single mirror.
    m = storage.get_mirror(slug)
    if not m:
        typer.echo(f"No such mirror: {slug!r}", err=True)
        raise typer.Exit(1)

    # Computed up front so no f-string replacement field has to span lines
    # (that is only valid syntax from Python 3.12 onwards).
    cats = ", ".join(m.categories) if m.categories else "-"
    lu = m.last_updated.isoformat(
        sep=" ", timespec="seconds") if m.last_updated else "never"

    typer.echo(f"slug : {m.slug}")
    typer.echo(f"url : {m.url}")
    typer.echo(f"categories : {cats}")
    typer.echo(f"ignore_robots: {m.ignore_robots}")
    typer.echo(f"status : {m.status or 'idle'}")
    typer.echo(f"last_updated : {lu}")
    if m.last_error:
        typer.echo(f"last_error : {m.last_error}")
@mirrors_app.command("watch")
def watch_cmd(
    slug: str = typer.Argument(..., help="Mirror slug to watch log for."),
    lines: int = typer.Option(
        40,
        "--lines",
        "-n",
        help="Show this many trailing lines before following.",
    ),
):
    """
    Tail the wget log for a mirror (like `tail -f`).
    Ctrl-C exits the watch without stopping the update job.
    """
    from collections import deque

    log_path = log_path_for(slug)
    if not log_path.exists():
        typer.echo(f"No log file yet for {slug!r}: {log_path}")
        raise typer.Exit(1)

    typer.echo(f"Watching log: {log_path}")
    try:
        # errors="replace": a stray non-UTF-8 byte in the log must not abort
        # the watch with a UnicodeDecodeError.
        with log_path.open("r", encoding="utf-8", errors="replace") as f:
            # show last N lines; a bounded deque keeps only the tail in memory
            # (a non-positive N keeps the existing "show everything" behaviour)
            tail = deque(f, maxlen=lines if lines > 0 else None)
            for line in tail:
                typer.echo(line.rstrip("\n"))

            # now follow, on the same handle: reopening the file would drop
            # anything written between the tail read and the second open
            while True:
                where = f.tell()
                line = f.readline()
                if not line:
                    time.sleep(0.5)
                    f.seek(where)
                else:
                    typer.echo(line.rstrip("\n"))
    except KeyboardInterrupt:
        typer.echo("\n[watch] Detaching from log.")

33
mirage/commands/misc.py Normal file
View File

@@ -0,0 +1,33 @@
from __future__ import annotations
import typer
# type: ignore[attr-defined]
from ..config import load_config, _default_config_path
misc_app = typer.Typer(help="Miscellaneous commands (config, info).")
@misc_app.command("config-path")
def config_path_cmd():
    """
    Show where the active config file is located (or would be created).
    """
    # Local import: only this command needs the search-path helper.
    from ..config import _search_config_paths

    # Same resolution order as load_config(): the first candidate that is an
    # existing file wins; otherwise report where the default would be created.
    active = next((p for p in _search_config_paths() if p.is_file()), None)
    if active is None:
        active = _default_config_path()
    typer.echo(str(active))
@misc_app.command("config-show")
def config_show_cmd():
    """
    Print the current configuration values.
    """
    cfg = load_config()
    # Attributes are read and printed one at a time, in this order.
    shown_keys = (
        "mirror_root",
        "data_dir",
        "log_dir",
        "db_path",
        "max_concurrent_updates",
        "wget_bin",
        "rg_bin",
    )
    for key in shown_keys:
        typer.echo(f"{key} = {getattr(cfg, key)}")

146
mirage/config.py Normal file
View File

@@ -0,0 +1,146 @@
from __future__ import annotations
import os
import tomllib # Python 3.11+; on 3.10 use 'tomli' instead
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
DEFAULT_MIRROR_ROOT = Path("/srv/www/mirrors")
DEFAULT_DATA_DIR = Path("~/.local/share/mirrorctl").expanduser()
DEFAULT_MAX_CONCURRENT_UPDATES = 4
DEFAULT_WGET_BIN = "wget"
DEFAULT_RG_BIN = "rg"
@dataclass
class Config:
    """Runtime settings plus the paths derived from ``data_dir``.

    The derived-path properties create the directories they refer to as a
    side effect of being read.
    """

    mirror_root: Path
    data_dir: Path
    max_concurrent_updates: int
    wget_bin: str
    rg_bin: str

    def _ensure_subdir(self, name: str) -> Path:
        """Create ``data_dir/<name>`` (with parents) if needed and return it."""
        target = self.data_dir / name
        target.mkdir(parents=True, exist_ok=True)
        return target

    @property
    def log_dir(self) -> Path:
        """Directory ``data_dir/logs``, created on access."""
        return self._ensure_subdir("logs")

    @property
    def db_path(self) -> Path:
        """Path ``data_dir/mirrors.json``; only ``data_dir`` itself is created."""
        self.data_dir.mkdir(parents=True, exist_ok=True)
        return self.data_dir / "mirrors.json"

    @property
    def config_dir(self) -> Path:
        """Directory ``data_dir/config``, created on access."""
        # For future use (e.g. storing per-mirror configs)
        return self._ensure_subdir("config")
def default_config() -> Config:
    """Build a Config populated entirely from the module-level DEFAULT_* values."""
    defaults = {
        "mirror_root": DEFAULT_MIRROR_ROOT,
        "data_dir": DEFAULT_DATA_DIR,
        "max_concurrent_updates": DEFAULT_MAX_CONCURRENT_UPDATES,
        "wget_bin": DEFAULT_WGET_BIN,
        "rg_bin": DEFAULT_RG_BIN,
    }
    return Config(**defaults)
def _default_config_path() -> Path:
    """Return the per-user config file location.

    Uses $XDG_CONFIG_HOME when it is set to a non-empty value, otherwise
    ``~/.config`` with the home directory expanded.
    """
    xdg_home = os.getenv("XDG_CONFIG_HOME")
    config_home = Path(xdg_home) if xdg_home else Path("~/.config").expanduser()
    return config_home.joinpath("mirrorctl", "config.toml")
def _search_config_paths() -> list[Path]:
    """Return candidate config locations, highest priority first.

    Order: $MIRRORCTL_CONFIG (only when set and non-empty), the per-user
    config path, then the system-wide /etc path.
    """
    override = os.getenv("MIRRORCTL_CONFIG")
    candidates: list[Path] = [Path(override)] if override else []
    candidates += [
        # user config
        _default_config_path(),
        # system config
        Path("/etc/mirrorctl/config.toml"),
    ]
    return candidates
def _ensure_default_config_file(path: Path, cfg: Config) -> None:
if path.exists():
return
path.parent.mkdir(parents=True, exist_ok=True)
content = f"""# mirrorctl configuration
# Directory where mirrors will be stored
mirror_root = "{cfg.mirror_root}"
# Directory for mirrorctl metadata (db, logs, etc.)
data_dir = "{cfg.data_dir}"
# Max parallel mirror updates
max_concurrent_updates = {cfg.max_concurrent_updates}
# Path to wget binary
wget_bin = "{cfg.wget_bin}"
# Path to ripgrep (rg) binary
rg_bin = "{cfg.rg_bin}"
"""
path.write_text(content, encoding="utf-8")
def load_config() -> Config:
    """
    Load configuration from MIRRORCTL_CONFIG, XDG config, or /etc.

    The first candidate that exists as a regular file is used. If no config
    exists, a default one is created at the per-user location and read back.

    Falls back to built-in defaults when the file cannot be created, read,
    or parsed. ``mirror_root`` and ``data_dir`` have a leading ``~``
    expanded, so home-relative paths in the config file work as expected.

    Returns:
        The effective Config (file values layered over the defaults).

    Raises:
        ValueError: if ``max_concurrent_updates`` is not convertible to int.
    """
    cfg = default_config()
    used_path: Optional[Path] = next(
        (p for p in _search_config_paths() if p.is_file()),
        None,
    )
    if used_path is None:
        # create default user config and read it back
        used_path = _default_config_path()
        try:
            _ensure_default_config_file(used_path, cfg)
        except OSError:
            # The config location is not writable (e.g. read-only or missing
            # home directory); running on defaults beats refusing to start.
            return cfg

    try:
        raw = used_path.read_bytes()
        data = tomllib.loads(raw.decode("utf-8"))
    except (OSError, ValueError):
        # ValueError covers both UnicodeDecodeError and
        # tomllib.TOMLDecodeError. Fall back to defaults if unreadable.
        data = {}

    # Apply overrides from file
    mirror_root = Path(data.get("mirror_root", cfg.mirror_root)).expanduser()
    data_dir = Path(data.get("data_dir", cfg.data_dir)).expanduser()
    max_concurrent = int(
        data.get("max_concurrent_updates", cfg.max_concurrent_updates))
    wget_bin = str(data.get("wget_bin", cfg.wget_bin))
    rg_bin = str(data.get("rg_bin", cfg.rg_bin))
    return Config(
        mirror_root=mirror_root,
        data_dir=data_dir,
        max_concurrent_updates=max_concurrent,
        wget_bin=wget_bin,
        rg_bin=rg_bin,
    )

78
mirage/daemon.py Normal file
View File

@@ -0,0 +1,78 @@
from __future__ import annotations
import time
from concurrent.futures import ThreadPoolExecutor, Future
from datetime import datetime
from typing import Dict, Tuple
from .config import load_config
from . import jobs
from . import storage
from .updater import update_mirror
def run_daemon(poll_interval: float = 1.0) -> None:
    """
    Simple job-processing daemon.

    - Watches jobs/pending for new update jobs.
    - Moves them to jobs/running.
    - Runs wget via update_mirror() with concurrency.

    At most one update per slug runs at a time: a pending job whose slug is
    already being updated stays in pending until that update finishes.

    Args:
        poll_interval: Seconds to sleep between scans of the job directories.

    Runs until interrupted with KeyboardInterrupt.
    """
    # Local import: pathlib is not imported at this module's top level.
    from pathlib import Path

    cfg = load_config()
    max_workers = max(1, cfg.max_concurrent_updates)
    executor = ThreadPoolExecutor(max_workers=max_workers)
    # Map Future -> (job_path, slug)
    running: Dict[Future, Tuple[str, str]] = {}
    print(f"[mirage-daemon] starting with max_workers={max_workers}")
    print(f"[mirage-daemon] jobs dir: {cfg.data_dir / 'jobs'}")
    try:
        while True:
            # 1. Collect finished jobs
            for fut in [f for f in running if f.done()]:
                job_path, slug = running.pop(fut)
                # Remove job file from running
                Path(job_path).unlink(missing_ok=True)
                try:
                    fut.result()
                except Exception as e:  # noqa: BLE001
                    # Internal failure => mark mirror as error
                    m = storage.get_mirror(slug)
                    if m:
                        m.status = "error"
                        m.last_error = f"Internal error: {e!r}"
                        m.last_updated = datetime.now()
                        storage.upsert_mirror(m)

            # 2. If we have capacity, pull jobs from pending
            capacity = max_workers - len(running)
            if capacity > 0:
                active_slugs = {slug for _, slug in running.values()}
                for pending_path, job in jobs.list_pending_jobs():
                    if capacity <= 0:
                        break
                    if job.slug in active_slugs:
                        # Two concurrent runs for one slug would write into
                        # the same mirror directory and log; retry later.
                        continue
                    try:
                        running_path = jobs.move_to_running(pending_path)
                    except OSError:
                        # The file vanished or could not be moved; leave it
                        # for the next scan instead of stopping the daemon.
                        continue
                    # mark mirror as updating early
                    m = storage.get_mirror(job.slug)
                    if m:
                        m.status = "updating"
                        m.last_error = None
                        storage.upsert_mirror(m)
                    fut = executor.submit(update_mirror, job.slug)
                    running[fut] = (str(running_path), job.slug)
                    active_slugs.add(job.slug)
                    capacity -= 1
            time.sleep(poll_interval)
    except KeyboardInterrupt:
        print("[mirage-daemon] shutting down (KeyboardInterrupt)")
    finally:
        executor.shutdown(wait=False)

108
mirage/jobs.py Normal file
View File

@@ -0,0 +1,108 @@
from __future__ import annotations
import json
import uuid
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import List, Tuple
from .config import load_config
from . import storage
@dataclass
class Job:
    """A queued unit of work for one mirror, convertible to and from a dict."""

    id: str
    slug: str
    type: str  # currently only "update"
    queued_at: datetime

    def to_dict(self) -> dict:
        """Return a JSON-friendly dict; ``queued_at`` becomes an ISO string."""
        payload = {name: getattr(self, name) for name in ("id", "slug", "type")}
        payload["queued_at"] = self.queued_at.isoformat()
        return payload

    @classmethod
    def from_dict(cls, data: dict) -> "Job":
        """Rebuild a Job from the dict shape produced by ``to_dict``."""
        return cls(
            data["id"],
            data["slug"],
            data["type"],
            datetime.fromisoformat(data["queued_at"]),
        )
def _jobs_root() -> Path:
    """Return ``<data_dir>/jobs``, creating it and its two state subdirs."""
    jobs_dir = load_config().data_dir / "jobs"
    jobs_dir.mkdir(parents=True, exist_ok=True)
    for state in ("pending", "running"):
        (jobs_dir / state).mkdir(exist_ok=True)
    return jobs_dir
def pending_dir() -> Path:
    """Return the ``pending`` directory under the jobs root."""
    return _jobs_root().joinpath("pending")
def running_dir() -> Path:
    """Return the ``running`` directory under the jobs root."""
    return _jobs_root().joinpath("running")
def enqueue_update(slug: str) -> Path:
    """
    Enqueue an update job for the given slug.

    The job file is written under a temporary name and then renamed into
    place, so a reader scanning for ``*.json`` never sees a half-written job.

    Mark mirror status as 'queued' (unless it's already queued/updating).
    A slug with no stored mirror still gets a job file; only the status
    update is skipped.

    Returns:
        Path of the job file in the pending directory.
    """
    job = Job(
        id=uuid.uuid4().hex,
        slug=slug,
        type="update",
        queued_at=datetime.now(),
    )
    path = pending_dir() / f"{job.id}.json"
    # The ".tmp" name does not match the "*.json" pattern used when listing
    # pending jobs, so the file only becomes visible once it is complete.
    tmp_path = path.with_name(path.name + ".tmp")
    with tmp_path.open("w", encoding="utf-8") as f:
        json.dump(job.to_dict(), f)
    tmp_path.replace(path)

    m = storage.get_mirror(slug)
    if m and m.status not in ("queued", "updating"):
        m.status = "queued"
        m.last_error = None
        storage.upsert_mirror(m)
    return path
def list_pending_jobs() -> List[Tuple[Path, Job]]:
    """Return the readable pending jobs, oldest first.

    Job file names are random hex ids, so ordering by name is arbitrary;
    jobs are ordered by their ``queued_at`` timestamp instead (file name as
    tie-breaker) to give first-in, first-out processing.

    Files that cannot be read or parsed are skipped.

    Returns:
        List of ``(path, job)`` pairs.
    """
    found: List[Tuple[Path, Job]] = []
    for path in pending_dir().glob("*.json"):
        try:
            data = json.loads(path.read_text(encoding="utf-8"))
            job = Job.from_dict(data)
        except (OSError, ValueError, KeyError, TypeError):
            # Unreadable file, invalid JSON or timestamp, missing key, or a
            # payload of the wrong shape.
            continue
        found.append((path, job))

    def queue_order(item: Tuple[Path, Job]):
        path, job = item
        # Drop tzinfo so naive and aware timestamps can be compared.
        return (job.queued_at.replace(tzinfo=None), path.name)

    found.sort(key=queue_order)
    return found
def load_job(path: Path) -> Job:
    """Read the JSON job file at *path* and return it as a Job."""
    with path.open("r", encoding="utf-8") as fh:
        return Job.from_dict(json.load(fh))
def move_to_running(pending_path: Path) -> Path:
    """
    Move a pending job file into the running directory.

    The file keeps its name. Returns the new path.
    """
    target = running_dir().joinpath(pending_path.name)
    pending_path.replace(target)
    return target

View File

@@ -0,0 +1,3 @@
from .mirror import Mirror
__all__ = ["Mirror"]

58
mirage/models/mirror.py Normal file
View File

@@ -0,0 +1,58 @@
from __future__ import annotations
from dataclasses import dataclass, field
from datetime import datetime
from typing import List, Optional
@dataclass
class Mirror:
    """
    Representation of a single mirrored site.

    Attributes:
        slug: Local identifier (and directory name under mirror_root).
        url: Source URL to mirror.
        categories: Arbitrary tags/categories (strings).
        ignore_robots: Whether to disable robots.txt (wget robots=off).
        status: Current status, e.g. "idle", "queued", "updating",
            "warning" or "error".
        last_updated: Last successful/attempted update timestamp.
        last_error: Text description of last error/warning.
    """

    slug: str
    url: str
    categories: List[str] = field(default_factory=list)
    ignore_robots: bool = False
    status: str = "idle"
    last_updated: Optional[datetime] = None
    last_error: Optional[str] = None

    def to_dict(self) -> dict:
        """Return a JSON-friendly dict; ``last_updated`` is ISO text or None."""
        stamp = self.last_updated.isoformat() if self.last_updated else None
        return dict(
            slug=self.slug,
            url=self.url,
            categories=self.categories,
            ignore_robots=self.ignore_robots,
            status=self.status,
            last_updated=stamp,
            last_error=self.last_error,
        )

    @classmethod
    def from_dict(cls, data: dict) -> "Mirror":
        """Build a Mirror from a dict; only ``slug`` and ``url`` are required."""
        raw_stamp = data.get("last_updated")
        parsed_stamp = datetime.fromisoformat(raw_stamp) if raw_stamp else None
        return cls(
            data["slug"],
            data["url"],
            data.get("categories", []),
            data.get("ignore_robots", False),
            data.get("status", "idle"),
            parsed_stamp,
            data.get("last_error"),
        )

144
mirage/search.py Normal file
View File

@@ -0,0 +1,144 @@
from __future__ import annotations
import html
import re
import subprocess
from pathlib import Path
from typing import List, Dict
from .config import load_config
def strip_html(text: str) -> str:
    """Reduce an HTML fragment to plain, single-spaced text.

    Script and style elements are dropped with their contents, remaining
    tags are removed, entities are decoded, and whitespace is collapsed.
    """
    # Whole <script>...</script> and <style>...</style> blocks go first so
    # their contents never reach the output.
    block_patterns = (
        r"<script\b[^<]*(?:(?!</script>)<[^<]*)*</script>",
        r"<style\b[^<]*(?:(?!</style>)<[^<]*)*</style>",
    )
    for pattern in block_patterns:
        text = re.sub(pattern, " ", text, flags=re.IGNORECASE)
    # Strip all remaining tags
    without_tags = re.sub(r"<[^>]+>", " ", text)
    # Unescape HTML entities (&amp; -> &)
    decoded = html.unescape(without_tags)
    # Collapse whitespace
    return re.sub(r"\s+", " ", decoded).strip()
def make_snippet(text: str,
                 query: str,
                 radius: int = 80,
                 max_len: int = 240) -> str:
    """Return a short excerpt of *text* centred on the first match of *query*.

    The match is case-insensitive. An ellipsis marks every side on which the
    excerpt was cut, so the reader can tell it is not the full text.

    Args:
        text: Plain text to excerpt.
        query: Literal string to look for.
        radius: Characters of context kept on each side of the match.
        max_len: Length of the leading excerpt used when *query* does not
            occur in *text*; it is not applied when a match is found.

    Returns:
        The excerpt, or "" for empty *text*.
    """
    if not text:
        return ""
    # The cut markers were empty strings, which made the truncation checks
    # no-ops; use a real ellipsis.
    marker = "\u2026"
    idx = text.lower().find(query.lower())
    if idx == -1:
        snippet = text[:max_len]
        if len(text) > max_len:
            snippet += marker
        return snippet
    start = max(0, idx - radius)
    end = min(len(text), idx + len(query) + radius)
    prefix = marker if start > 0 else ""
    suffix = marker if end < len(text) else ""
    return f"{prefix}{text[start:end]}{suffix}"
def search_content(query: str, limit: int = 50) -> List[Dict]:
    """
    Run ripgrep across all mirrors and return text snippets around matches.

    Args:
        query: Pattern handed to ripgrep; surrounding whitespace is stripped
            and an empty or None query yields no results.
        limit: Stop after this many results have been collected.

    Returns list of dicts:
        {
            "path": "slug/host/path/file.html",
            "line": 42,
            "snippet": "text around the query ..."
        }

    A missing ripgrep binary or a timeout is reported as a single result
    whose path is "(error)".
    """
    query = (query or "").strip()
    if not query:
        return []
    cfg = load_config()
    root = cfg.mirror_root
    cmd = [
        cfg.rg_bin,
        "--line-number",
        "--no-heading",
        "--color",
        "never",
        "--max-count",
        "5",  # per file
        "--type-add",
        "page:*.{html,htm,md,markdown,txt}",
        "-tpage",
        # End of options: without this, a query starting with "-" would be
        # parsed by rg as a command-line flag instead of the search pattern.
        "--",
        query,
        str(root),
    ]
    try:
        proc = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            text=True,
            timeout=10,
        )
    except FileNotFoundError:
        return [{
            "path": "(error)",
            "line": 0,
            "snippet": f"ripgrep binary not found: {cfg.rg_bin!r}",
        }]
    except subprocess.TimeoutExpired:
        return [{
            "path": "(error)",
            "line": 0,
            "snippet": "rg timed out.",
        }]

    results: List[Dict] = []
    for line in proc.stdout.splitlines():
        # Expected shape is "path:lineno:content"; anything else is skipped.
        parts = line.split(":", 2)
        if len(parts) != 3:
            continue
        path_str, lineno_str, raw_content = parts
        text_content = strip_html(raw_content)
        if not text_content:
            continue
        snippet = make_snippet(text_content, query)
        try:
            rel_path = str(Path(path_str).relative_to(root))
        except ValueError:
            rel_path = path_str
        try:
            lineno = int(lineno_str)
        except ValueError:
            lineno = 0
        results.append({
            "path": rel_path,
            "line": lineno,
            "snippet": snippet,
        })
        if len(results) >= limit:
            break
    return results

60
mirage/storage.py Normal file
View File

@@ -0,0 +1,60 @@
from __future__ import annotations
import json
from threading import RLock
from typing import Dict, List, Optional
from .config import load_config
from .models import Mirror
_lock = RLock()
def _load_raw(path) -> Dict[str, dict]:
    """Return the parsed JSON content of *path*, or {} when it does not exist."""
    if path.exists():
        with path.open("r", encoding="utf-8") as fh:
            return json.load(fh)
    return {}
def _save_raw(path, data: Dict[str, dict]) -> None:
    """Write *data* as indented, key-sorted JSON to *path*.

    The text goes to a sibling ``.tmp`` file first and is then renamed over
    *path*. Missing parent directories are created.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    scratch = path.with_suffix(".tmp")
    serialized = json.dumps(data, indent=2, sort_keys=True)
    with scratch.open("w", encoding="utf-8") as fh:
        fh.write(serialized)
    scratch.replace(path)
def list_mirrors() -> List[Mirror]:
    """Return every stored mirror as a Mirror instance."""
    cfg = load_config()
    with _lock:
        records = _load_raw(cfg.db_path)
    return list(map(Mirror.from_dict, records.values()))
def get_mirror(slug: str) -> Optional[Mirror]:
    """Return the stored mirror for *slug*, or None when there is none."""
    cfg = load_config()
    with _lock:
        records = _load_raw(cfg.db_path)
    if slug in records:
        return Mirror.from_dict(records[slug])
    return None
def upsert_mirror(m: Mirror) -> None:
    """Insert *m* into the store, replacing any entry with the same slug."""
    cfg = load_config()
    with _lock:
        db_file = cfg.db_path
        records = _load_raw(db_file)
        records[m.slug] = m.to_dict()
        _save_raw(db_file, records)
def delete_mirror(slug: str) -> bool:
    """Remove the entry for *slug* from the store.

    Returns:
        True when an entry was removed, False when the slug was not stored.
    """
    cfg = load_config()
    with _lock:
        db_file = cfg.db_path
        records = _load_raw(db_file)
        if slug in records:
            records.pop(slug)
            _save_raw(db_file, records)
            return True
        return False

148
mirage/updater.py Normal file
View File

@@ -0,0 +1,148 @@
from __future__ import annotations
import subprocess
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from pathlib import Path
from typing import Iterable, Tuple, Dict
from .config import load_config
from .models import Mirror
from . import storage
def mirror_dir_for(slug: str) -> Path:
    """Return ``<mirror_root>/<slug>``, creating the directory if needed."""
    target = load_config().mirror_root.joinpath(slug)
    target.mkdir(parents=True, exist_ok=True)
    return target
def log_path_for(slug: str) -> Path:
    """Return the log file path for *slug* inside the configured log dir."""
    log_dir = load_config().log_dir
    return log_dir / f"{slug}.log"
def _write_log_header(log_path: Path, cmd: list[str]) -> None:
    """Append a timestamped "Running: <command>" separator line to the log."""
    stamp = datetime.now().isoformat()
    command_line = " ".join(cmd)
    with log_path.open("a", encoding="utf-8") as handle:
        handle.write(f"\n=== {stamp} Running: {command_line}\n")
        handle.flush()
def run_wget(mirror: Mirror) -> Tuple[int, Path]:
    """
    Run wget for a single mirror, appending logs to its log file.

    A header line naming the command is written first; wget's stdout and
    stderr are then appended to the same file.

    Returns:
        (exit_code, log_path)
    """
    cfg = load_config()
    target_dir = mirror_dir_for(mirror.slug)
    log_path = log_path_for(mirror.slug)
    robots = "off" if mirror.ignore_robots else "on"
    options = [
        "--mirror",
        "--convert-links",
        "--page-requisites",
        "--no-parent",
        "--adjust-extension",
        f"--execute=robots={robots}",
        "--directory-prefix",
        str(target_dir),
    ]
    cmd = [cfg.wget_bin, *options, mirror.url]
    _write_log_header(log_path, cmd)
    with log_path.open("a", encoding="utf-8") as sink:
        completed = subprocess.run(
            cmd,
            stdout=sink,
            stderr=sink,
            text=True,
        )
    return completed.returncode, log_path
def update_mirror(slug: str) -> Mirror:
    """
    Update a single mirror by slug and persist its status.

    The mirror is stored as "updating" before wget runs. Afterwards it
    becomes "idle" on exit code 0, "warning" on exit code 4, and "error" on
    any other code; non-zero codes also record a message pointing at the log.

    Returns:
        Updated Mirror instance.

    Raises:
        ValueError: if no mirror is stored under *slug*.
    """
    mirror = storage.get_mirror(slug)
    if not mirror:
        raise ValueError(f"Unknown mirror: {slug!r}")

    # Mark as updating
    mirror.status = "updating"
    mirror.last_error = None
    storage.upsert_mirror(mirror)

    code, log_path = run_wget(mirror)

    # Reload to avoid overwriting concurrent changes
    mirror = storage.get_mirror(slug) or mirror
    if code == 0:
        mirror.status = "idle"
    else:
        # network issues (code 4) -> warning; everything else is an error
        mirror.status = "warning" if code == 4 else "error"
        mirror.last_error = f"wget exited with code {code}, see {log_path}"
    mirror.last_updated = datetime.now()
    storage.upsert_mirror(mirror)
    return mirror
def update_all_concurrent(
        slugs: Iterable[str] | None = None) -> Dict[str, Mirror]:
    """
    Update multiple mirrors concurrently.

    The worker count comes from ``max_concurrent_updates`` in the config
    (at least 1).

    Args:
        slugs: Iterable of slugs to update. If None, update all mirrors.

    Returns:
        Mapping slug -> updated Mirror.
    """
    cfg = load_config()
    if slugs is None:
        targets = [m.slug for m in storage.list_mirrors()]
    else:
        targets = list(slugs)

    outcome: Dict[str, Mirror] = {}
    if not targets:
        return outcome

    pool_size = max(1, cfg.max_concurrent_updates)
    with ThreadPoolExecutor(max_workers=pool_size) as pool:
        submitted = {pool.submit(update_mirror, s): s for s in targets}
        for done in as_completed(submitted):
            slug = submitted[done]
            try:
                outcome[slug] = done.result()
            except Exception as e:  # noqa: BLE001
                # If update fails badly, mark error
                failed = storage.get_mirror(slug)
                if failed:
                    failed.status = "error"
                    failed.last_error = f"Internal error: {e!r}"
                    failed.last_updated = datetime.now()
                    storage.upsert_mirror(failed)
                    outcome[slug] = failed
    return outcome

View File

@@ -1,190 +0,0 @@
#!/usr/bin/env python3
import json
import subprocess
import datetime as dt
from pathlib import Path
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
BASE = Path("/srv/www")
DATA_FILE = BASE / "data" / "mirrors.json"
MIRROR_ROOT = BASE / "mirrors"
LOG_ROOT = BASE / "logs"
MIRROR_ROOT.mkdir(parents=True, exist_ok=True)
LOG_ROOT.mkdir(parents=True, exist_ok=True)
DATA_FILE.parent.mkdir(parents=True, exist_ok=True)
_LOCK = threading.Lock()
def _now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (no microseconds)."""
    # datetime.utcnow() is deprecated; take an aware UTC time and drop the
    # tzinfo so isoformat() emits no offset and the "Z" suffix can be added.
    now = dt.datetime.now(dt.timezone.utc)
    return now.replace(microsecond=0, tzinfo=None).isoformat() + "Z"
def load_mirrors() -> list[dict]:
    """Return the mirror records stored in DATA_FILE, or [] if it is absent."""
    with _LOCK:
        if DATA_FILE.exists():
            with DATA_FILE.open("r", encoding="utf-8") as fh:
                return json.load(fh)
        return []
def save_mirrors(mirrors: list[dict]) -> None:
    """Write *mirrors* to DATA_FILE via a sibling ``.tmp`` file and a rename."""
    with _LOCK:
        scratch = DATA_FILE.with_suffix(".tmp")
        with scratch.open("w", encoding="utf-8") as fh:
            json.dump(mirrors, fh, indent=2)
        scratch.replace(DATA_FILE)
def get_mirror(mirrors: list[dict], slug: str) -> dict | None:
for m in mirrors:
if m["slug"] == slug:
return m
return None
def _normalise_categories(raw: str) -> list[str]:
    """Split a comma-separated string into trimmed, non-empty category names."""
    # "tutorials, wgpu, rust" -> ["tutorials","wgpu","rust"]
    return [name for name in map(str.strip, raw.split(",")) if name]
def add_mirror(slug: str,
               categories: str,
               url: str,
               ignore_robots: bool = False) -> dict:
    """Create, persist and return a new mirror record with status "queued".

    Args:
        slug: Identifier for the mirror; must not already be stored.
        categories: Comma-separated category names; at least one is required.
        url: Source URL to mirror.
        ignore_robots: Stored as a bool on the record.

    Raises:
        ValueError: if the slug already exists or no category remains after
            normalisation.
    """
    known = load_mirrors()
    if get_mirror(known, slug) is not None:
        raise ValueError(f"Mirror with slug '{slug}' already exists")
    cats = _normalise_categories(categories)
    if not cats:
        raise ValueError("At least one category is required")
    record = dict(
        slug=slug,
        categories=cats,
        url=url,
        ignore_robots=bool(ignore_robots),
        created_at=_now_iso(),
        last_updated=None,
        status="queued",  # idle | updating | queued | warning | error
        last_error=None,
    )
    known.append(record)
    save_mirrors(known)
    return record
# Sentinel meaning "argument not supplied": needed because None is itself a
# meaningful value for last_error (it clears the stored error).
_UNSET = object()


def _set_status(slug: str, *,
                status: str,
                last_error: str | None = _UNSET,  # type: ignore[assignment]
                last_updated: str | None = None):
    """Persist a new status (and optionally error/timestamp) for *slug*.

    Does nothing when no mirror with that slug is stored.

    Args:
        slug: Mirror to update.
        status: New status string.
        last_error: New error text. Passing None explicitly clears the
            stored error; omitting the argument leaves it untouched.
        last_updated: New timestamp; None leaves the stored value untouched.
    """
    mirrors = load_mirrors()
    m = get_mirror(mirrors, slug)
    if m is None:
        return
    m["status"] = status
    # Previously an "is not None" test made last_error=None a no-op, so a
    # stale error stayed on the record after a later successful update.
    if last_error is not _UNSET:
        m["last_error"] = last_error
    if last_updated is not None:
        m["last_updated"] = last_updated
    save_mirrors(mirrors)
def update_mirror(slug: str) -> None:
    """Run wget mirror for a single slug (blocking in this thread).

    wget output is appended to the slug's log file. The stored status ends
    up as "idle" (exit code 0), "warning" (non-zero exit, but this run's
    output contains "FINISHED --" and the target directory has content) or
    "error" (anything else, including exceptions raised while running).

    Raises:
        ValueError: if no mirror with that slug is stored.
    """
    mirrors = load_mirrors()
    m = get_mirror(mirrors, slug)
    if m is None:
        raise ValueError(f"No such mirror: {slug}")
    _set_status(slug, status="updating", last_error=None)
    target_dir = MIRROR_ROOT / slug
    target_dir.mkdir(parents=True, exist_ok=True)
    log_file = LOG_ROOT / f"{slug}.log"
    robots_setting = "off" if m.get("ignore_robots") else "on"
    cmd = [
        "wget",
        "--mirror",  # recurse, keep timestamps
        "--convert-links",
        "--adjust-extension",
        "--page-requisites",
        "--no-parent",
        "--wait=0.5",
        "--random-wait",
        "--limit-rate=50m",
        "--tries=3",
        "--retry-connrefused",
        f"--execute=robots={robots_setting}",
        "-P",
        str(target_dir),
        m["url"],
    ]
    try:
        # The log is append-only across runs; remember where this run starts
        # so the result check below cannot match output from earlier runs.
        run_start = log_file.stat().st_size if log_file.exists() else 0
        url = m["url"]
        with log_file.open("a", encoding="utf-8") as lf:
            lf.write(f"\n=== {_now_iso()} : Starting mirror of {url} ===\n")
            lf.flush()
            proc = subprocess.run(
                cmd,
                stdout=lf,
                stderr=subprocess.STDOUT,
            )
            code = proc.returncode
            lf.write(f"=== {_now_iso()} : wget exited with code {code} ===\n")
            lf.flush()

        # Classify result
        if code == 0:
            _set_status(slug, status="idle",
                        last_updated=_now_iso(), last_error=None)
        else:
            # If we see FINISHED in this run's output and the directory has
            # content, treat this as a partial/ok-with-warnings case.
            with log_file.open("rb") as raw:
                raw.seek(run_start)
                text = raw.read().decode("utf-8", errors="ignore")
            has_finished = "FINISHED --" in text
            has_files = any(target_dir.rglob("*"))
            if has_finished and has_files:
                _set_status(
                    slug,
                    status="warning",
                    last_updated=_now_iso(),
                    last_error=f"wget exited with {code} (partial; see log)",
                )
            else:
                _set_status(
                    slug,
                    status="error",
                    last_error=f"wget exited with {code}",
                )
    except Exception as e:
        _set_status(
            slug,
            status="error",
            last_error=f"{type(e).__name__}: {e}",
        )
def update_all_mirrors(max_workers: int = 3) -> None:
    """Update every stored mirror, running up to *max_workers* in parallel.

    An exception escaping an individual update is recorded on that mirror as
    an "error" status rather than propagated.
    """
    slugs = [m["slug"] for m in load_mirrors()]
    if not slugs:
        return
    # Run several in parallel
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {pool.submit(update_mirror, slug): slug for slug in slugs}
        for fut in as_completed(futures):
            slug = futures[fut]
            try:
                fut.result()
            except Exception as e:
                # Keep the f-string on one line: a replacement field broken
                # across lines only parses on Python 3.12+.
                message = f"{type(e).__name__}: {e}"
                _set_status(slug, status="error", last_error=message)

1067
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -1,23 +0,0 @@
{
"devDependencies": {
"tailwindcss": "^4.1.17"
},
"name": "www",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"repository": {
"type": "git",
"url": "https://git.nytegear.com/aargonian/nytegear-mirror-websites.git"
},
"keywords": [],
"author": "",
"license": "ISC",
"type": "commonjs",
"dependencies": {
"@tailwindcss/cli": "^4.1.17"
}
}

19
pyproject.toml Normal file
View File

@@ -0,0 +1,19 @@
[build-system]
requires = ["setuptools>=64", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "mirage"
version = "0.1.0"
description = "Mirror management that's too good to be true."
authors = [{ name = "Aaron Gorodetzky", email = "aaron@nytegear.com" }]
# mirage/config.py imports the stdlib `tomllib` unconditionally, and that
# module only exists on Python 3.11+.
requires-python = ">=3.11"
dependencies = [
"typer[all]>=0.12.0",
]
[project.scripts]
mirage = "mirage.cli:app"
[tool.setuptools]
# An explicit `packages` list is not recursive: every subpackage has to be
# named, otherwise it is left out of the built distribution.
packages = ["mirage", "mirage.commands", "mirage.models"]

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,6 @@
[Unit]
Description=Enqueue periodic updates for Mirage mirrors
[Service]
Type=oneshot
ExecStart=/usr/bin/mirage mirrors update-all

View File

@@ -1,10 +1,11 @@
# systemd/mirage-update.timer
[Unit]
Description=Daily update of offline mirrors
Description=Run Mirage mirror updates periodically
[Timer]
OnCalendar=03:00
Persistent=true
# NOTE(review): this previously pointed at update-mirrors.service, which
# appears to be removed by this change. The new oneshot unit is assumed to
# be installed as mirage-update.service -- confirm the file name.
Unit=mirage-update.service
[Install]
WantedBy=timers.target

15
systemd/mirage.service Normal file
View File

@@ -0,0 +1,15 @@
[Unit]
Description=Mirage mirror daemon
After=network-online.target
Wants=network-online.target
[Service]
Type=simple
User=mirage
Group=mirage
ExecStart=/usr/bin/mirage daemon
Restart=on-failure
RestartSec=5
[Install]
WantedBy=multi-user.target

View File

@@ -1,15 +0,0 @@
[Unit]
Description=Mirror Manager Flask App
After=network.target
[Service]
User=aargonian
Group=aargonian
WorkingDirectory=/srv/www
Environment="FLASK_ENV=production"
ExecStart=/usr/bin/python3 /srv/www/app.py
Restart=on-failure
RestartSec=5
[Install]
WantedBy=multi-user.target

View File

@@ -1,9 +0,0 @@
[Unit]
Description=Update Offline Website Mirrors
[Service]
Type=oneshot
User=aargonian
Group=aargonian
WorkingDirectory=/srv/www
ExecStart=/usr/bin/python3 /srv/www/update_mirrors.py

View File

@@ -1 +0,0 @@
@import "tailwindcss";

View File

@@ -1,16 +0,0 @@
#!/usr/bin/env python3
import sys
from mirror_manager import update_all_mirrors, update_mirror
def main():
    """Update one mirror when exactly one argument is given, otherwise all."""
    args = sys.argv[1:]
    if len(args) == 1:
        update_mirror(args[0])
        return
    # bump max_workers if you're feeling brave / bandwidth-rich
    update_all_mirrors(max_workers=8)
if __name__ == "__main__":
main()