Significant improvement on the app

This commit is contained in:
2025-12-02 02:29:02 -05:00
parent 9cea410c18
commit e817265e8a
7 changed files with 905 additions and 45 deletions

217
.gitignore vendored
View File

@@ -1,2 +1,219 @@
index.html*
/mirrors
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[codz]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py.cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
# Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
# poetry.lock
# poetry.toml
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
# pdm.lock
# pdm.toml
.pdm-python
.pdm-build/
# pixi
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
# pixi.lock
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
# in the .venv directory. It is recommended not to include this directory in version control.
.pixi
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# Redis
*.rdb
*.aof
*.pid
# RabbitMQ
mnesia/
rabbitmq/
rabbitmq-data/
# ActiveMQ
activemq-data/
# SageMath parsed files
*.sage.py
# Environments
.env
.envrc
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
# .idea/
# Abstra
# Abstra is an AI-powered process automation framework.
# Ignore directories containing user credentials, local state, and settings.
# Learn more at https://abstra.io/docs
.abstra/
# Visual Studio Code
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
# and can be added to the global gitignore or merged into this file. However, if you prefer,
# you could uncomment the following to ignore the entire vscode folder
# .vscode/
# Ruff stuff:
.ruff_cache/
# PyPI configuration file
.pypirc
# Marimo
marimo/_static/
marimo/_lsp/
__marimo__/
# Streamlit
.streamlit/secrets.toml

523
app.py Executable file
View File

@@ -0,0 +1,523 @@
#!/usr/bin/env python3
from flask import Flask, request, redirect, url_for, jsonify, render_template_string, abort
import threading
from mirror_manager import (
load_mirrors,
add_mirror,
update_mirror,
LOG_ROOT,
)
app = Flask(__name__)
# --- background update helper ---
def _run_update_in_background(slug: str):
    """Start ``update_mirror(slug)`` on a daemon thread and return immediately.

    The thread is neither joined nor tracked by the caller.
    """
    threading.Thread(
        target=update_mirror,
        args=(slug,),
        daemon=True,
    ).start()
# --- templates ---
INDEX_TEMPLATE = r"""
<!doctype html>
<html>
<head>
<meta charset="utf-8">
<title>Mirror Manager</title>
<style>
:root {
color-scheme: dark light;
}
body {
font-family: system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
margin: 0;
padding: 0;
background: #0f172a;
color: #e5e7eb;
}
main {
max-width: 1100px;
margin: 2rem auto;
padding: 0 1rem 3rem;
}
header {
display: flex;
flex-wrap: wrap;
align-items: baseline;
gap: 0.5rem 1rem;
justify-content: space-between;
margin-bottom: 1.5rem;
}
h1 {
font-size: 1.75rem;
margin: 0;
}
.subtitle { color: #9ca3af; font-size: 0.9rem; }
.card {
background: #020617;
border-radius: 0.75rem;
padding: 1rem 1.2rem;
box-shadow: 0 10px 30px rgba(0,0,0,0.4);
border: 1px solid #1f2937;
}
.grid {
display: grid;
grid-template-columns: minmax(0, 2fr) minmax(0, 3fr);
gap: 1rem;
align-items: flex-start;
}
@media (max-width: 900px) {
.grid {
grid-template-columns: minmax(0, 1fr);
}
}
label {
display: block;
font-size: 0.8rem;
text-transform: uppercase;
letter-spacing: 0.05em;
color: #9ca3af;
margin-bottom: 0.25rem;
}
input[type=text], select {
width: 100%;
padding: 0.4rem 0.5rem;
border-radius: 0.5rem;
border: 1px solid #374151;
background: #020617;
color: #e5e7eb;
font-size: 0.9rem;
}
input[type=text]:focus, select:focus {
outline: none;
border-color: #3b82f6;
box-shadow: 0 0 0 1px #3b82f6;
}
.btn {
display: inline-flex;
align-items: center;
justify-content: center;
gap: 0.4rem;
padding: 0.5rem 0.9rem;
border-radius: 999px;
border: none;
cursor: pointer;
font-size: 0.9rem;
font-weight: 500;
}
.btn-primary {
background: linear-gradient(135deg, #3b82f6, #8b5cf6);
color: white;
}
.btn-secondary {
background: transparent;
border: 1px solid #374151;
color: #e5e7eb;
}
.btn[disabled] {
opacity: 0.5;
cursor: default;
}
.toolbar {
display: flex;
flex-wrap: wrap;
gap: 0.5rem;
margin-bottom: 0.75rem;
align-items: center;
justify-content: space-between;
}
.toolbar-left, .toolbar-right {
display: flex;
flex-wrap: wrap;
gap: 0.5rem;
align-items: center;
}
.pill {
font-size: 0.8rem;
padding: 0.25rem 0.6rem;
border-radius: 999px;
border: 1px solid #374151;
background: #020617;
cursor: pointer;
}
.pill.active {
background: #3b82f6;
border-color: #3b82f6;
color: white;
}
table {
width: 100%;
border-collapse: collapse;
font-size: 0.9rem;
}
th, td {
padding: 0.45rem 0.5rem;
text-align: left;
border-bottom: 1px solid #111827;
vertical-align: middle;
}
th {
font-size: 0.75rem;
text-transform: uppercase;
letter-spacing: 0.05em;
color: #9ca3af;
}
tr:hover td {
background: rgba(31,41,55,0.6);
}
code {
font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
font-size: 0.8rem;
}
.badge {
font-size: 0.75rem;
padding: 0.1rem 0.5rem;
border-radius: 999px;
text-transform: uppercase;
letter-spacing: 0.06em;
}
.badge-idle { background: #065f46; color: #a7f3d0; }
.badge-updating { background: #92400e; color: #fed7aa; }
.badge-error { background: #7f1d1d; color: #fecaca; }
.badge-queued { background: #1f2937; color: #e5e7eb; }
.status-dot {
width: 0.6rem;
height: 0.6rem;
border-radius: 999px;
display: inline-block;
margin-right: 0.3rem;
}
.status-idle { background: #22c55e; }
.status-updating { background: #f97316; animation: pulse 1.2s infinite; }
.status-error { background: #ef4444; }
.status-queued { background: #6b7280; }
@keyframes pulse {
0% { transform: scale(1); opacity: 1; }
50% { transform: scale(1.25); opacity: 0.7; }
100% { transform: scale(1); opacity: 1; }
}
.log-link {
font-size: 0.8rem;
color: #93c5fd;
text-decoration: none;
}
.log-link:hover {
text-decoration: underline;
}
.muted { color: #6b7280; font-size: 0.8rem; }
.search-input {
min-width: 220px;
}
</style>
</head>
<body>
<main>
<header>
<div>
<h1>Mirror Manager</h1>
<div class="subtitle">Local archive of external sites, grouped by category.</div>
</div>
</header>
<div class="grid">
<!-- Left: mirror list -->
<section class="card">
<div class="toolbar">
<div class="toolbar-left">
<span class="muted">Categories:</span>
<button class="pill active" data-category="all">All ({{ mirrors|length }})</button>
{% for cat in categories %}
<button class="pill" data-category="{{ cat }}">{{ cat }}</button>
{% endfor %}
</div>
<div class="toolbar-right">
<input type="text" id="search" class="search-input" placeholder="Search slug / URL / category…">
</div>
</div>
<table id="mirror-table">
<thead>
<tr>
<th>Slug</th>
<th>Category</th>
<th>URL</th>
<th>Last updated</th>
<th>Status</th>
<th></th>
</tr>
</thead>
<tbody>
{% for m in mirrors %}
<tr data-slug="{{ m.slug }}" data-category="{{ m.category }}" data-search="{{ (m.slug ~ ' ' ~ m.category ~ ' ' ~ m.url)|lower }}">
<td>
<a href="/mirrors/{{ m.slug }}/" target="_blank">
<code>{{ m.slug }}</code>
</a>
</td>
<td>{{ m.category }}</td>
<td><code>{{ m.url }}</code></td>
<td>
{% if m.last_updated %}
<span title="{{ m.last_updated_raw }}">{{ m.last_updated }}</span>
{% else %}
<span class="muted">never</span>
{% endif %}
</td>
<td>
{% set st = m.status or 'idle' %}
<span class="status-dot status-{{ st }}"></span>
<span class="badge badge-{{ st }}">{{ st }}</span>
</td>
<td>
<a class="log-link" href="{{ url_for('view_log', slug=m.slug) }}" target="_blank">log</a>
&nbsp;·&nbsp;
<form method="post" action="{{ url_for('trigger_update', slug=m.slug) }}" style="display:inline;">
<button class="btn btn-secondary" style="padding:0.2rem 0.6rem; font-size:0.75rem;">Update</button>
</form>
</td>
</tr>
{% endfor %}
{% if mirrors|length == 0 %}
<tr><td colspan="6" class="muted">No mirrors yet. Add one on the right.</td></tr>
{% endif %}
</tbody>
</table>
</section>
<!-- Right: add mirror -->
<section class="card">
<h2 style="margin-top:0; font-size:1.1rem;">Add mirror</h2>
<form method="post" action="{{ url_for('add_mirror_route') }}">
<div style="margin-bottom:0.6rem;">
<label for="slug">Slug</label>
<input type="text" id="slug" name="slug" required placeholder="e.g. python_tutorial">
</div>
<div style="margin-bottom:0.6rem;">
<label for="category">Category</label>
<input type="text" id="category" name="category" required placeholder="e.g. tutorial, docs, blog">
</div>
<div style="margin-bottom:0.6rem;">
<label for="url">URL</label>
<input type="text" id="url" name="url" required placeholder="https://example.com/some/path/">
</div>
<div style="margin-bottom:0.8rem;">
<label style="display:flex; align-items:center; gap:0.4rem;">
<input type="checkbox" name="ignore_robots" value="1">
<span style="text-transform:none; letter-spacing:0; font-size:0.85rem;">
Ignore robots.txt (not recommended unless you know you need it)
</span>
</label>
</div>
{% if error %}
<div style="color:#fecaca; font-size:0.85rem; margin-bottom:0.5rem;">{{ error }}</div>
{% endif %}
<button type="submit" class="btn btn-primary">Add &amp; mirror</button>
<p class="muted" style="margin-top:0.5rem;">
New mirrors are cloned in the background. Status will show as <strong>updating</strong> until done.
</p>
</form>
</section>
</div>
</main>
<script>
// category filter
const pills = Array.from(document.querySelectorAll('.pill'));
const rows = Array.from(document.querySelectorAll('#mirror-table tbody tr'));
const searchInput = document.getElementById('search');
// Show only the table rows that match both the active category pill and
// the text typed into the search box.
function applyFilters() {
  const activePill = pills.find(p => p.classList.contains('active'));
  const cat = activePill ? activePill.dataset.category : 'all';
  const q = (searchInput.value || '').toLowerCase();
  rows.forEach(row => {
    // The "No mirrors yet" placeholder row carries no data attributes.
    // Calling includes() on its missing data-search value used to throw
    // as soon as something was typed, so leave that row untouched.
    if (row.dataset.slug === undefined) return;
    const matchCat = (cat === 'all' || row.dataset.category === cat);
    const matchSearch = (!q || (row.dataset.search || '').includes(q));
    row.style.display = (matchCat && matchSearch) ? '' : 'none';
  });
}
pills.forEach(p => {
p.addEventListener('click', () => {
pills.forEach(x => x.classList.remove('active'));
p.classList.add('active');
applyFilters();
});
});
searchInput.addEventListener('input', () => {
applyFilters();
});
// polling for live status
// Fetch the JSON status feed and refresh the "Last updated" and "Status"
// cells of every row whose slug appears in the response.
async function pollStatus() {
  // Replace the contents of a cell with the given nodes.
  const setCell = (cell, nodes) => {
    cell.textContent = '';
    nodes.forEach(node => cell.appendChild(node));
  };
  try {
    const resp = await fetch("{{ url_for('status') }}");
    if (!resp.ok) return;
    const data = await resp.json();
    // A Map avoids accidental hits on inherited object keys such as
    // "constructor" when a slug happens to have that name.
    const bySlug = new Map(data.mirrors.map(m => [m.slug, m]));
    rows.forEach(row => {
      const m = bySlug.get(row.dataset.slug);
      if (!m) return;
      const tds = row.querySelectorAll('td');

      // Last updated: same markup as the server-rendered cell. Values are
      // assigned through textContent / properties, never parsed as HTML.
      const stamp = document.createElement('span');
      if (m.last_updated_display) {
        stamp.textContent = m.last_updated_display;
        if (m.last_updated) stamp.title = m.last_updated;
      } else {
        stamp.className = 'muted';
        stamp.textContent = 'never';
      }
      setCell(tds[3], [stamp]);

      // Status dot + badge.
      const st = m.status || 'idle';
      const dot = document.createElement('span');
      dot.className = 'status-dot status-' + st;
      const badge = document.createElement('span');
      badge.className = 'badge badge-' + st;
      badge.textContent = st;
      setCell(tds[4], [dot, badge]);
    });
  } catch (e) {
    // Polling is best-effort: a failed request is retried on the next tick.
  }
}
setInterval(pollStatus, 5000);
</script>
</body>
</html>
"""
# --- routes ---
@app.route("/", methods=["GET"])
def index():
    """Render the dashboard listing every stored mirror."""
    mirrors = load_mirrors()
    categories = sorted({entry["category"] for entry in mirrors})

    def _as_row(entry):
        stamp = entry.get("last_updated")
        # e.g. "2025-12-02T07:15:12Z" -> "2025-12-02 07:15:12 UTC"
        pretty = stamp.replace("T", " ").replace("Z", " UTC") if stamp else None
        return {
            "slug": entry["slug"],
            "category": entry["category"],
            "url": entry["url"],
            "status": entry.get("status") or "idle",
            "last_updated_raw": stamp,
            "last_updated": pretty,
        }

    return render_template_string(
        INDEX_TEMPLATE,
        mirrors=[_as_row(entry) for entry in mirrors],
        categories=categories,
        error=None,
    )
def _index_rows(mirrors):
    """Convert stored mirror records into the row dicts INDEX_TEMPLATE renders."""
    rows = []
    for m in mirrors:
        raw = m.get("last_updated")
        rows.append({
            "slug": m["slug"],
            "category": m["category"],
            "url": m["url"],
            "status": m.get("status") or "idle",
            "last_updated_raw": raw,
            "last_updated": raw.replace("T", " ").replace("Z", " UTC") if raw else None,
        })
    return rows


def _render_index_with_error(message):
    """Re-render the dashboard with *message* shown in the form, as a 400 response."""
    mirrors = load_mirrors()
    categories = sorted({m["category"] for m in mirrors})
    page = render_template_string(
        INDEX_TEMPLATE,
        mirrors=_index_rows(mirrors),
        categories=categories,
        error=message,
    )
    return page, 400


def _validate_new_mirror(slug, category, url):
    """Return an error message for unacceptable form input, or None if it is fine."""
    if not slug or not category or not url:
        return "Slug, category, and URL are required."
    if " " in slug:
        return "Slug cannot contain spaces."
    # The slug becomes a directory name under the mirror root and a log file
    # name, so it must not be able to escape those directories ("../x").
    slug_ok = (
        slug.isascii()
        and slug[0].isalnum()
        and all(ch.isalnum() or ch in "-_." for ch in slug)
    )
    if not slug_ok:
        return ("Slug may only contain letters, digits, '-', '_' and '.', "
                "and must start with a letter or digit.")
    # The URL is handed to wget as a command-line argument; requiring an
    # explicit scheme keeps a value such as "--output-file=..." from being
    # read as an option.
    if not url.lower().startswith(("http://", "https://")):
        return "URL must start with http:// or https://."
    return None


@app.route("/add", methods=["POST"])
def add_mirror_route():
    """Handle the "Add mirror" form.

    Validates the submitted slug, category and URL, stores the new mirror and
    starts its first download in the background.

    Returns:
        A redirect to the index on success, or the index page with an error
        message and status 400 when validation or storing fails.
    """
    slug = (request.form.get("slug") or "").strip()
    category = (request.form.get("category") or "").strip()
    url = (request.form.get("url") or "").strip()
    ignore_robots = bool(request.form.get("ignore_robots"))

    error = _validate_new_mirror(slug, category, url)
    if error:
        return _render_index_with_error(error)

    try:
        add_mirror(slug, category, url, ignore_robots=ignore_robots)
    except Exception as e:
        # Route boundary: show the failure (e.g. duplicate slug) in the form
        # instead of answering with a bare 500.
        return _render_index_with_error(str(e))

    # kick off background update
    _run_update_in_background(slug)
    return redirect(url_for("index"))
@app.route("/update/<slug>", methods=["POST"])
def trigger_update(slug):
    """Start a background update of the mirror *slug* and return to the index.

    Responds with 404 when no mirror with that slug is stored. Without this
    check, update_mirror would raise ValueError inside the worker thread
    while the caller was redirected as if the update had started.
    """
    if all(m["slug"] != slug for m in load_mirrors()):
        abort(404)
    # fire-and-forget; UI will see status flip to 'updating'
    _run_update_in_background(slug)
    return redirect(url_for("index"))
@app.route("/status", methods=["GET"])
def status():
    """Return the state of every mirror as JSON under the key "mirrors"."""

    def _describe(entry):
        stamp = entry.get("last_updated")
        shown = stamp.replace("T", " ").replace("Z", " UTC") if stamp else ""
        return {
            "slug": entry["slug"],
            "category": entry["category"],
            "url": entry["url"],
            "status": entry.get("status") or "idle",
            "last_updated": stamp,
            "last_updated_display": shown,
        }

    return jsonify({"mirrors": [_describe(entry) for entry in load_mirrors()]})
@app.route("/logs/<slug>")
def view_log(slug):
    """Return the log file of *slug* wrapped in ``<pre>``; 404 if it does not exist."""
    log_path = LOG_ROOT / f"{slug}.log"
    if not log_path.exists():
        abort(404)
    with log_path.open("r", encoding="utf-8", errors="replace") as fh:
        raw = fh.read()
    # Escape "&" first so the entity produced for "<" is not escaped again.
    escaped = raw.replace("&", "&amp;")
    escaped = escaped.replace("<", "&lt;")
    return f"<pre>{escaped}</pre>"
if __name__ == "__main__":
app.run(host="127.0.0.1", port=5000, debug=False)

12
data/mirrors.json Normal file
View File

@@ -0,0 +1,12 @@
[
{
"slug": "wgpu-tutorial",
"category": "rust",
"url": "https://sotrh.github.io/learn-wgpu/",
"ignore_robots": false,
"created_at": "2025-12-02T07:15:12Z",
"last_updated": null,
"status": "error",
"last_error": "wget exited with 4"
}
]

138
mirror_manager.py Executable file
View File

@@ -0,0 +1,138 @@
#!/usr/bin/env python3
"""
Manage the various mirrors for the mirror website.
"""
import json
import subprocess
import datetime as dt
from pathlib import Path
BASE = Path("/srv/www")
DATA_FILE = BASE / "data" / "mirrors.json"
MIRROR_ROOT = BASE / "mirrors"
LOG_ROOT = BASE / "logs"
MIRROR_ROOT.mkdir(parents=True, exist_ok=True)
LOG_ROOT.mkdir(parents=True, exist_ok=True)
DATA_FILE.parent.mkdir(parents=True, exist_ok=True)
def _now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (whole seconds)."""
    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # timestamp and format it explicitly so the output keeps its "Z" suffix.
    return dt.datetime.now(dt.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
def load_mirrors() -> list[dict]:
    """Return the parsed JSON content of DATA_FILE, or ``[]`` if the file is absent."""
    if DATA_FILE.exists():
        return json.loads(DATA_FILE.read_text(encoding="utf-8"))
    return []
def save_mirrors(mirrors: list[dict]) -> None:
    """Replace DATA_FILE with *mirrors* serialised as indented JSON.

    The data is written to a temporary file next to DATA_FILE and then moved
    over it, so readers never see a half-written file. The temporary name
    includes the process and thread id: the web app runs updates on several
    threads, and a single shared ``.tmp`` name would let two writers
    interleave their output. The temporary file is removed if writing fails.
    """
    import os
    import threading

    tmp = DATA_FILE.with_name(
        f"{DATA_FILE.stem}.{os.getpid()}.{threading.get_ident()}.tmp"
    )
    try:
        with tmp.open("w", encoding="utf-8") as f:
            json.dump(mirrors, f, indent=2)
        tmp.replace(DATA_FILE)
    finally:
        # No-op after a successful replace; cleans up after a failed dump.
        tmp.unlink(missing_ok=True)
def get_mirror(mirrors: list[dict], slug: str) -> dict | None:
for m in mirrors:
if m["slug"] == slug:
return m
return None
def add_mirror(slug: str,
               category: str,
               url: str,
               ignore_robots: bool = False) -> dict:
    """Append a new mirror record to the stored list and return it.

    The record starts with status "queued" and no last_updated / last_error.

    Raises:
        ValueError: if a mirror with the same slug is already stored.
    """
    existing = load_mirrors()
    if any(entry["slug"] == slug for entry in existing):
        raise ValueError(f"Mirror with slug '{slug}' already exists!")

    record = dict(
        slug=slug,
        category=category,
        url=url,
        ignore_robots=bool(ignore_robots),
        created_at=_now_iso(),
        last_updated=None,
        status="queued",
        last_error=None,
    )
    save_mirrors([*existing, record])
    return record
def _record_mirror_state(slug: str, **fields) -> None:
    """Reload the stored list and overwrite *fields* on the entry for *slug*.

    Re-reading right before writing keeps a long-running update from writing
    back a stale copy of the list and discarding changes made meanwhile
    (mirrors added, other mirrors' status). Does nothing if the entry has
    been removed in the meantime.
    """
    mirrors = load_mirrors()
    entry = get_mirror(mirrors, slug)
    if entry is None:
        return
    entry.update(fields)
    save_mirrors(mirrors)


def update_mirror(slug: str) -> None:
    """Run the wget mirror for a single slug (blocking).

    Marks the mirror "updating", runs wget with its output appended to the
    mirror's log file, then records "idle" plus a fresh last_updated on
    success, or "error" plus last_error on failure.

    Raises:
        ValueError: if no mirror with this slug is stored.
    """
    m = get_mirror(load_mirrors(), slug)
    if m is None:
        raise ValueError(f"No such mirror: {slug}")
    _record_mirror_state(slug, status="updating", last_error=None)

    url = m["url"]
    target_dir = MIRROR_ROOT / slug
    log_file = LOG_ROOT / f"{slug}.log"
    robots_setting = "off" if m.get("ignore_robots") else "on"
    # Polite wget:
    # --mirror implies -r -N -l inf --no-remove-listing
    cmd = [
        "wget",
        "--mirror",
        "--convert-links",
        "--adjust-extension",
        "--page-requisites",
        "--no-parent",
        "--wait=0.70",
        "--random-wait",
        # Must be spelled as an option: a bare "execute=robots=..." is taken
        # by wget as a second URL to download, which fails the whole run.
        f"--execute=robots={robots_setting}",
        "-P",
        str(target_dir),
        # End of options, so the stored URL can never be parsed as a flag.
        "--",
        url,
    ]
    # Recorded as-is only if the run is cut short by something the handlers
    # below do not catch (e.g. KeyboardInterrupt), so the status does not
    # stay "updating" forever.
    outcome = {"status": "error", "last_error": "update interrupted"}
    try:
        target_dir.mkdir(parents=True, exist_ok=True)
        with log_file.open("a", encoding="utf-8") as lf:
            lf.write(f"\n=== {_now_iso()} : "
                     f"Starting mirror of {url} ===\n")
            # Flush so the header precedes wget's own output in the file.
            lf.flush()
            subprocess.run(
                cmd,
                stdout=lf,
                stderr=subprocess.STDOUT,
                check=True,
            )
            lf.write(f"=== {_now_iso()} : Completed mirror of {url} ===\n")
        outcome = {
            "status": "idle",
            "last_error": None,
            "last_updated": _now_iso(),
        }
    except subprocess.CalledProcessError as e:
        outcome["last_error"] = f"wget exited with {e.returncode}"
        with log_file.open("a", encoding="utf-8") as lf:
            lf.write(f"*** ERROR: wget failed with code {e.returncode}\n")
    except Exception as e:
        outcome["last_error"] = f"{type(e).__name__}: {e}"
        with log_file.open("a", encoding="utf-8") as lf:
            lf.write(f"*** ERROR: {type(e).__name__}: {e}\n")
    finally:
        _record_mirror_state(slug, **outcome)
def update_all_mirrors() -> None:
    """Update every stored mirror one after another, in stored order (blocking)."""
    for slug in (entry["slug"] for entry in load_mirrors()):
        update_mirror(slug)

View File

@@ -1,2 +0,0 @@
# Slug URL
wgpu-tutorial https://sotrh.github.io/learn-wgpu/

15
update_mirrors.py Executable file
View File

@@ -0,0 +1,15 @@
#!/usr/bin/env python3
import sys
from mirror_manager import update_all_mirrors, update_mirror
def main():
    """Update the mirrors named on the command line, or all mirrors if none are named.

    Any number of slugs may be given. Previously two or more arguments
    silently fell through to updating every mirror. A ValueError from
    update_mirror (raised for an unknown slug) ends the run with the message
    on stderr and a non-zero exit status instead of a traceback.
    """
    slugs = sys.argv[1:]
    if not slugs:
        update_all_mirrors()
        return
    for slug in slugs:
        try:
            update_mirror(slug)
        except ValueError as exc:
            sys.exit(f"error: {exc}")
if __name__ == "__main__":
main()

View File

@@ -1,43 +0,0 @@
#!/usr/bin/env bash
# Mirror every site listed in $BASE/mirrors.txt ("slug url" per line) into
# $BASE/mirrors/<slug>, then regenerate the index page.
# Usage: pass a slug as the first argument to update only that mirror.
# Exits non-zero if any mirror failed, after all others have been attempted.
set -euo pipefail

BASE="/srv/www"
URL_LIST="$BASE/mirrors.txt"
OUTDIR="$BASE/mirrors"

mkdir -p "$OUTDIR"

# If a slug is passed as an argument, only update that one.
ONLY_SLUG="${1:-}"

failed=0

# The extra test keeps a final line that has no trailing newline.
while read -r slug url || [ -n "${slug:-}" ]; do
    # skip empty lines & comments
    [ -z "${slug:-}" ] && continue
    [[ "$slug" =~ ^# ]] && continue

    if [ -n "$ONLY_SLUG" ] && [ "$slug" != "$ONLY_SLUG" ]; then
        continue
    fi

    if [ -z "${url:-}" ]; then
        echo "!!! Skipping $slug: no URL given" >&2
        failed=1
        continue
    fi

    echo "=== Mirroring $slug ($url) ==="

    # Each mirror in its own directory
    TARGET_DIR="$OUTDIR/$slug"
    mkdir -p "$TARGET_DIR"

    # Mirror site. wget exits non-zero even for a single broken link; test
    # its status here so "set -e" does not abort the remaining mirrors and
    # the index regeneration. The subshell keeps the cd local to this run.
    if (
        cd "$TARGET_DIR" &&
        wget \
            --mirror \
            --convert-links \
            --adjust-extension \
            --page-requisites \
            --no-parent \
            "$url"
    ); then
        echo "=== Done $slug ==="
    else
        echo "!!! wget failed for $slug (exit $?)" >&2
        failed=1
    fi
done < "$URL_LIST"

# Regenerate index page
cd "$BASE"
python3 "$BASE/generate_index.py"

exit "$failed"