diff --git a/.gitignore b/.gitignore index ad6539f..0ee5b63 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,219 @@ index.html* /mirrors + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[codz] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py.cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +# Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. 
+# uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +# poetry.lock +# poetry.toml + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. +# https://pdm-project.org/en/latest/usage/project/#working-with-version-control +# pdm.lock +# pdm.toml +.pdm-python +.pdm-build/ + +# pixi +# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. +# pixi.lock +# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one +# in the .venv directory. It is recommended not to include this directory in version control. +.pixi + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# Redis +*.rdb +*.aof +*.pid + +# RabbitMQ +mnesia/ +rabbitmq/ +rabbitmq-data/ + +# ActiveMQ +activemq-data/ + +# SageMath parsed files +*.sage.py + +# Environments +.env +.envrc +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. 
def _run_update_in_background(slug: str):
    """Start ``update_mirror(slug)`` on a daemon thread and return at once.

    The thread is fire-and-forget: it is never joined, and nothing is
    returned to the caller.
    """
    worker = threading.Thread(
        target=update_mirror,
        args=(slug,),
        daemon=True,
    )
    worker.start()
+
+
+

Mirror Manager

+
Local archive of external sites, grouped by category.
+
+
+ +
+ +
+
+
+ Categories: + + {% for cat in categories %} + + {% endfor %} +
+
+ +
+
+ + + + + + + + + + + + + + {% for m in mirrors %} + + + + + + + + + {% endfor %} + {% if mirrors|length == 0 %} + + {% endif %} + +
SlugCategoryURLLast updatedStatus
+ + {{ m.slug }} + + {{ m.category }}{{ m.url }} + {% if m.last_updated %} + {{ m.last_updated }} + {% else %} + never + {% endif %} + + {% set st = m.status or 'idle' %} + + {{ st }} + + log +  ·  +
+ +
+
No mirrors yet. Add one on the right.
+
+ + +
+

Add mirror

+
+
+ + +
+
+ + +
+
+ + +
+
+ +
+ {% if error %} +
{{ error }}
+ {% endif %} + +

+ New mirrors are cloned in the background. Status will show as updating until done. +

+
+
+
+
# --- routes ---


def _display_timestamp(raw):
    """Format a stored timestamp for display.

    Replaces the ``T`` separator with a space and a ``Z`` suffix with
    `` UTC``. Returns ``None`` when *raw* is empty or ``None``.
    """
    return raw.replace("T", " ").replace("Z", " UTC") if raw else None


def _mirror_rows(mirrors):
    """Build the per-mirror dicts that ``INDEX_TEMPLATE`` iterates over."""
    return [
        {
            "slug": m["slug"],
            "category": m["category"],
            "url": m["url"],
            "status": m.get("status") or "idle",
            "last_updated_raw": m.get("last_updated"),
            "last_updated": _display_timestamp(m.get("last_updated")),
        }
        for m in mirrors
    ]


def _render_index(error=None):
    """Render the index page from the current mirror list.

    *error*, when given, is passed to the template as ``error``.
    """
    mirrors = load_mirrors()
    categories = sorted({m["category"] for m in mirrors})
    return render_template_string(
        INDEX_TEMPLATE,
        mirrors=_mirror_rows(mirrors),
        categories=categories,
        error=error,
    )


def _validate_new_mirror(slug, category, url):
    """Return an error message for the submitted form, or ``None`` if it is OK."""
    if not slug or not category or not url:
        return "Slug, category, and URL are required."
    if " " in slug:
        return "Slug cannot contain spaces."
    # The slug ends up as a directory name and a log file name, so keep it
    # to a conservative character set and forbid dot-leading names such
    # as "." and "..".
    bare = slug.replace("-", "").replace("_", "").replace(".", "")
    if slug.startswith(".") or not (bare.isascii() and bare.isalnum()):
        return ("Slug may only contain ASCII letters, digits, '-', '_' "
                "and '.', and must not start with '.'.")
    # The URL is later handed to wget on its command line; insisting on a
    # scheme keeps option-like values ("--something") out.
    if not url.lower().startswith(("http://", "https://", "ftp://")):
        return "URL must start with http://, https:// or ftp://."
    return None


@app.route("/", methods=["GET"])
def index():
    """Show the list of mirrors and the "add mirror" form."""
    return _render_index()


@app.route("/add", methods=["POST"])
def add_mirror_route():
    """Handle the "add mirror" form.

    On a validation failure, or when ``add_mirror`` raises, the index page
    is re-rendered with the error message and status 400. On success a
    background update is started and the client is redirected to the index.
    """
    slug = (request.form.get("slug") or "").strip()
    category = (request.form.get("category") or "").strip()
    url = (request.form.get("url") or "").strip()
    ignore_robots = bool(request.form.get("ignore_robots"))

    error = _validate_new_mirror(slug, category, url)
    if error:
        return _render_index(error), 400

    try:
        add_mirror(slug, category, url, ignore_robots=ignore_robots)
    except Exception as e:
        # Request boundary: report the failure (e.g. duplicate slug) on
        # the page instead of letting it surface as a bare 500.
        return _render_index(str(e)), 400

    # kick off background update
    _run_update_in_background(slug)
    return redirect(url_for("index"))


@app.route("/update/<slug>", methods=["POST"])
def trigger_update(slug):
    """Start a background update for *slug* and redirect to the index.

    Responds with 404 when no mirror has that slug, rather than starting
    a thread that would only fail with ``ValueError``.
    """
    if not any(m["slug"] == slug for m in load_mirrors()):
        abort(404)
    # fire-and-forget; UI will see status flip to 'updating'
    _run_update_in_background(slug)
    return redirect(url_for("index"))


@app.route("/status", methods=["GET"])
def status():
    """Return all mirrors as JSON under the ``mirrors`` key."""
    out = [
        {
            "slug": m["slug"],
            "category": m["category"],
            "url": m["url"],
            "status": m.get("status") or "idle",
            "last_updated": m.get("last_updated"),
            "last_updated_display":
                _display_timestamp(m.get("last_updated")) or "",
        }
        for m in load_mirrors()
    ]
    return jsonify({"mirrors": out})
@app.route("/logs/<slug>")
def view_log(slug):
    """Return the log file of mirror *slug* as an HTML ``<pre>`` block.

    Responds with 404 when ``LOG_ROOT/<slug>.log`` is not a regular file.
    The log text is HTML-escaped before being embedded, because it
    contains wget output (URLs, server messages) that this application
    does not control.
    """
    # Local import: the module-level import block is not touched here.
    import html

    log_path = LOG_ROOT / f"{slug}.log"
    if not log_path.is_file():
        abort(404)
    text = log_path.read_text(encoding="utf-8", errors="replace")
    # NOTE(review): any attributes the original <pre> tag carried were lost
    # in extraction; a plain <pre> is used here — confirm.
    return "<pre>" + html.escape(text, quote=False) + "</pre>"
" + + +if __name__ == "__main__": + app.run(host="127.0.0.1", port=5000, debug=False) diff --git a/data/mirrors.json b/data/mirrors.json new file mode 100644 index 0000000..bced335 --- /dev/null +++ b/data/mirrors.json @@ -0,0 +1,12 @@ +[ + { + "slug": "wgpu-tutorial", + "category": "rust", + "url": "https://sotrh.github.io/learn-wgpu/", + "ignore_robots": false, + "created_at": "2025-12-02T07:15:12Z", + "last_updated": null, + "status": "error", + "last_error": "wget exited with 4" + } +] \ No newline at end of file diff --git a/mirror_manager.py b/mirror_manager.py new file mode 100755 index 0000000..6c13b60 --- /dev/null +++ b/mirror_manager.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 +""" +Manage the various mirrors for the mirror website. +""" + +import json +import subprocess +import datetime as dt +from pathlib import Path + +BASE = Path("/srv/www") +DATA_FILE = BASE / "data" / "mirrors.json" +MIRROR_ROOT = BASE / "mirrors" +LOG_ROOT = BASE / "logs" + +MIRROR_ROOT.mkdir(parents=True, exist_ok=True) +LOG_ROOT.mkdir(parents=True, exist_ok=True) +DATA_FILE.parent.mkdir(parents=True, exist_ok=True) + + +def _now_iso() -> str: + return dt.datetime.utcnow().replace(microsecond=0).isoformat() + "Z" + + +def load_mirrors() -> list[dict]: + if not DATA_FILE.exists(): + return [] + with DATA_FILE.open("r", encoding="utf-8") as f: + return json.load(f) + + +def save_mirrors(mirrors: list[dict]) -> None: + tmp = DATA_FILE.with_suffix(".tmp") + with tmp.open("w", encoding="utf-8") as f: + json.dump(mirrors, f, indent=2) + tmp.replace(DATA_FILE) + + +def get_mirror(mirrors: list[dict], slug: str) -> dict | None: + for m in mirrors: + if m["slug"] == slug: + return m + return None + + +def add_mirror(slug: str, + category: str, + url: str, + ignore_robots: bool = False) -> dict: + mirrors = load_mirrors() + if get_mirror(mirrors, slug) is not None: + raise ValueError(f"Mirror with slug '{slug}' already exists!") + + m = { + "slug": slug, + "category": category, + "url": 
def add_mirror(slug: str,
               category: str,
               url: str,
               ignore_robots: bool = False) -> dict:
    """Append a new mirror record to the data file and return it.

    The record starts with status ``"queued"`` and no ``last_updated``.

    Raises:
        ValueError: if *slug* is not usable as a single path component,
            or a mirror with that slug already exists.
    """
    # The slug becomes a directory name under MIRROR_ROOT and a file name
    # under LOG_ROOT, so it must not be able to point anywhere else.
    if (not slug or slug in {".", ".."}
            or any(ch in slug for ch in "/\\\0")):
        raise ValueError(f"Invalid slug: {slug!r}")

    mirrors = load_mirrors()
    if get_mirror(mirrors, slug) is not None:
        raise ValueError(f"Mirror with slug '{slug}' already exists!")

    m = {
        "slug": slug,
        "category": category,
        "url": url,
        "ignore_robots": bool(ignore_robots),
        "created_at": _now_iso(),
        "last_updated": None,
        "status": "queued",
        "last_error": None,
    }

    mirrors.append(m)
    save_mirrors(mirrors)
    return m


def _wget_command(url: str, target_dir, ignore_robots: bool) -> list[str]:
    """Build the wget argument list that mirrors *url* into *target_dir*."""
    robots_setting = "off" if ignore_robots else "on"
    # Polite wget:
    #   --mirror implies -r -N -l inf --no-remove-listing
    return [
        "wget",
        "--mirror",
        "--convert-links",
        "--adjust-extension",
        "--page-requisites",
        "--no-parent",
        "--wait=0.70",
        "--random-wait",
        # "--limit-rate=50m",
        # Must be passed as an option ("-e CMD"); a bare
        # "execute=robots=..." word is taken by wget as another URL.
        "-e",
        f"robots={robots_setting}",
        "-P",
        str(target_dir),
        # End of options: the URL is user-supplied and must never be
        # parsed as a wget flag.
        "--",
        url,
    ]


def _record_outcome(slug: str, fields: dict) -> None:
    """Re-read the mirror list, apply *fields* to *slug*'s entry, and save.

    Reading fresh data here keeps changes made while wget was running
    (for example a mirror added through the web UI) from being
    overwritten by a stale copy. Does nothing if the entry is gone.
    """
    mirrors = load_mirrors()
    entry = get_mirror(mirrors, slug)
    if entry is None:
        return
    entry.update(fields)
    save_mirrors(mirrors)


def _append_log(log_file, message: str) -> None:
    """Append *message* to *log_file*."""
    with log_file.open("a", encoding="utf-8") as lf:
        lf.write(message)


def update_mirror(slug: str) -> None:
    """Run wget mirror for a single slug (blocking).

    Marks the mirror ``"updating"``, runs wget with its output appended to
    ``LOG_ROOT/<slug>.log``, then records the result: ``"idle"`` with a new
    ``last_updated`` on success, or ``"error"`` with ``last_error`` set when
    wget exits non-zero or anything else fails during the run.

    Raises:
        ValueError: if no mirror with *slug* exists.
    """
    mirrors = load_mirrors()
    m = get_mirror(mirrors, slug)
    if m is None:
        raise ValueError(f"No such mirror: {slug}")

    m["status"] = "updating"
    m["last_error"] = None
    save_mirrors(mirrors)

    url = m["url"]
    target_dir = MIRROR_ROOT / slug
    log_file = LOG_ROOT / f"{slug}.log"
    cmd = _wget_command(url, target_dir, bool(m.get("ignore_robots")))

    # Stays None only if something outside Exception (e.g. an interrupt)
    # ends the run; in that case nothing further is written.
    outcome = None
    try:
        # Inside the try so a failure here is recorded as an error instead
        # of leaving the status stuck at "updating".
        target_dir.mkdir(parents=True, exist_ok=True)
        with log_file.open("a", encoding="utf-8") as lf:
            lf.write(f"\n=== {_now_iso()} : "
                     f"Starting mirror of {url} ===\n")
            # Flush before wget writes to the same file descriptor so the
            # header lands ahead of wget's output.
            lf.flush()
            subprocess.run(
                cmd,
                stdout=lf,
                stderr=subprocess.STDOUT,
                check=True,
            )
            lf.write(f"=== {_now_iso()} : Completed mirror of {url} ===\n")
    except subprocess.CalledProcessError as e:
        outcome = {
            "status": "error",
            "last_error": f"wget exited with {e.returncode}",
        }
        _append_log(log_file,
                    f"*** ERROR: wget failed with code {e.returncode}\n")
    except Exception as e:
        outcome = {
            "status": "error",
            "last_error": f"{type(e).__name__}: {e}",
        }
        _append_log(log_file, f"*** ERROR: {type(e).__name__}: {e}\n")
    else:
        outcome = {
            "last_updated": _now_iso(),
            "status": "idle",
            "last_error": None,
        }
    finally:
        # In a finally so the status is still saved when writing the
        # error line to the log itself fails.
        if outcome is not None:
            _record_outcome(slug, outcome)


def update_all_mirrors() -> None:
    """Update every mirror in the data file, one after another."""
    for m in load_mirrors():
        update_mirror(m["slug"])
def main():
    """Command-line entry point.

    With exactly one argument, update only the mirror with that slug;
    with any other number of arguments, update every mirror.
    """
    args = sys.argv[1:]
    if len(args) == 1:
        update_mirror(args[0])
        return
    update_all_mirrors()