True search fix

This commit is contained in:
2025-12-02 03:17:49 -05:00
parent 5ac1549567
commit ab8c6be6e5

88
app.py
View File

@@ -6,6 +6,8 @@ from mirror_manager import (
MIRROR_ROOT,
LOG_ROOT,
)
import re
import html
import subprocess
import threading
from pathlib import Path
@@ -509,41 +511,67 @@ def log_tail(slug):
return "", 200
def strip_html(text: str) -> str:
    """Reduce an HTML fragment to plain, single-spaced text.

    Processing order:
      1. ``<script>`` and ``<style>`` elements are dropped together with
         their contents (tag names matched case-insensitively).
      2. Every remaining ``<...>`` tag is replaced by a space.
      3. HTML entities are decoded (``&amp;`` becomes ``&`` and so on).
      4. Runs of whitespace collapse to one space and the ends are trimmed.

    Entities are decoded only after tags are removed, so an escaped
    ``&lt;b&gt;`` survives as the literal text ``<b>``.
    """
    # Elements whose body is code or CSS rather than readable text.
    element_patterns = (
        r"<script\b[^<]*(?:(?!</script>)<[^<]*)*</script>",
        r"<style\b[^<]*(?:(?!</style>)<[^<]*)*</style>",
    )
    for element_pattern in element_patterns:
        text = re.sub(element_pattern, " ", text, flags=re.IGNORECASE)

    # Replace each leftover tag with a space so adjacent words stay apart.
    untagged = re.sub(r"<[^>]+>", " ", text)
    decoded = html.unescape(untagged)
    return re.sub(r"\s+", " ", decoded).strip()
def make_snippet(text: str,
                 query: str,
                 radius: int = 80,
                 max_len: int = 240) -> str:
    """Return a short excerpt of *text* around the first match of *query*.

    The query is matched literally (regex metacharacters are escaped) and
    case-insensitively.

    Args:
        text: Plain text to excerpt.
        query: Search term to centre the excerpt on.
        radius: Number of characters of context kept on each side of the
            match.
        max_len: Upper bound on the excerpt length, not counting the
            ellipsis markers.

    Returns:
        ``""`` for empty *text*. If *query* does not occur, the first
        *max_len* characters of *text*. Otherwise the match plus *radius*
        characters of context on each side, capped at *max_len*. An
        ellipsis is added on every side where text was cut off.
    """
    if not text:
        return ""

    ellipsis = "\u2026"  # marks a side on which the excerpt was cut

    # Search the original string rather than text.lower(): lowercasing can
    # change the string's length (e.g. U+0130), which would make the match
    # offset invalid for slicing *text*.
    match = re.search(re.escape(query), text, flags=re.IGNORECASE)
    if match is None:
        head = text[:max_len]
        return head + ellipsis if len(text) > max_len else head

    start = max(0, match.start() - radius)
    # Honour max_len even when the query plus context would exceed it.
    end = min(len(text), match.end() + radius, start + max_len)

    prefix = ellipsis if start > 0 else ""
    suffix = ellipsis if end < len(text) else ""
    return f"{prefix}{text[start:end]}{suffix}"
@app.route("/search", methods=["GET"])
def content_search():
q = (request.args.get("q") or "").strip()
if not q:
return jsonify({"results": []})
def make_snippet(text: str, query: str, radius: int = 80, max_len: int = 240) -> str:
    """Return a short excerpt of *text* around the first match of *query*.

    The query is matched literally and case-insensitively. If it does not
    occur, the first *max_len* characters of *text* are returned. Otherwise
    the match is returned with *radius* characters of context on each side,
    capped at *max_len* characters. An ellipsis is added on every side where
    text was cut off (the ellipsis does not count toward *max_len*). Empty
    *text* yields ``""``.
    """
    if not text:
        return ""

    ellipsis = "\u2026"  # marks a side on which the excerpt was cut

    # Match against the original string: offsets taken from text.lower()
    # are not reliable because lowercasing can change the string's length.
    found = re.search(re.escape(query), text, flags=re.IGNORECASE)
    if found is None:
        snippet = text[:max_len]
        if len(text) > max_len:
            snippet += ellipsis
        return snippet

    start = max(0, found.start() - radius)
    # Keep the window within max_len even for long queries.
    end = min(len(text), found.end() + radius, start + max_len)
    snippet = text[start:end]
    if start > 0:
        snippet = ellipsis + snippet
    if end < len(text):
        snippet += ellipsis
    return snippet
try:
# Only search "page-like" files: html / md / txt
proc = subprocess.run(
[
"rg",
"--line-number",
"--no-heading",
"--color", "never",
"--max-count", "5", # max 5 hits per file
"--max-count", "5", # per file
"--type-add", "page:*.{html,htm,md,markdown,txt}",
"-tpage",
q,
@@ -578,18 +606,20 @@ def content_search():
parts = line.split(":", 2)
if len(parts) != 3:
continue
path, lineno, content = parts
path, lineno, raw_content = parts
# Strip HTML/JS/CSS markup from this line before making a snippet
text_content = strip_html(raw_content)
if not text_content:
continue
snippet = make_snippet(text_content, q)
try:
rel_path = str(Path(path).relative_to(MIRROR_ROOT))
except ValueError:
# Shouldn't happen, but be defensive
rel_path = path
# Short text snippet around the query
snippet = make_snippet(content, q)
# Build a URL that opens the mirrored page in the browser
# Assuming nginx serves /srv/www/mirrors as /mirrors/
url = "/mirrors/" + rel_path.replace("\\", "/")
results.append({