True search fix

2025-12-02 03:17:49 -05:00
parent 5ac1549567
commit ab8c6be6e5
1 changed files with 59 additions and 29 deletions
--- a/app.py
+++ b/app.py
@@ -6,6 +6,8 @@ from mirror_manager import (
    MIRROR_ROOT,
    LOG_ROOT,
 )
+import re
+import html
 import subprocess
 import threading
 from pathlib import Path
@@ -509,41 +511,67 @@ def log_tail(slug):
        return "", 200


+def strip_html(text: str) -> str:
+    """Reduce a fragment of HTML to plain, single-spaced text.
+
+    ``<script>`` and ``<style>`` elements are dropped together with their
+    contents, every remaining tag is replaced by a space, HTML entities are
+    decoded, and runs of whitespace are collapsed to a single space.
+
+    Args:
+        text: Raw markup to clean.
+
+    Returns:
+        The remaining text without leading or trailing whitespace; an empty
+        string when nothing but markup or whitespace was present.
+    """
+    # Elements whose *contents* must disappear too, not just their tags.
+    # They have to be handled before the generic tag pass below.
+    block_patterns = (
+        r"<script\b[^<]*(?:(?!</script>)<[^<]*)*</script>",
+        r"<style\b[^<]*(?:(?!</style>)<[^<]*)*</style>",
+    )
+    cleaned = text
+    for pattern in block_patterns:
+        cleaned = re.sub(pattern, " ", cleaned, flags=re.IGNORECASE)
+
+    # Every tag that is left becomes a space so neighbouring words stay apart.
+    without_tags = re.sub(r"<[^>]+>", " ", cleaned)
+
+    # Entities are decoded only after the tags are gone, so an escaped
+    # "&lt;b&gt;" survives as literal text instead of being stripped as a tag.
+    decoded = html.unescape(without_tags)
+
+    return re.sub(r"\s+", " ", decoded).strip()
+
+
+def make_snippet(text: str,
+                 query: str,
+                 radius: int = 80,
+                 max_len: int = 240) -> str:
+    """Return a short excerpt of *text* centred on the first hit for *query*.
+
+    The query is matched literally and case-insensitively. When it is found,
+    the excerpt spans up to ``radius`` characters on either side of the
+    match, with "…" marking each side that was cut. When it is not found,
+    the first ``max_len`` characters of *text* are returned instead, followed
+    by "…" if *text* is longer than that.
+
+    Args:
+        text: Text to excerpt.
+        query: Literal string to look for (not a regular expression).
+        radius: Number of context characters kept on each side of the match.
+        max_len: Length limit of the fallback excerpt; it is not applied to
+            the excerpt built around a match.
+
+    Returns:
+        The excerpt, or an empty string when *text* is empty.
+    """
+    if not text:
+        return ""
+
+    # Search the original text rather than text.lower(): lower-casing can
+    # change the string's length (e.g. "İ" becomes two code points), which
+    # would make the found index point at the wrong place in *text*.
+    match = re.search(re.escape(query), text, flags=re.IGNORECASE)
+    if match is None:
+        snippet = text[:max_len]
+        if len(text) > max_len:
+            snippet += "…"
+        return snippet
+
+    start = max(0, match.start() - radius)
+    end = min(len(text), match.end() + radius)
+    snippet = text[start:end]
+    if start > 0:
+        snippet = "…" + snippet
+    if end < len(text):
+        snippet += "…"
+    return snippet
+
+
@app.route("/search", methods=["GET"])
 def content_search():
    q = (request.args.get("q") or "").strip()
    if not q:
        return jsonify({"results": []})

-    def make_snippet(text: str, query: str, radius: int = 80, max_len: int = 240) -> str:
-        if not text:
-            return ""
-        lower = text.lower()
-        qlower = query.lower()
-        idx = lower.find(qlower)
-        if idx == -1:
-            snippet = text[:max_len]
-            if len(text) > max_len:
-                snippet += "…"
-            return snippet
-        start = max(0, idx - radius)
-        end = min(len(text), idx + len(query) + radius)
-        snippet = text[start:end]
-        if start > 0:
-            snippet = "…" + snippet
-        if end < len(text):
-            snippet += "…"
-        return snippet
-
    try:
-        # Only search "page-like" files: html / md / txt
        proc = subprocess.run(
            [
                "rg",
                "--line-number",
                "--no-heading",
                "--color", "never",
-                "--max-count", "5",  # max 5 hits per file
+                "--max-count", "5",  # per file
                "--type-add", "page:*.{html,htm,md,markdown,txt}",
                "-tpage",
                q,
@@ -578,18 +606,20 @@ def content_search():
        parts = line.split(":", 2)
        if len(parts) != 3:
            continue
-        path, lineno, content = parts
+        path, lineno, raw_content = parts
+
+        # Strip HTML/JS/CSS markup from this line before making a snippet
+        text_content = strip_html(raw_content)
+        if not text_content:
+            continue
+
+        snippet = make_snippet(text_content, q)
+
        try:
            rel_path = str(Path(path).relative_to(MIRROR_ROOT))
        except ValueError:
-            # Shouldn't happen, but be defensive
            rel_path = path

-        # Short text snippet around the query
-        snippet = make_snippet(content, q)
-
-        # Build a URL that opens the mirrored page in the browser
-        # Assuming nginx serves /srv/www/mirrors as /mirrors/
        url = "/mirrors/" + rel_path.replace("\\", "/")

        results.append({