True search fix
This commit is contained in:
88
app.py
88
app.py
@@ -6,6 +6,8 @@ from mirror_manager import (
|
||||
MIRROR_ROOT,
|
||||
LOG_ROOT,
|
||||
)
|
||||
import re
|
||||
import html
|
||||
import subprocess
|
||||
import threading
|
||||
from pathlib import Path
|
||||
@@ -509,41 +511,67 @@ def log_tail(slug):
|
||||
return "", 200
|
||||
|
||||
|
||||
def strip_html(text: str) -> str:
    """Reduce a fragment of HTML to plain, whitespace-normalised text.

    Processing order:

    1. ``<script>...</script>`` and ``<style>...</style>`` blocks are dropped
       together with their contents (case-insensitive).
    2. HTML comments (``<!-- ... -->``) are dropped, including comments whose
       body contains ``>``.
    3. Every remaining tag is replaced by a space.
    4. HTML entities are unescaped (``&amp;`` -> ``&``, etc.).
    5. Runs of whitespace collapse to a single space and the result is
       trimmed.

    Args:
        text: Raw markup; may be a single line or a whole document.

    Returns:
        The visible text, or ``""`` if nothing is left after stripping.
    """
    # Remove script and style blocks first so their bodies never reach the
    # generic tag stripper below.
    text = re.sub(
        r"<script\b[^<]*(?:(?!</script>)<[^<]*)*</script>",
        " ",
        text,
        flags=re.IGNORECASE,
    )
    text = re.sub(
        r"<style\b[^<]*(?:(?!</style>)<[^<]*)*</style>",
        " ",
        text,
        flags=re.IGNORECASE,
    )
    # Remove complete comments explicitly: the generic tag pattern stops at
    # the first ">", which would leak the tail of "<!-- a > b -->".
    text = re.sub(r"<!--.*?-->", " ", text, flags=re.DOTALL)
    # Strip all remaining tags.
    text = re.sub(r"<[^>]+>", " ", text)
    # Unescape HTML entities (&amp; -> &, etc.).
    text = html.unescape(text)
    # Collapse whitespace.
    return re.sub(r"\s+", " ", text).strip()
|
||||
|
||||
|
||||
def make_snippet(text: str,
                 query: str,
                 radius: int = 80,
                 max_len: int = 240) -> str:
    """Return a short excerpt of *text* centred on the first hit of *query*.

    The match is case-insensitive (both strings are lower-cased before the
    search). If *query* does not occur, the first *max_len* characters of
    *text* are returned instead.

    Args:
        text: Plain text to excerpt.
        query: Literal substring to look for.
        radius: Number of context characters to keep on each side of the
            match.
        max_len: Upper bound on the excerpt length. The ellipsis markers are
            added on top of this bound, as in the not-found branch.

    Returns:
        The excerpt, prefixed and/or suffixed with "…" where text was cut
        off, or ``""`` when *text* is empty.
    """
    if not text:
        return ""

    idx = text.lower().find(query.lower())
    if idx == -1:
        snippet = text[:max_len]
        if len(text) > max_len:
            snippet += "…"
        return snippet

    # Honour max_len on the match path too: shrink the context radius so
    # that radius + query + radius fits. With the defaults this only kicks
    # in for queries longer than 80 characters.
    room = max(0, max_len - len(query))
    radius = min(radius, room // 2)

    start = max(0, idx - radius)
    # The third bound covers a query that is itself longer than max_len.
    end = min(len(text), idx + len(query) + radius, start + max_len)

    snippet = text[start:end]
    if start > 0:
        snippet = "…" + snippet
    if end < len(text):
        snippet += "…"
    return snippet
|
||||
|
||||
|
||||
@app.route("/search", methods=["GET"])
|
||||
def content_search():
|
||||
q = (request.args.get("q") or "").strip()
|
||||
if not q:
|
||||
return jsonify({"results": []})
|
||||
|
||||
def make_snippet(text: str, query: str, radius: int = 80, max_len: int = 240) -> str:
|
||||
if not text:
|
||||
return ""
|
||||
lower = text.lower()
|
||||
qlower = query.lower()
|
||||
idx = lower.find(qlower)
|
||||
if idx == -1:
|
||||
snippet = text[:max_len]
|
||||
if len(text) > max_len:
|
||||
snippet += "…"
|
||||
return snippet
|
||||
start = max(0, idx - radius)
|
||||
end = min(len(text), idx + len(query) + radius)
|
||||
snippet = text[start:end]
|
||||
if start > 0:
|
||||
snippet = "…" + snippet
|
||||
if end < len(text):
|
||||
snippet += "…"
|
||||
return snippet
|
||||
|
||||
try:
|
||||
# Only search "page-like" files: html / md / txt
|
||||
proc = subprocess.run(
|
||||
[
|
||||
"rg",
|
||||
"--line-number",
|
||||
"--no-heading",
|
||||
"--color", "never",
|
||||
"--max-count", "5", # max 5 hits per file
|
||||
"--max-count", "5", # per file
|
||||
"--type-add", "page:*.{html,htm,md,markdown,txt}",
|
||||
"-tpage",
|
||||
q,
|
||||
@@ -578,18 +606,20 @@ def content_search():
|
||||
parts = line.split(":", 2)
|
||||
if len(parts) != 3:
|
||||
continue
|
||||
path, lineno, content = parts
|
||||
path, lineno, raw_content = parts
|
||||
|
||||
# Strip HTML/JS/CSS markup from this line before making a snippet
|
||||
text_content = strip_html(raw_content)
|
||||
if not text_content:
|
||||
continue
|
||||
|
||||
snippet = make_snippet(text_content, q)
|
||||
|
||||
try:
|
||||
rel_path = str(Path(path).relative_to(MIRROR_ROOT))
|
||||
except ValueError:
|
||||
# Shouldn't happen, but be defensive
|
||||
rel_path = path
|
||||
|
||||
# Short text snippet around the query
|
||||
snippet = make_snippet(content, q)
|
||||
|
||||
# Build a URL that opens the mirrored page in the browser
|
||||
# Assuming nginx serves /srv/www/mirrors as /mirrors/
|
||||
url = "/mirrors/" + rel_path.replace("\\", "/")
|
||||
|
||||
results.append({
|
||||
|
||||
Reference in New Issue
Block a user