Mirror of https://github.com/Zetaphor/browser-recall.git (synced 2025-12-06 10:29:38 +00:00)

Refactor to use crawl4ai, uv

app/routers/bookmarks.py (new file, 47 lines)
@@ -0,0 +1,47 @@
from fastapi import APIRouter, Depends, Query, HTTPException
from sqlalchemy.orm import Session
from typing import List, Optional

from ..database import get_db, Bookmark
from ..utils import serialize_bookmark
from ..logging_config import setup_logger

logger = setup_logger(__name__)
router = APIRouter(prefix="/bookmarks", tags=["bookmarks"])


@router.get("/search")
async def search_bookmarks(
    domain: Optional[str] = Query(None),
    folder: Optional[str] = Query(None),
    search_term: Optional[str] = Query(None),
    db: Session = Depends(get_db)
):
    """Search bookmarks with optimized queries"""
    try:
        # Build query efficiently
        query = db.query(Bookmark)

        # Apply filters using index-optimized queries
        if domain:
            query = query.filter(Bookmark.domain == domain)

        if folder:
            query = query.filter(Bookmark.folder == folder)

        if search_term:
            # Use LIKE for title search (consider FTS for bookmarks if needed)
            search_pattern = f"%{search_term}%"
            query = query.filter(Bookmark.title.ilike(search_pattern))
            # Removed index hint as SQLAlchemy/SQLite usually handles this well with LIKE

        # Add ordering and limit for better performance
        bookmarks = query.order_by(Bookmark.added_time.desc()).limit(1000).all()

        return [serialize_bookmark(bookmark) for bookmark in bookmarks]

    except Exception as e:
        logger.error(f"Bookmark search error: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail={"message": "Bookmark search operation failed", "error": str(e)}
        )
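
The endpoint above can be exercised with FastAPI's test client. This is only a minimal usage sketch: the `app.main` import path and the `title`/`url` keys emitted by `serialize_bookmark` are assumptions, since neither is part of this diff.

```python
# Hedged usage sketch for GET /bookmarks/search.
from fastapi.testclient import TestClient

from app.main import app  # hypothetical import path; not shown in this commit

client = TestClient(app)

# Filter by domain plus a case-insensitive title substring.
response = client.get(
    "/bookmarks/search",
    params={"domain": "example.com", "search_term": "recipes"},
)
response.raise_for_status()
for bookmark in response.json():
    # Field names assume serialize_bookmark() returns dicts with these keys.
    print(bookmark.get("title"), bookmark.get("url"))
```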

app/routers/config.py (new file, 43 lines)
@@ -0,0 +1,43 @@
from fastapi import APIRouter, Depends, HTTPException
from typing import List

from ..config import Config
from ..logging_config import setup_logger

logger = setup_logger(__name__)
router = APIRouter(prefix="/config", tags=["config"])

# Assuming config is a singleton or easily accessible
# If not, you might need to use Depends or app state
config = Config()


@router.get("/ignored-domains")
async def get_ignored_domains():
    """Get list of ignored domain patterns"""
    try:
        return {"ignored_domains": config.config.get('ignored_domains', [])}
    except Exception as e:
        logger.error(f"Error getting ignored domains: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail="Failed to retrieve ignored domains")


@router.post("/ignored-domains")
async def add_ignored_domain(pattern: str):
    """Add a new domain pattern to ignored list"""
    try:
        config.add_ignored_domain(pattern)
        return {"status": "success", "message": f"Added pattern: {pattern}"}
    except Exception as e:
        logger.error(f"Error adding ignored domain '{pattern}': {e}", exc_info=True)
        raise HTTPException(status_code=500, detail="Failed to add ignored domain")


@router.delete("/ignored-domains/{pattern}")
async def remove_ignored_domain(pattern: str):
    """Remove a domain pattern from ignored list"""
    try:
        config.remove_ignored_domain(pattern)
        return {"status": "success", "message": f"Removed pattern: {pattern}"}
    except Exception as e:
        logger.error(f"Error removing ignored domain '{pattern}': {e}", exc_info=True)
        raise HTTPException(status_code=500, detail="Failed to remove ignored domain")
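
This router relies on a `Config` object exposing `config` (a dict), `add_ignored_domain`, and `remove_ignored_domain`; the websocket handler further down also calls `is_domain_ignored`. The class itself is not part of this diff, so the following is only a sketch of the assumed shape, backed here by a JSON file and fnmatch-style wildcard patterns (both assumptions).

```python
# Sketch of the Config interface the routers assume; storage format and
# matching rules are assumptions, not part of this commit.
import json
from fnmatch import fnmatch
from pathlib import Path


class Config:
    def __init__(self, path: str = "config.json"):  # path is hypothetical
        self._path = Path(path)
        self.config = json.loads(self._path.read_text()) if self._path.exists() else {}
        self.config.setdefault("ignored_domains", [])

    def _save(self) -> None:
        self._path.write_text(json.dumps(self.config, indent=2))

    def add_ignored_domain(self, pattern: str) -> None:
        if pattern not in self.config["ignored_domains"]:
            self.config["ignored_domains"].append(pattern)
            self._save()

    def remove_ignored_domain(self, pattern: str) -> None:
        if pattern in self.config["ignored_domains"]:
            self.config["ignored_domains"].remove(pattern)
            self._save()

    def is_domain_ignored(self, domain: str) -> bool:
        # fnmatch allows wildcard patterns such as "*.google.com".
        return any(fnmatch(domain, pattern) for pattern in self.config["ignored_domains"])
```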

app/routers/history.py (new file, 132 lines)
@@ -0,0 +1,132 @@
from fastapi import APIRouter, Depends, Query, HTTPException
from sqlalchemy.orm import Session
from sqlalchemy import text
from typing import List, Optional

from ..database import get_db, HistoryEntry
from ..utils import serialize_history_entry
from ..logging_config import setup_logger

logger = setup_logger(__name__)
router = APIRouter(prefix="/history", tags=["history"])


@router.get("/search")
async def search_history(
    query: Optional[str] = Query(None),
    domain: Optional[str] = Query(None),
    start_date: Optional[str] = Query(None),
    end_date: Optional[str] = Query(None),
    include_content: bool = Query(False),
    db: Session = Depends(get_db)
):
    """Search history using FTS5"""
    try:
        if query:
            # Build the FTS query.
            # Basic sanitization/escaping is enough for simple term searches;
            # complex FTS syntax in user input would need more careful handling.
            fts_conditions = []
            params = {}

            # Handling distinct query parts (title, content, domain), e.g.
            # "term1 title:term2 domain:example.com", would require real parsing.
            # For now, the whole query is applied to title and content.
            # A safer approach for user input:
            sanitized_query = query.replace('"', '""')  # Basic FTS escaping for quotes
            fts_match_expr = f'(title : "{sanitized_query}"* OR markdown_content : "{sanitized_query}"*)'
            params['fts_query'] = fts_match_expr

            if domain:
                # Domain filtering could be done inside the FTS MATCH if 'domain'
                # were an indexed FTS column, e.g.
                # params['fts_query'] += f' AND domain : "{domain}"'
                # Instead it is applied below as a regular WHERE clause.
                pass

            # Build the SQL query
            sql = """
                SELECT
                    h.*,
                    bm25(history_fts) as rank,
                    highlight(history_fts, 0, '<mark>', '</mark>') as title_highlight,
                    highlight(history_fts, 1, '<mark>', '</mark>') as content_highlight
                FROM history_fts
                JOIN history h ON history_fts.rowid = h.id
                WHERE history_fts MATCH :fts_query
            """

            # Add domain filter as a regular WHERE clause since it is not in the FTS MATCH
            if domain:
                sql += " AND h.domain = :domain"
                params['domain'] = domain

            # Add date filters if provided
            if start_date:
                sql += " AND h.visit_time >= :start_date"
                params['start_date'] = start_date
            if end_date:
                sql += " AND h.visit_time <= :end_date"
                params['end_date'] = end_date

            # bm25() returns lower scores for better matches, so rank sorts ascending
            sql += " ORDER BY rank, h.visit_time DESC LIMIT 100"

            results = db.execute(text(sql), params).fetchall()
            # Use the updated serializer that handles potential highlight/rank fields
            return [serialize_history_entry(row, include_content) for row in results]

        else:
            # Handle non-search queries (basic filtering)
            query_builder = db.query(HistoryEntry)

            if domain:
                query_builder = query_builder.filter(HistoryEntry.domain == domain)
            if start_date:
                query_builder = query_builder.filter(HistoryEntry.visit_time >= start_date)
            if end_date:
                query_builder = query_builder.filter(HistoryEntry.visit_time <= end_date)

            entries = query_builder.order_by(HistoryEntry.visit_time.desc()).limit(100).all()
            return [serialize_history_entry(entry, include_content) for entry in entries]

    except Exception as e:
        logger.error(f"Search error: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail={"message": "Search operation failed", "error": str(e)}
        )

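The MATCH expression and the `highlight(history_fts, 0, ...)` / `highlight(history_fts, 1, ...)` calls imply an FTS5 table whose first two columns are `title` and `markdown_content` and whose rowid lines up with `history.id`. That table is created outside this diff (presumably in `app/database.py`), so the following is only a sketch of the assumed shape: an external-content FTS5 table kept in sync by triggers. It also shows why the queries order by `rank` ascending: bm25() assigns lower scores to better matches.

```python
# Sketch of the FTS5 schema assumed by the queries above; the real DDL is not
# in this commit, so column order, tokenizer, and trigger names are assumptions.
import sqlite3

ASSUMED_FTS_DDL = """
CREATE VIRTUAL TABLE IF NOT EXISTS history_fts USING fts5(
    title,               -- column 0, used by highlight(history_fts, 0, ...)
    markdown_content,    -- column 1, used by highlight(history_fts, 1, ...)
    content='history',   -- external-content table (must already exist)
    content_rowid='id'   -- so history_fts.rowid = history.id for the JOIN
);

-- Keep the index in sync on insert (matching UPDATE/DELETE triggers omitted).
CREATE TRIGGER IF NOT EXISTS history_ai AFTER INSERT ON history BEGIN
    INSERT INTO history_fts(rowid, title, markdown_content)
    VALUES (new.id, new.title, new.markdown_content);
END;
"""


def ensure_fts(db_path: str = "history.db") -> None:  # db path is hypothetical
    with sqlite3.connect(db_path) as conn:
        conn.executescript(ASSUMED_FTS_DDL)
```
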
@router.get("/search/advanced")
|
||||
async def advanced_history_search(
|
||||
query: str = Query(..., description="Full-text search query with SQLite FTS5 syntax"),
|
||||
include_content: bool = Query(False),
|
||||
db: Session = Depends(get_db)
|
||||
):
|
||||
"""Advanced full-text search using SQLite FTS5 features"""
|
||||
try:
|
||||
# Use raw SQL for advanced FTS query
|
||||
# Add rank and highlights here as well
|
||||
fts_query = """
|
||||
SELECT
|
||||
h.*,
|
||||
bm25(history_fts) as rank,
|
||||
highlight(history_fts, 0, '<mark>', '</mark>') as title_highlight,
|
||||
highlight(history_fts, 1, '<mark>', '</mark>') as content_highlight
|
||||
FROM history_fts
|
||||
JOIN history h ON history_fts.rowid = h.id
|
||||
WHERE history_fts MATCH :query
|
||||
ORDER BY rank DESC, h.visit_time DESC
|
||||
LIMIT 1000
|
||||
"""
|
||||
|
||||
results = db.execute(text(fts_query), {'query': query}).fetchall()
|
||||
|
||||
# Use the updated serializer
|
||||
return [serialize_history_entry(row, include_content) for row in results]
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Advanced search error: {e}", exc_info=True)
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail={"message": "Advanced search operation failed", "error": str(e)}
|
||||
)
|
||||
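
Because the advanced endpoint passes the query string straight to MATCH, clients can use full FTS5 syntax (column filters, boolean operators, prefix queries, NEAR). A minimal usage sketch with httpx; the base URL is an assumption.

```python
# Hedged usage sketch for GET /history/search/advanced; host/port are assumptions.
import httpx

BASE_URL = "http://localhost:8000"

# FTS5 syntax examples: column filters, boolean operators, prefix and NEAR queries.
queries = [
    'title : python AND markdown_content : asyncio',
    '"browser history" OR bookmark*',
    'NEAR(sqlite fts5, 10)',
]

for q in queries:
    resp = httpx.get(
        f"{BASE_URL}/history/search/advanced",
        params={"query": q, "include_content": False},
        timeout=30,
    )
    resp.raise_for_status()
    print(q, "->", len(resp.json()), "results")
```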

app/routers/ui.py (new file, 52 lines)
@@ -0,0 +1,52 @@
from fastapi import APIRouter, Depends, Request
from fastapi.templating import Jinja2Templates
from sqlalchemy.orm import Session

from ..database import get_db, HistoryEntry, Bookmark
from ..logging_config import setup_logger

logger = setup_logger(__name__)
router = APIRouter(tags=["ui"])
templates = Jinja2Templates(directory="app/templates")


@router.get("/")
async def home(request: Request, db: Session = Depends(get_db)):
    try:
        # Get recent history entries
        entries = db.query(HistoryEntry)\
            .order_by(HistoryEntry.visit_time.desc())\
            .limit(50)\
            .all()
        return templates.TemplateResponse(
            "index.html",
            {"request": request, "entries": entries}
        )
    except Exception as e:
        logger.error(f"Error loading home page: {e}", exc_info=True)
        # Optionally return an error template
        return templates.TemplateResponse("error.html", {"request": request, "detail": "Could not load history"})


@router.get("/search")
async def search_page(request: Request):
    return templates.TemplateResponse(
        "search.html",
        {"request": request}
    )


@router.get("/bookmarks")
async def bookmarks_page(request: Request, db: Session = Depends(get_db)):
    try:
        bookmarks = db.query(Bookmark)\
            .order_by(Bookmark.added_time.desc())\
            .limit(50)\
            .all()
        return templates.TemplateResponse(
            "bookmarks.html",
            {"request": request, "bookmarks": bookmarks}
        )
    except Exception as e:
        logger.error(f"Error loading bookmarks page: {e}", exc_info=True)
        # Optionally return an error template
        return templates.TemplateResponse("error.html", {"request": request, "detail": "Could not load bookmarks"})
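
These routes return server-rendered HTML rather than JSON, so a quick smoke test only needs to assert that the pages render. A minimal sketch; it assumes the templates referenced above exist under `app/templates` and that the app import path is `app.main` (an assumption).

```python
# Hedged smoke-test sketch for the UI routes; the app import path is an assumption.
from fastapi.testclient import TestClient

from app.main import app  # hypothetical import path

client = TestClient(app)

for path in ("/", "/search", "/bookmarks"):
    response = client.get(path)
    assert response.status_code == 200
    assert "text/html" in response.headers["content-type"]
```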

app/routers/websocket.py (new file, 175 lines)
@@ -0,0 +1,175 @@
import asyncio
from fastapi import APIRouter, WebSocket, WebSocketDisconnect, Depends, HTTPException
from sqlalchemy.orm import Session
from datetime import datetime, timezone, timedelta
from urllib.parse import urlparse
import iso8601

# Import necessary components from other modules
from .. import main as app_main  # To access global crawler instance
from ..database import get_db, HistoryEntry
from ..config import Config
from ..logging_config import setup_logger

logger = setup_logger(__name__)
router = APIRouter(tags=["websocket"])
config = Config()  # Assuming config is okay as a separate instance here


@router.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket, db: Session = Depends(get_db)):
    # Access the global crawler instance from main.py
    crawler = app_main.crawler
    if not crawler:
        logger.error("Crawler not initialized!")
        await websocket.close(code=1011)  # Internal Server Error
        return

    logger.info("New WebSocket connection established")
    await websocket.accept()
    try:
        while True:
            data = await websocket.receive_json()

            # Validate incoming data structure (basic check)
            if 'url' not in data or 'timestamp' not in data:
                logger.warning("Received invalid WebSocket message format.")
                await websocket.send_json({
                    "status": "error",
                    "message": "Invalid message format. 'url' and 'timestamp' required."
                })
                continue

            url = data['url']
            try:
                timestamp = iso8601.parse_date(data['timestamp'])
            except iso8601.ParseError:
                logger.warning(f"Received invalid timestamp format: {data['timestamp']}")
                await websocket.send_json({
                    "status": "error",
                    "message": f"Invalid timestamp format: {data['timestamp']}"
                })
                continue

            # Parse the URL and check if domain should be ignored
            try:
                domain = urlparse(url).netloc
                if not domain:  # Handle invalid URLs
                    raise ValueError("Could not parse domain from URL")
            except ValueError as e:
                logger.warning(f"Could not parse URL: {url}. Error: {e}")
                await websocket.send_json({"status": "error", "message": f"Invalid URL: {url}"})
                continue

            if config.is_domain_ignored(domain):
                logger.info(f"Ignoring domain: {domain} for URL: {url}")
                await websocket.send_json({
                    "status": "ignored",
                    "message": f"Domain {domain} is in ignore list"
                })
                continue

            logger.info(f"Processing page via WebSocket: {url}")

            # Check if we already have a recent entry for this URL
            # Make timestamp timezone-aware (assuming UTC if naive)
            if timestamp.tzinfo is None:
                timestamp = timestamp.replace(tzinfo=timezone.utc)
            else:
                timestamp = timestamp.astimezone(timezone.utc)

            recent_threshold = timestamp - timedelta(minutes=5)
            existing_entry = db.query(HistoryEntry.id).filter(
                HistoryEntry.url == url,
                HistoryEntry.visit_time >= recent_threshold
            ).first()  # Only fetch ID for efficiency

            if existing_entry:
                logger.info(f"Recent entry exists for URL: {url}")
                await websocket.send_json({
                    "status": "skipped",
                    "message": "Recent entry exists"
                })
                continue

            # --- Start crawl4ai processing ---
            logger.info(f"Processing page with crawl4ai: {url}")
            markdown_content = None
            title = ''
            try:
                # Use the global crawler instance
                crawl_result = await crawler.arun(url=url)
                if crawl_result:
                    markdown_content = crawl_result.markdown
                    # Attempt to get title from metadata, fallback to empty string
                    title = getattr(crawl_result.metadata, 'title', '') or ''  # Ensure title is a string
                    if not title:
                        logger.warning(f"Could not extract title for {url} using crawl4ai.")
                    logger.info(f"crawl4ai processing complete. Markdown length: {len(markdown_content) if markdown_content else 0}, Title: '{title}'")
                else:
                    logger.warning(f"crawl4ai returned None for URL: {url}")
                    markdown_content = ""  # Ensure it's not None
                    title = ""

            except Exception as crawl_error:
                logger.error(f"crawl4ai failed for URL {url}: {crawl_error}", exc_info=True)
                await websocket.send_json({
                    "status": "error",
                    "message": f"Failed to crawl page content: {str(crawl_error)}"
                })
                continue  # Skip to next message
            # --- End crawl4ai processing ---

            # Only proceed if we got some content or at least a title
            if not title and not markdown_content:
                logger.info(f"No title or content extracted by crawl4ai from: {url}")
                await websocket.send_json({
                    "status": "skipped",
                    "message": "No title or content extracted by crawl4ai"
                })
                continue

            # Create history entry using data from crawl4ai
            history_entry = HistoryEntry(
                url=url,
                title=title,  # Use title from crawl4ai
                visit_time=timestamp,  # Use the parsed, timezone-aware timestamp
                domain=domain,
                markdown_content=markdown_content,  # Use markdown from crawl4ai
                last_content_update=datetime.now(timezone.utc)
            )

            logger.debug(f"Attempting to save entry for {url} with markdown length: {len(markdown_content) if markdown_content else 0}")

            db.add(history_entry)
            try:
                db.commit()
                logger.info(f"Successfully saved entry for: {url}")
                await websocket.send_json({
                    "status": "success",
                    "message": f"Processed page: {url}"
                })
            except Exception as e:
                db.rollback()
                logger.error(f"Error saving entry for {url}: {e}", exc_info=True)
                await websocket.send_json({
                    "status": "error",
                    "message": "Database error occurred while saving."
                })

    except WebSocketDisconnect:
        logger.info("WebSocket client disconnected")
    except Exception as e:
        logger.error(f"Unhandled error in WebSocket handler: {e}", exc_info=True)
        # Attempt to inform client before closing (might fail if connection is already broken)
        try:
            await websocket.send_json({
                "status": "error",
                "message": "An internal server error occurred."
            })
        except Exception:
            pass  # Ignore if sending fails
        # Ensure connection is closed on server error
        try:
            await websocket.close(code=1011)  # Internal Server Error
        except Exception:
            pass  # Ignore if closing fails
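
The handler reads `app_main.crawler`, which implies `app/main.py` creates a shared crawl4ai crawler at startup and clears it at shutdown, and includes the routers added in this commit. A minimal sketch of that wiring using a FastAPI lifespan handler; everything beyond the `crawler` name, the `arun()` call, and the router modules shown in this diff is an assumption.

```python
# Sketch of the app/main.py wiring assumed by the routers above; names other
# than `crawler` and the router modules are assumptions.
from contextlib import asynccontextmanager

from crawl4ai import AsyncWebCrawler
from fastapi import FastAPI

# Assumes app/routers/__init__.py exists so these modules are importable.
from .routers import bookmarks, config, history, ui, websocket

crawler = None  # read at request time by app.routers.websocket as app_main.crawler


@asynccontextmanager
async def lifespan(app: FastAPI):
    global crawler
    # AsyncWebCrawler is documented as an async context manager; arun() is the
    # call used by the websocket handler above.
    async with AsyncWebCrawler() as instance:
        crawler = instance
        yield
    crawler = None


app = FastAPI(lifespan=lifespan)
app.include_router(bookmarks.router)
app.include_router(config.router)
app.include_router(history.router)
app.include_router(ui.router)
app.include_router(websocket.router)
```

With this in place, a client (such as the browser extension) sends `{"url": ..., "timestamp": <ISO-8601>}` over `/ws` and receives back a JSON reply whose `status` is one of `success`, `skipped`, `ignored`, or `error`, matching the branches in the handler above.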